Merge "ANS: Switch from PDFs to CDFs." into nextgenv2
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 12022be..0e54c40 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -28,7 +28,7 @@
 
 namespace {
 
-static const unsigned int kMaxDimension = 64;
+static const unsigned int kMaxDimension = MAX_CU_SIZE;
 
 typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
@@ -102,7 +102,7 @@
   //                               = 23
   // and filter_max_width          = 16
   //
-  uint8_t intermediate_buffer[71 * kMaxDimension];
+  uint8_t intermediate_buffer[(kMaxDimension+8) * kMaxDimension];
   const int intermediate_next_stride = 1 - intermediate_height * output_width;
 
   // Horizontal pass (src -> transposed intermediate).
@@ -183,9 +183,9 @@
 
   assert(output_width <= kMaxDimension);
   assert(output_height <= kMaxDimension);
-  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
+  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, kMaxDimension,
                      output_width, output_height);
-  block2d_average_c(tmp, 64, dst_ptr, dst_stride,
+  block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
                     output_width, output_height);
 }
 
@@ -214,7 +214,7 @@
    *                               = 23
    * and filter_max_width = 16
    */
-  uint16_t intermediate_buffer[71 * kMaxDimension];
+  uint16_t intermediate_buffer[(kMaxDimension+8) * kMaxDimension];
   const int intermediate_next_stride = 1 - intermediate_height * output_width;
 
   // Horizontal pass (src -> transposed intermediate).
@@ -302,9 +302,10 @@
 
   assert(output_width <= kMaxDimension);
   assert(output_height <= kMaxDimension);
-  highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
+  highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
+                            tmp, kMaxDimension,
                             output_width, output_height, bd);
-  highbd_block2d_average_c(tmp, 64, dst_ptr, dst_stride,
+  highbd_block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
                            output_width, output_height);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -351,7 +352,7 @@
 
  protected:
   static const int kDataAlignment = 16;
-  static const int kOuterBlockSize = 256;
+  static const int kOuterBlockSize = 4*kMaxDimension;
   static const int kInputStride = kOuterBlockSize;
   static const int kOutputStride = kOuterBlockSize;
   static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
@@ -414,7 +415,8 @@
   void CopyOutputToRef() {
     memcpy(output_ref_, output_, kOutputBufferSize);
 #if CONFIG_VP9_HIGHBITDEPTH
-    memcpy(output16_ref_, output16_, kOutputBufferSize);
+    memcpy(output16_ref_, output16_,
+           kOutputBufferSize * sizeof(*output16_ref_));
 #endif
   }
 
@@ -426,41 +428,41 @@
   }
 
   uint8_t *input() const {
+    const int index = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+      return input_ + index;
     } else {
-      return CONVERT_TO_BYTEPTR(input16_ + BorderTop() * kOuterBlockSize +
-                                BorderLeft());
+      return CONVERT_TO_BYTEPTR(input16_ + index);
     }
 #else
-    return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    return input_ + index;
 #endif
   }
 
   uint8_t *output() const {
+    const int index = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+      return output_ + index;
     } else {
-      return CONVERT_TO_BYTEPTR(output16_ + BorderTop() * kOuterBlockSize +
-                                BorderLeft());
+      return CONVERT_TO_BYTEPTR(output16_ + index);
     }
 #else
-    return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    return output_ + index;
 #endif
   }
 
   uint8_t *output_ref() const {
+    const int index = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+      return output_ref_ + index;
     } else {
-      return CONVERT_TO_BYTEPTR(output16_ref_ + BorderTop() * kOuterBlockSize +
-                                BorderLeft());
+      return CONVERT_TO_BYTEPTR(output16_ref_ + index);
     }
 #else
-    return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    return output_ref_ + index;
 #endif
   }
 
@@ -1035,6 +1037,11 @@
     wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8,
     wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
 INSTANTIATE_TEST_CASE_P(C_8, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_c),
+    make_tuple(64, 128, &convolve8_c),
+    make_tuple(128, 128, &convolve8_c),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_c),
     make_tuple(8, 4, &convolve8_c),
     make_tuple(4, 8, &convolve8_c),
@@ -1057,6 +1064,11 @@
     wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10,
     wrap_convolve8_c_10, wrap_convolve8_avg_c_10, 10);
 INSTANTIATE_TEST_CASE_P(C_10, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve10_c),
+    make_tuple(64, 128, &convolve10_c),
+    make_tuple(128, 128, &convolve10_c),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve10_c),
     make_tuple(8, 4, &convolve10_c),
     make_tuple(4, 8, &convolve10_c),
@@ -1079,6 +1091,11 @@
     wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12,
     wrap_convolve8_c_12, wrap_convolve8_avg_c_12, 12);
 INSTANTIATE_TEST_CASE_P(C_12, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve12_c),
+    make_tuple(64, 128, &convolve12_c),
+    make_tuple(128, 128, &convolve12_c),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve12_c),
     make_tuple(8, 4, &convolve12_c),
     make_tuple(4, 8, &convolve12_c),
@@ -1105,6 +1122,11 @@
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_c),
+    make_tuple(64, 128, &convolve8_c),
+    make_tuple(128, 128, &convolve8_c),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_c),
     make_tuple(8, 4, &convolve8_c),
     make_tuple(4, 8, &convolve8_c),
@@ -1158,7 +1180,12 @@
     wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
     wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
     wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, 12);
-INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
+INSTANTIATE_TEST_CASE_P(SSE2_8, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_sse2),
+    make_tuple(64, 128, &convolve8_sse2),
+    make_tuple(128, 128, &convolve8_sse2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_sse2),
     make_tuple(8, 4, &convolve8_sse2),
     make_tuple(4, 8, &convolve8_sse2),
@@ -1171,7 +1198,13 @@
     make_tuple(32, 32, &convolve8_sse2),
     make_tuple(64, 32, &convolve8_sse2),
     make_tuple(32, 64, &convolve8_sse2),
-    make_tuple(64, 64, &convolve8_sse2),
+    make_tuple(64, 64, &convolve8_sse2)));
+INSTANTIATE_TEST_CASE_P(SSE2_10, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve10_sse2),
+    make_tuple(64, 128, &convolve10_sse2),
+    make_tuple(128, 128, &convolve10_sse2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve10_sse2),
     make_tuple(8, 4, &convolve10_sse2),
     make_tuple(4, 8, &convolve10_sse2),
@@ -1184,7 +1217,13 @@
     make_tuple(32, 32, &convolve10_sse2),
     make_tuple(64, 32, &convolve10_sse2),
     make_tuple(32, 64, &convolve10_sse2),
-    make_tuple(64, 64, &convolve10_sse2),
+    make_tuple(64, 64, &convolve10_sse2)));
+INSTANTIATE_TEST_CASE_P(SSE2_12, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve12_sse2),
+    make_tuple(64, 128, &convolve12_sse2),
+    make_tuple(128, 128, &convolve12_sse2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve12_sse2),
     make_tuple(8, 4, &convolve12_sse2),
     make_tuple(4, 8, &convolve12_sse2),
@@ -1213,6 +1252,11 @@
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_sse2),
+    make_tuple(64, 128, &convolve8_sse2),
+    make_tuple(128, 128, &convolve8_sse2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_sse2),
     make_tuple(8, 4, &convolve8_sse2),
     make_tuple(4, 8, &convolve8_sse2),
@@ -1237,9 +1281,14 @@
     vpx_convolve8_ssse3, vpx_convolve8_avg_ssse3,
     vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
     vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
-    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+    vpx_scaled_2d_ssse3, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_ssse3),
+    make_tuple(64, 128, &convolve8_ssse3),
+    make_tuple(128, 128, &convolve8_ssse3),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_ssse3),
     make_tuple(8, 4, &convolve8_ssse3),
     make_tuple(4, 8, &convolve8_ssse3),
@@ -1266,6 +1315,11 @@
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_avx2),
+    make_tuple(64, 128, &convolve8_avx2),
+    make_tuple(128, 128, &convolve8_avx2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_avx2),
     make_tuple(8, 4, &convolve8_avx2),
     make_tuple(4, 8, &convolve8_avx2),
@@ -1281,7 +1335,8 @@
     make_tuple(64, 64, &convolve8_avx2)));
 #endif  // HAVE_AVX2 && HAVE_SSSE3
 
-#if HAVE_NEON
+// TODO(any): Make NEON versions support 128x128, 128x64, 64x128 block sizes
+#if HAVE_NEON && !(CONFIG_VP10 && CONFIG_EXT_PARTITION)
 #if HAVE_NEON_ASM
 const ConvolveFunctions convolve8_neon(
     vpx_convolve_copy_neon, vpx_convolve_avg_neon,
@@ -1303,6 +1358,11 @@
 #endif  // HAVE_NEON_ASM
 
 INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_neon),
+    make_tuple(64, 128, &convolve8_neon),
+    make_tuple(128, 128, &convolve8_neon),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_neon),
     make_tuple(8, 4, &convolve8_neon),
     make_tuple(4, 8, &convolve8_neon),
@@ -1318,7 +1378,8 @@
     make_tuple(64, 64, &convolve8_neon)));
 #endif  // HAVE_NEON
 
-#if HAVE_DSPR2
+// TODO(any): Make DSPR2 versions support 128x128, 128x64, 64x128 block sizes
+#if HAVE_DSPR2 && !(CONFIG_VP10 && CONFIG_EXT_PARTITION)
 const ConvolveFunctions convolve8_dspr2(
     vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2,
     vpx_convolve8_horiz_dspr2, vpx_convolve8_avg_horiz_dspr2,
@@ -1329,6 +1390,11 @@
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_dspr2),
+    make_tuple(64, 128, &convolve8_dspr2),
+    make_tuple(128, 128, &convolve8_dspr2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_dspr2),
     make_tuple(8, 4, &convolve8_dspr2),
     make_tuple(4, 8, &convolve8_dspr2),
@@ -1344,7 +1410,8 @@
     make_tuple(64, 64, &convolve8_dspr2)));
 #endif
 
-#if HAVE_MSA
+// TODO(any): Make MSA versions support 128x128, 128x64, 64x128 block sizes
+#if HAVE_MSA && !(CONFIG_VP10 && CONFIG_EXT_PARTITION)
 const ConvolveFunctions convolve8_msa(
     vpx_convolve_copy_msa, vpx_convolve_avg_msa,
     vpx_convolve8_horiz_msa, vpx_convolve8_avg_horiz_msa,
@@ -1355,6 +1422,11 @@
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_msa),
+    make_tuple(64, 128, &convolve8_msa),
+    make_tuple(128, 128, &convolve8_msa),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_msa),
     make_tuple(8, 4, &convolve8_msa),
     make_tuple(4, 8, &convolve8_msa),
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 0c91aee..59ce895 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -19,8 +19,8 @@
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
+#include "test/transform_test_base.h"
 #include "test/util.h"
-#include "vp9/common/vp9_entropy.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
@@ -28,16 +28,16 @@
 using libvpx_test::ACMRandom;
 
 namespace {
-const int kNumCoeffs = 16;
 typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
 typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
-                        int tx_type);
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         int tx_type);
+using libvpx_test::FhtFunc;
 
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct4x4Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht4x4Param;
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t, int>
+Dct4x4Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t, int>
+Ht4x4Param;
 
 void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
                  int /*tx_type*/) {
@@ -89,197 +89,9 @@
 #endif  // HAVE_SSE2
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-class Trans4x4TestBase {
- public:
-  virtual ~Trans4x4TestBase() {}
-
- protected:
-  virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;
-
-  virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;
-
-  void RunAccuracyCheck(int limit) {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    uint32_t max_error = 0;
-    int64_t total_error = 0;
-    const int count_test_block = 10000;
-    for (int i = 0; i < count_test_block; ++i) {
-      DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
-      DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
-      DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
-      DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_VP9_HIGHBITDEPTH
-      DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
-      DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-#endif
-
-      // Initialize a test block with input range [-255, 255].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        if (bit_depth_ == VPX_BITS_8) {
-          src[j] = rnd.Rand8();
-          dst[j] = rnd.Rand8();
-          test_input_block[j] = src[j] - dst[j];
-#if CONFIG_VP9_HIGHBITDEPTH
-        } else {
-          src16[j] = rnd.Rand16() & mask_;
-          dst16[j] = rnd.Rand16() & mask_;
-          test_input_block[j] = src16[j] - dst16[j];
-#endif
-        }
-      }
-
-      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
-                                          test_temp_block, pitch_));
-      if (bit_depth_ == VPX_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
-#if CONFIG_VP9_HIGHBITDEPTH
-      } else {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block,
-                                            CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif
-      }
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_VP9_HIGHBITDEPTH
-        const uint32_t diff =
-            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-        ASSERT_EQ(VPX_BITS_8, bit_depth_);
-        const uint32_t diff = dst[j] - src[j];
-#endif
-        const uint32_t error = diff * diff;
-        if (max_error < error)
-          max_error = error;
-        total_error += error;
-      }
-    }
-
-    EXPECT_GE(static_cast<uint32_t>(limit), max_error)
-        << "Error: 4x4 FHT/IHT has an individual round trip error > "
-        << limit;
-
-    EXPECT_GE(count_test_block * limit, total_error)
-        << "Error: 4x4 FHT/IHT has average round trip error > " << limit
-        << " per block";
-  }
-
-  void RunCoeffCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 5000;
-    DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j)
-        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
-
-      fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
-      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
-
-      // The minimum quant value is 4.
-      for (int j = 0; j < kNumCoeffs; ++j)
-        EXPECT_EQ(output_block[j], output_ref_block[j]);
-    }
-  }
-
-  void RunMemCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 5000;
-    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
-      }
-      if (i == 0) {
-        for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = mask_;
-      } else if (i == 1) {
-        for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = -mask_;
-      }
-
-      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
-      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
-                                          output_block, pitch_));
-
-      // The minimum quant value is 4.
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        EXPECT_EQ(output_block[j], output_ref_block[j]);
-        EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
-            << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
-      }
-    }
-  }
-
-  void RunInvAccuracyCheck(int limit) {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
-    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_VP9_HIGHBITDEPTH
-    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-#endif
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        if (bit_depth_ == VPX_BITS_8) {
-          src[j] = rnd.Rand8();
-          dst[j] = rnd.Rand8();
-          in[j] = src[j] - dst[j];
-#if CONFIG_VP9_HIGHBITDEPTH
-        } else {
-          src16[j] = rnd.Rand16() & mask_;
-          dst16[j] = rnd.Rand16() & mask_;
-          in[j] = src16[j] - dst16[j];
-#endif
-        }
-      }
-
-      fwd_txfm_ref(in, coeff, pitch_, tx_type_);
-
-      if (bit_depth_ == VPX_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
-#if CONFIG_VP9_HIGHBITDEPTH
-      } else {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
-                                            pitch_));
-#endif
-      }
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_VP9_HIGHBITDEPTH
-        const uint32_t diff =
-            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-        const uint32_t diff = dst[j] - src[j];
-#endif
-        const uint32_t error = diff * diff;
-        EXPECT_GE(static_cast<uint32_t>(limit), error)
-            << "Error: 4x4 IDCT has error " << error
-            << " at index " << j;
-      }
-    }
-  }
-
-  int pitch_;
-  int tx_type_;
-  FhtFunc fwd_txfm_ref;
-  vpx_bit_depth_t bit_depth_;
-  int mask_;
-};
 
 class Trans4x4DCT
-    : public Trans4x4TestBase,
+    : public libvpx_test::TransformTestBase,
       public ::testing::TestWithParam<Dct4x4Param> {
  public:
   virtual ~Trans4x4DCT() {}
@@ -292,6 +104,7 @@
     fwd_txfm_ref = fdct4x4_ref;
     bit_depth_ = GET_PARAM(3);
     mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = GET_PARAM(4);
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
@@ -324,7 +137,7 @@
 }
 
 class Trans4x4HT
-    : public Trans4x4TestBase,
+    : public libvpx_test::TransformTestBase,
       public ::testing::TestWithParam<Ht4x4Param> {
  public:
   virtual ~Trans4x4HT() {}
@@ -337,6 +150,7 @@
     fwd_txfm_ref = fht4x4_ref;
     bit_depth_ = GET_PARAM(3);
     mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = GET_PARAM(4);
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
@@ -370,7 +184,7 @@
 }
 
 class Trans4x4WHT
-    : public Trans4x4TestBase,
+    : public libvpx_test::TransformTestBase,
       public ::testing::TestWithParam<Dct4x4Param> {
  public:
   virtual ~Trans4x4WHT() {}
@@ -383,6 +197,7 @@
     fwd_txfm_ref = fwht4x4_ref;
     bit_depth_ = GET_PARAM(3);
     mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = GET_PARAM(4);
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
@@ -419,54 +234,54 @@
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4DCT,
     ::testing::Values(
-        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12),
-        make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8)));
+        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10, 16),
+        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12, 16),
+        make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8, 16)));
 #else
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4DCT,
     ::testing::Values(
-        make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8)));
+        make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8, 16)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10, 16),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10, 16),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10, 16),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10, 16),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12, 16),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12, 16),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12, 16),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8, 16)));
 #else
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8, 16)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4WHT,
     ::testing::Values(
-        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
+        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10, 16),
+        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12, 16),
+        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8, 16)));
 #else
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4WHT,
     ::testing::Values(
-        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
+        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8, 16)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -474,17 +289,17 @@
     NEON, Trans4x4DCT,
     ::testing::Values(
         make_tuple(&vpx_fdct4x4_c,
-                   &vpx_idct4x4_16_add_neon, 0, VPX_BITS_8)));
+                   &vpx_idct4x4_16_add_neon, 0, VPX_BITS_8, 16)));
 #endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8, 16)));
 #endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \
@@ -492,7 +307,8 @@
 INSTANTIATE_TEST_CASE_P(
     MMX, Trans4x4WHT,
     ::testing::Values(
-        make_tuple(&vp9_fwht4x4_mmx, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
+        make_tuple(&vp9_fwht4x4_mmx, &vpx_iwht4x4_16_add_c, 0,
+                   VPX_BITS_8, 16)));
 #endif
 
 #if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && \
@@ -500,7 +316,8 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4WHT,
     ::testing::Values(
-        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8)));
+        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0,
+                   VPX_BITS_8, 16)));
 #endif
 
 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -508,47 +325,60 @@
     SSE2, Trans4x4DCT,
     ::testing::Values(
         make_tuple(&vpx_fdct4x4_sse2,
-                   &vpx_idct4x4_16_add_sse2, 0, VPX_BITS_8)));
+                   &vpx_idct4x4_16_add_sse2, 0, VPX_BITS_8, 16)));
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0,
+                   VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1,
+                   VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2,
+                   VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3,
+                   VPX_BITS_8, 16)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4DCT,
     ::testing::Values(
-        make_tuple(&vpx_highbd_fdct4x4_c,    &idct4x4_10_sse2, 0, VPX_BITS_10),
-        make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10),
-        make_tuple(&vpx_highbd_fdct4x4_c,    &idct4x4_12_sse2, 0, VPX_BITS_12),
-        make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12),
+        make_tuple(&vpx_highbd_fdct4x4_c,    &idct4x4_10_sse2, 0,
+                   VPX_BITS_10, 16),
+        make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0,
+                   VPX_BITS_10, 16),
+        make_tuple(&vpx_highbd_fdct4x4_c,    &idct4x4_12_sse2, 0,
+                   VPX_BITS_12, 16),
+        make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0,
+                   VPX_BITS_12, 16),
         make_tuple(&vpx_fdct4x4_sse2,      &vpx_idct4x4_16_add_c, 0,
-                   VPX_BITS_8)));
+                   VPX_BITS_8, 16)));
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8, 16)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     MSA, Trans4x4DCT,
     ::testing::Values(
-        make_tuple(&vpx_fdct4x4_msa, &vpx_idct4x4_16_add_msa, 0, VPX_BITS_8)));
+        make_tuple(&vpx_fdct4x4_msa, &vpx_idct4x4_16_add_msa, 0,
+                   VPX_BITS_8, 16)));
 INSTANTIATE_TEST_CASE_P(
     MSA, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 3, VPX_BITS_8)));
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 0,
+                   VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 1,
+                   VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 2,
+                   VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 3,
+                   VPX_BITS_8, 16)));
 #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc
index c09104c..34223ea 100644
--- a/test/masked_sad_test.cc
+++ b/test/masked_sad_test.cc
@@ -50,16 +50,16 @@
 TEST_P(MaskedSADTest, OperationCheck) {
   unsigned int ref_ret, ret;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint8_t,  src_ptr[4096]);
-  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[4096]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[4096]);
+  DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = 64;
-  int ref_stride = 64;
-  int msk_stride = 64;
+  int src_stride = MAX_CU_SIZE;
+  int ref_stride = MAX_CU_SIZE;
+  int msk_stride = MAX_CU_SIZE;
   for (int i = 0; i < number_of_iterations; ++i) {
-    for (int j = 0; j < 4096; j++) {
+    for (int j = 0; j < MAX_CU_SIZE*MAX_CU_SIZE; j++) {
       src_ptr[j] = rnd.Rand8();
       ref_ptr[j] = rnd.Rand8();
       msk_ptr[j] = ((rnd.Rand8()&0x7f) > 64) ? rnd.Rand8()&0x3f : 64;
@@ -108,18 +108,18 @@
 TEST_P(HighbdMaskedSADTest, OperationCheck) {
   unsigned int ref_ret, ret;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint16_t,  src_ptr[4096]);
-  DECLARE_ALIGNED(16, uint16_t,  ref_ptr[4096]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[4096]);
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
   uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
   uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = 64;
-  int ref_stride = 64;
-  int msk_stride = 64;
+  int src_stride = MAX_CU_SIZE;
+  int ref_stride = MAX_CU_SIZE;
+  int msk_stride = MAX_CU_SIZE;
   for (int i = 0; i < number_of_iterations; ++i) {
-    for (int j = 0; j < 4096; j++) {
+    for (int j = 0; j < MAX_CU_SIZE*MAX_CU_SIZE; j++) {
       src_ptr[j] = rnd.Rand16()&0xfff;
       ref_ptr[j] = rnd.Rand16()&0xfff;
       msk_ptr[j] = ((rnd.Rand8()&0x7f) > 64) ? rnd.Rand8()&0x3f : 64;
@@ -148,6 +148,14 @@
 INSTANTIATE_TEST_CASE_P(
   SSSE3_C_COMPARE, MaskedSADTest,
   ::testing::Values(
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_masked_sad128x128_ssse3,
+               &vpx_masked_sad128x128_c),
+    make_tuple(&vpx_masked_sad128x64_ssse3,
+               &vpx_masked_sad128x64_c),
+    make_tuple(&vpx_masked_sad64x128_ssse3,
+               &vpx_masked_sad64x128_c),
+#endif  // CONFIG_EXT_PARTITION
     make_tuple(&vpx_masked_sad64x64_ssse3,
                &vpx_masked_sad64x64_c),
     make_tuple(&vpx_masked_sad64x32_ssse3,
@@ -178,32 +186,40 @@
 INSTANTIATE_TEST_CASE_P(
   SSSE3_C_COMPARE, HighbdMaskedSADTest,
   ::testing::Values(
-    make_tuple(&vp9_highbd_masked_sad64x64_ssse3,
-               &vp9_highbd_masked_sad64x64_c),
-    make_tuple(&vp9_highbd_masked_sad64x32_ssse3,
-               &vp9_highbd_masked_sad64x32_c),
-    make_tuple(&vp9_highbd_masked_sad32x64_ssse3,
-               &vp9_highbd_masked_sad32x64_c),
-    make_tuple(&vp9_highbd_masked_sad32x32_ssse3,
-               &vp9_highbd_masked_sad32x32_c),
-    make_tuple(&vp9_highbd_masked_sad32x16_ssse3,
-               &vp9_highbd_masked_sad32x16_c),
-    make_tuple(&vp9_highbd_masked_sad16x32_ssse3,
-               &vp9_highbd_masked_sad16x32_c),
-    make_tuple(&vp9_highbd_masked_sad16x16_ssse3,
-               &vp9_highbd_masked_sad16x16_c),
-    make_tuple(&vp9_highbd_masked_sad16x8_ssse3,
-               &vp9_highbd_masked_sad16x8_c),
-    make_tuple(&vp9_highbd_masked_sad8x16_ssse3,
-               &vp9_highbd_masked_sad8x16_c),
-    make_tuple(&vp9_highbd_masked_sad8x8_ssse3,
-               &vp9_highbd_masked_sad8x8_c),
-    make_tuple(&vp9_highbd_masked_sad8x4_ssse3,
-               &vp9_highbd_masked_sad8x4_c),
-    make_tuple(&vp9_highbd_masked_sad4x8_ssse3,
-               &vp9_highbd_masked_sad4x8_c),
-    make_tuple(&vp9_highbd_masked_sad4x4_ssse3,
-               &vp9_highbd_masked_sad4x4_c)));
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_sad128x128_ssse3,
+               &vpx_highbd_masked_sad128x128_c),
+    make_tuple(&vpx_highbd_masked_sad128x64_ssse3,
+               &vpx_highbd_masked_sad128x64_c),
+    make_tuple(&vpx_highbd_masked_sad64x128_ssse3,
+               &vpx_highbd_masked_sad64x128_c),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_sad64x64_ssse3,
+               &vpx_highbd_masked_sad64x64_c),
+    make_tuple(&vpx_highbd_masked_sad64x32_ssse3,
+               &vpx_highbd_masked_sad64x32_c),
+    make_tuple(&vpx_highbd_masked_sad32x64_ssse3,
+               &vpx_highbd_masked_sad32x64_c),
+    make_tuple(&vpx_highbd_masked_sad32x32_ssse3,
+               &vpx_highbd_masked_sad32x32_c),
+    make_tuple(&vpx_highbd_masked_sad32x16_ssse3,
+               &vpx_highbd_masked_sad32x16_c),
+    make_tuple(&vpx_highbd_masked_sad16x32_ssse3,
+               &vpx_highbd_masked_sad16x32_c),
+    make_tuple(&vpx_highbd_masked_sad16x16_ssse3,
+               &vpx_highbd_masked_sad16x16_c),
+    make_tuple(&vpx_highbd_masked_sad16x8_ssse3,
+               &vpx_highbd_masked_sad16x8_c),
+    make_tuple(&vpx_highbd_masked_sad8x16_ssse3,
+               &vpx_highbd_masked_sad8x16_c),
+    make_tuple(&vpx_highbd_masked_sad8x8_ssse3,
+               &vpx_highbd_masked_sad8x8_c),
+    make_tuple(&vpx_highbd_masked_sad8x4_ssse3,
+               &vpx_highbd_masked_sad8x4_c),
+    make_tuple(&vpx_highbd_masked_sad4x8_ssse3,
+               &vpx_highbd_masked_sad4x8_c),
+    make_tuple(&vpx_highbd_masked_sad4x4_ssse3,
+               &vpx_highbd_masked_sad4x4_c)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSSE3
 }  // namespace
diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc
index fc37759..1f8bf1e2 100644
--- a/test/masked_variance_test.cc
+++ b/test/masked_variance_test.cc
@@ -20,10 +20,10 @@
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_filter.h"
-
-#define MAX_SIZE 64
+#include "vpx_mem/vpx_mem.h"
 
 using libvpx_test::ACMRandom;
 
@@ -58,17 +58,17 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint8_t,  src_ptr[MAX_SIZE*MAX_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[MAX_SIZE*MAX_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_SIZE*MAX_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = MAX_SIZE;
-  int ref_stride = MAX_SIZE;
-  int msk_stride = MAX_SIZE;
+  int src_stride = MAX_CU_SIZE;
+  int ref_stride = MAX_CU_SIZE;
+  int msk_stride = MAX_CU_SIZE;
 
   for (int i = 0; i < number_of_iterations; ++i) {
-    for (int j = 0; j < MAX_SIZE*MAX_SIZE; j++) {
+    for (int j = 0; j < MAX_CU_SIZE*MAX_CU_SIZE; j++) {
       src_ptr[j] = rnd.Rand8();
       ref_ptr[j] = rnd.Rand8();
       msk_ptr[j] = rnd(65);
@@ -100,19 +100,19 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint8_t,  src_ptr[MAX_SIZE*MAX_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[MAX_SIZE*MAX_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_SIZE*MAX_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = MAX_SIZE;
-  int ref_stride = MAX_SIZE;
-  int msk_stride = MAX_SIZE;
+  int src_stride = MAX_CU_SIZE;
+  int ref_stride = MAX_CU_SIZE;
+  int msk_stride = MAX_CU_SIZE;
 
   for (int i = 0; i < 8; ++i) {
-    memset(src_ptr, (i & 0x1) ? 255 : 0, MAX_SIZE*MAX_SIZE);
-    memset(ref_ptr, (i & 0x2) ? 255 : 0, MAX_SIZE*MAX_SIZE);
-    memset(msk_ptr, (i & 0x4) ?  64 : 0, MAX_SIZE*MAX_SIZE);
+    memset(src_ptr, (i & 0x1) ? 255 : 0, MAX_CU_SIZE*MAX_CU_SIZE);
+    memset(ref_ptr, (i & 0x2) ? 255 : 0, MAX_CU_SIZE*MAX_CU_SIZE);
+    memset(msk_ptr, (i & 0x4) ?  64 : 0, MAX_CU_SIZE*MAX_CU_SIZE);
 
     ref_ret = ref_func_(src_ptr, src_stride,
                         ref_ptr, ref_stride,
@@ -166,21 +166,21 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint8_t,  src_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
-  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  src_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = (MAX_SIZE+1);
-  int ref_stride = (MAX_SIZE+1);
-  int msk_stride = (MAX_SIZE+1);
+  int src_stride = (MAX_CU_SIZE+1);
+  int ref_stride = (MAX_CU_SIZE+1);
+  int msk_stride = (MAX_CU_SIZE+1);
   int xoffset;
   int yoffset;
 
   for (int i = 0; i < number_of_iterations; ++i) {
     int xoffsets[] = {0, 4, rnd(BIL_SUBPEL_SHIFTS)};
     int yoffsets[] = {0, 4, rnd(BIL_SUBPEL_SHIFTS)};
-    for (int j = 0; j < (MAX_SIZE+1)*(MAX_SIZE+1); j++) {
+    for (int j = 0; j < (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1); j++) {
       src_ptr[j] = rnd.Rand8();
       ref_ptr[j] = rnd.Rand8();
       msk_ptr[j] = rnd(65);
@@ -221,23 +221,23 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint8_t,  src_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
-  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  src_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
   int first_failure_x = -1;
   int first_failure_y = -1;
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = (MAX_SIZE+1);
-  int ref_stride = (MAX_SIZE+1);
-  int msk_stride = (MAX_SIZE+1);
+  int src_stride = (MAX_CU_SIZE+1);
+  int ref_stride = (MAX_CU_SIZE+1);
+  int msk_stride = (MAX_CU_SIZE+1);
 
   for (int xoffset = 0 ; xoffset < BIL_SUBPEL_SHIFTS ; xoffset++) {
     for (int yoffset = 0 ; yoffset < BIL_SUBPEL_SHIFTS ; yoffset++) {
       for (int i = 0; i < 8; ++i) {
-        memset(src_ptr, (i & 0x1) ? 255 : 0, (MAX_SIZE+1)*(MAX_SIZE+1));
-        memset(ref_ptr, (i & 0x2) ? 255 : 0, (MAX_SIZE+1)*(MAX_SIZE+1));
-        memset(msk_ptr, (i & 0x4) ?  64 : 0, (MAX_SIZE+1)*(MAX_SIZE+1));
+        memset(src_ptr, (i & 0x1) ? 255 : 0, (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1));
+        memset(ref_ptr, (i & 0x2) ? 255 : 0, (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1));
+        memset(msk_ptr, (i & 0x4) ?  64 : 0, (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1));
 
         ref_ret = ref_func_(src_ptr, src_stride,
                             xoffset, yoffset,
@@ -297,19 +297,19 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SIZE*MAX_SIZE]);
-  DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SIZE*MAX_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_SIZE*MAX_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
   uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
   uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = MAX_SIZE;
-  int ref_stride = MAX_SIZE;
-  int msk_stride = MAX_SIZE;
+  int src_stride = MAX_CU_SIZE;
+  int ref_stride = MAX_CU_SIZE;
+  int msk_stride = MAX_CU_SIZE;
 
   for (int i = 0; i < number_of_iterations; ++i) {
-    for (int j = 0; j < MAX_SIZE*MAX_SIZE; j++) {
+    for (int j = 0; j < MAX_CU_SIZE*MAX_CU_SIZE; j++) {
       src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
       ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
       msk_ptr[j] = rnd(65);
@@ -341,23 +341,23 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SIZE*MAX_SIZE]);
-  DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SIZE*MAX_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_SIZE*MAX_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
   uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
   uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = MAX_SIZE;
-  int ref_stride = MAX_SIZE;
-  int msk_stride = MAX_SIZE;
+  int src_stride = MAX_CU_SIZE;
+  int ref_stride = MAX_CU_SIZE;
+  int msk_stride = MAX_CU_SIZE;
 
   for (int i = 0; i < 8; ++i) {
     vpx_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
-                 MAX_SIZE*MAX_SIZE);
+                 MAX_CU_SIZE*MAX_CU_SIZE);
     vpx_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
-                 MAX_SIZE*MAX_SIZE);
-    memset(msk_ptr, (i & 0x4) ?  64 : 0, MAX_SIZE*MAX_SIZE);
+                 MAX_CU_SIZE*MAX_CU_SIZE);
+    memset(msk_ptr, (i & 0x4) ?  64 : 0, MAX_CU_SIZE*MAX_CU_SIZE);
 
     ref_ret = ref_func_(src8_ptr, src_stride,
                         ref8_ptr, ref_stride,
@@ -407,24 +407,24 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
-  DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
   uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
   uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
   int err_count = 0;
   int first_failure = -1;
   int first_failure_x = -1;
   int first_failure_y = -1;
-  int src_stride = (MAX_SIZE+1);
-  int ref_stride = (MAX_SIZE+1);
-  int msk_stride = (MAX_SIZE+1);
+  int src_stride = (MAX_CU_SIZE+1);
+  int ref_stride = (MAX_CU_SIZE+1);
+  int msk_stride = (MAX_CU_SIZE+1);
   int xoffset, yoffset;
 
   for (int i = 0; i < number_of_iterations; ++i) {
     for (xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
       for (yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
-        for (int j = 0; j < (MAX_SIZE+1)*(MAX_SIZE+1); j++) {
+        for (int j = 0; j < (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1); j++) {
           src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
           ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
           msk_ptr[j] = rnd(65);
@@ -465,27 +465,27 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
-  DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
   uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
   uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
   int first_failure_x = -1;
   int first_failure_y = -1;
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = (MAX_SIZE+1);
-  int ref_stride = (MAX_SIZE+1);
-  int msk_stride = (MAX_SIZE+1);
+  int src_stride = (MAX_CU_SIZE+1);
+  int ref_stride = (MAX_CU_SIZE+1);
+  int msk_stride = (MAX_CU_SIZE+1);
 
   for (int xoffset = 0 ; xoffset < BIL_SUBPEL_SHIFTS ; xoffset++) {
     for (int yoffset = 0 ; yoffset < BIL_SUBPEL_SHIFTS ; yoffset++) {
       for (int i = 0; i < 8; ++i) {
         vpx_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
-                     (MAX_SIZE+1)*(MAX_SIZE+1));
+                     (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1));
         vpx_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
-                     (MAX_SIZE+1)*(MAX_SIZE+1));
-        memset(msk_ptr, (i & 0x4) ?   64 : 0, (MAX_SIZE+1)*(MAX_SIZE+1));
+                     (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1));
+        memset(msk_ptr, (i & 0x4) ?   64 : 0, (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1));
 
         ref_ret = ref_func_(src8_ptr, src_stride,
                             xoffset, yoffset,
@@ -525,6 +525,14 @@
 INSTANTIATE_TEST_CASE_P(
   SSSE3_C_COMPARE, MaskedVarianceTest,
   ::testing::Values(
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_masked_variance128x128_ssse3,
+               &vpx_masked_variance128x128_c),
+    make_tuple(&vpx_masked_variance128x64_ssse3,
+               &vpx_masked_variance128x64_c),
+    make_tuple(&vpx_masked_variance64x128_ssse3,
+               &vpx_masked_variance64x128_c),
+#endif  // CONFIG_EXT_PARTITION
     make_tuple(&vpx_masked_variance64x64_ssse3,
                &vpx_masked_variance64x64_c),
     make_tuple(&vpx_masked_variance64x32_ssse3,
@@ -555,197 +563,253 @@
 INSTANTIATE_TEST_CASE_P(
   SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
   ::testing::Values(
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_masked_sub_pixel_variance128x128_ssse3,
+               &vpx_masked_sub_pixel_variance128x128_c),
+    make_tuple(&vpx_masked_sub_pixel_variance128x64_ssse3,
+               &vpx_masked_sub_pixel_variance128x64_c),
+    make_tuple(&vpx_masked_sub_pixel_variance64x128_ssse3,
+               &vpx_masked_sub_pixel_variance64x128_c),
+#endif  // CONFIG_EXT_PARTITION
     make_tuple(&vpx_masked_sub_pixel_variance64x64_ssse3,
-              &vpx_masked_sub_pixel_variance64x64_c),
+               &vpx_masked_sub_pixel_variance64x64_c),
     make_tuple(&vpx_masked_sub_pixel_variance64x32_ssse3,
-              &vpx_masked_sub_pixel_variance64x32_c),
+               &vpx_masked_sub_pixel_variance64x32_c),
     make_tuple(&vpx_masked_sub_pixel_variance32x64_ssse3,
-              &vpx_masked_sub_pixel_variance32x64_c),
+               &vpx_masked_sub_pixel_variance32x64_c),
     make_tuple(&vpx_masked_sub_pixel_variance32x32_ssse3,
-              &vpx_masked_sub_pixel_variance32x32_c),
+               &vpx_masked_sub_pixel_variance32x32_c),
     make_tuple(&vpx_masked_sub_pixel_variance32x16_ssse3,
-              &vpx_masked_sub_pixel_variance32x16_c),
+               &vpx_masked_sub_pixel_variance32x16_c),
     make_tuple(&vpx_masked_sub_pixel_variance16x32_ssse3,
-              &vpx_masked_sub_pixel_variance16x32_c),
+               &vpx_masked_sub_pixel_variance16x32_c),
     make_tuple(&vpx_masked_sub_pixel_variance16x16_ssse3,
-              &vpx_masked_sub_pixel_variance16x16_c),
+               &vpx_masked_sub_pixel_variance16x16_c),
     make_tuple(&vpx_masked_sub_pixel_variance16x8_ssse3,
-              &vpx_masked_sub_pixel_variance16x8_c),
+               &vpx_masked_sub_pixel_variance16x8_c),
     make_tuple(&vpx_masked_sub_pixel_variance8x16_ssse3,
-              &vpx_masked_sub_pixel_variance8x16_c),
+               &vpx_masked_sub_pixel_variance8x16_c),
     make_tuple(&vpx_masked_sub_pixel_variance8x8_ssse3,
-              &vpx_masked_sub_pixel_variance8x8_c),
+               &vpx_masked_sub_pixel_variance8x8_c),
     make_tuple(&vpx_masked_sub_pixel_variance8x4_ssse3,
-              &vpx_masked_sub_pixel_variance8x4_c),
+               &vpx_masked_sub_pixel_variance8x4_c),
     make_tuple(&vpx_masked_sub_pixel_variance4x8_ssse3,
-              &vpx_masked_sub_pixel_variance4x8_c),
+               &vpx_masked_sub_pixel_variance4x8_c),
     make_tuple(&vpx_masked_sub_pixel_variance4x4_ssse3,
-              &vpx_masked_sub_pixel_variance4x4_c)));
+               &vpx_masked_sub_pixel_variance4x4_c)));
 
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
   SSSE3_C_COMPARE, HighbdMaskedVarianceTest,
   ::testing::Values(
-    make_tuple(&vp9_highbd_masked_variance64x64_ssse3,
-               &vp9_highbd_masked_variance64x64_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance64x32_ssse3,
-               &vp9_highbd_masked_variance64x32_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance32x64_ssse3,
-               &vp9_highbd_masked_variance32x64_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance32x32_ssse3,
-               &vp9_highbd_masked_variance32x32_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance32x16_ssse3,
-               &vp9_highbd_masked_variance32x16_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance16x32_ssse3,
-               &vp9_highbd_masked_variance16x32_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance16x16_ssse3,
-               &vp9_highbd_masked_variance16x16_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance16x8_ssse3,
-               &vp9_highbd_masked_variance16x8_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance8x16_ssse3,
-               &vp9_highbd_masked_variance8x16_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance8x8_ssse3,
-               &vp9_highbd_masked_variance8x8_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance8x4_ssse3,
-               &vp9_highbd_masked_variance8x4_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance4x8_ssse3,
-               &vp9_highbd_masked_variance4x8_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance4x4_ssse3,
-               &vp9_highbd_masked_variance4x4_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_10_masked_variance64x64_ssse3,
-               &vp9_highbd_10_masked_variance64x64_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance64x32_ssse3,
-               &vp9_highbd_10_masked_variance64x32_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance32x64_ssse3,
-               &vp9_highbd_10_masked_variance32x64_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance32x32_ssse3,
-               &vp9_highbd_10_masked_variance32x32_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance32x16_ssse3,
-               &vp9_highbd_10_masked_variance32x16_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance16x32_ssse3,
-               &vp9_highbd_10_masked_variance16x32_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance16x16_ssse3,
-               &vp9_highbd_10_masked_variance16x16_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance16x8_ssse3,
-               &vp9_highbd_10_masked_variance16x8_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance8x16_ssse3,
-               &vp9_highbd_10_masked_variance8x16_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance8x8_ssse3,
-               &vp9_highbd_10_masked_variance8x8_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance8x4_ssse3,
-               &vp9_highbd_10_masked_variance8x4_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance4x8_ssse3,
-               &vp9_highbd_10_masked_variance4x8_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance4x4_ssse3,
-               &vp9_highbd_10_masked_variance4x4_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_12_masked_variance64x64_ssse3,
-               &vp9_highbd_12_masked_variance64x64_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance64x32_ssse3,
-               &vp9_highbd_12_masked_variance64x32_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance32x64_ssse3,
-               &vp9_highbd_12_masked_variance32x64_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance32x32_ssse3,
-               &vp9_highbd_12_masked_variance32x32_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance32x16_ssse3,
-               &vp9_highbd_12_masked_variance32x16_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance16x32_ssse3,
-               &vp9_highbd_12_masked_variance16x32_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance16x16_ssse3,
-               &vp9_highbd_12_masked_variance16x16_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance16x8_ssse3,
-               &vp9_highbd_12_masked_variance16x8_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance8x16_ssse3,
-               &vp9_highbd_12_masked_variance8x16_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance8x8_ssse3,
-               &vp9_highbd_12_masked_variance8x8_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance8x4_ssse3,
-               &vp9_highbd_12_masked_variance8x4_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance4x8_ssse3,
-               &vp9_highbd_12_masked_variance4x8_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance4x4_ssse3,
-               &vp9_highbd_12_masked_variance4x4_c, VPX_BITS_12)));
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_variance128x128_ssse3,
+               &vpx_highbd_masked_variance128x128_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance128x64_ssse3,
+               &vpx_highbd_masked_variance128x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance64x128_ssse3,
+               &vpx_highbd_masked_variance64x128_c, VPX_BITS_8),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_variance64x64_ssse3,
+               &vpx_highbd_masked_variance64x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance64x32_ssse3,
+               &vpx_highbd_masked_variance64x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance32x64_ssse3,
+               &vpx_highbd_masked_variance32x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance32x32_ssse3,
+               &vpx_highbd_masked_variance32x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance32x16_ssse3,
+               &vpx_highbd_masked_variance32x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance16x32_ssse3,
+               &vpx_highbd_masked_variance16x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance16x16_ssse3,
+               &vpx_highbd_masked_variance16x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance16x8_ssse3,
+               &vpx_highbd_masked_variance16x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance8x16_ssse3,
+               &vpx_highbd_masked_variance8x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance8x8_ssse3,
+               &vpx_highbd_masked_variance8x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance8x4_ssse3,
+               &vpx_highbd_masked_variance8x4_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance4x8_ssse3,
+               &vpx_highbd_masked_variance4x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance4x4_ssse3,
+               &vpx_highbd_masked_variance4x4_c, VPX_BITS_8),
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_10_masked_variance128x128_ssse3,
+               &vpx_highbd_10_masked_variance128x128_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance128x64_ssse3,
+               &vpx_highbd_10_masked_variance128x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance64x128_ssse3,
+               &vpx_highbd_10_masked_variance64x128_c, VPX_BITS_10),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_10_masked_variance64x64_ssse3,
+               &vpx_highbd_10_masked_variance64x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance64x32_ssse3,
+               &vpx_highbd_10_masked_variance64x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance32x64_ssse3,
+               &vpx_highbd_10_masked_variance32x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance32x32_ssse3,
+               &vpx_highbd_10_masked_variance32x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance32x16_ssse3,
+               &vpx_highbd_10_masked_variance32x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance16x32_ssse3,
+               &vpx_highbd_10_masked_variance16x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance16x16_ssse3,
+               &vpx_highbd_10_masked_variance16x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance16x8_ssse3,
+               &vpx_highbd_10_masked_variance16x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance8x16_ssse3,
+               &vpx_highbd_10_masked_variance8x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance8x8_ssse3,
+               &vpx_highbd_10_masked_variance8x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance8x4_ssse3,
+               &vpx_highbd_10_masked_variance8x4_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance4x8_ssse3,
+               &vpx_highbd_10_masked_variance4x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance4x4_ssse3,
+               &vpx_highbd_10_masked_variance4x4_c, VPX_BITS_10),
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_12_masked_variance128x128_ssse3,
+               &vpx_highbd_12_masked_variance128x128_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance128x64_ssse3,
+               &vpx_highbd_12_masked_variance128x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance64x128_ssse3,
+               &vpx_highbd_12_masked_variance64x128_c, VPX_BITS_12),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_12_masked_variance64x64_ssse3,
+               &vpx_highbd_12_masked_variance64x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance64x32_ssse3,
+               &vpx_highbd_12_masked_variance64x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance32x64_ssse3,
+               &vpx_highbd_12_masked_variance32x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance32x32_ssse3,
+               &vpx_highbd_12_masked_variance32x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance32x16_ssse3,
+               &vpx_highbd_12_masked_variance32x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance16x32_ssse3,
+               &vpx_highbd_12_masked_variance16x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance16x16_ssse3,
+               &vpx_highbd_12_masked_variance16x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance16x8_ssse3,
+               &vpx_highbd_12_masked_variance16x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance8x16_ssse3,
+               &vpx_highbd_12_masked_variance8x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance8x8_ssse3,
+               &vpx_highbd_12_masked_variance8x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance8x4_ssse3,
+               &vpx_highbd_12_masked_variance8x4_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance4x8_ssse3,
+               &vpx_highbd_12_masked_variance4x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance4x4_ssse3,
+               &vpx_highbd_12_masked_variance4x4_c, VPX_BITS_12)));
 
 INSTANTIATE_TEST_CASE_P(
   SSSE3_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
   ::testing::Values(
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance64x64_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance64x64_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance64x32_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance64x32_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance32x64_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance32x64_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance32x32_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance32x32_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance32x16_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance32x16_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance16x32_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance16x32_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance16x16_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance16x16_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance16x8_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance16x8_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance8x16_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance8x16_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance8x8_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance8x8_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance8x4_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance8x4_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance4x8_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance4x8_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance4x4_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance4x4_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance64x64_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance64x64_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance64x32_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance64x32_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance32x64_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance32x64_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance32x32_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance32x32_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance32x16_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance32x16_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance16x32_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance16x32_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance16x16_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance16x16_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance16x8_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance16x8_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance8x16_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance8x16_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance8x8_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance8x8_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance8x4_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance8x4_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance4x8_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance4x8_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance4x4_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance4x4_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance64x64_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance64x64_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance64x32_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance64x32_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance32x64_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance32x64_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance32x32_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance32x32_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance32x16_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance32x16_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance16x32_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance16x32_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance16x16_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance16x16_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance16x8_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance16x8_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance8x16_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance8x16_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance8x8_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance8x8_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance8x4_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance8x4_c, VPX_BITS_12) ,
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance4x8_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance4x8_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance4x4_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance4x4_c, VPX_BITS_12)));
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance128x128_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance128x128_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance128x64_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance128x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance64x128_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance64x128_c, VPX_BITS_8),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance64x64_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance64x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance64x32_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance64x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance32x64_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance32x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance32x32_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance32x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance32x16_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance32x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance16x32_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance16x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance16x16_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance16x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance16x8_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance16x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance8x16_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance8x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance8x8_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance8x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance8x4_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance8x4_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance4x8_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance4x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance4x4_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance4x4_c, VPX_BITS_8),
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance128x128_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance128x128_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance128x64_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance128x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance64x128_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance64x128_c, VPX_BITS_10),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance64x64_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance64x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance64x32_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance64x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance32x64_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance32x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance32x32_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance32x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance32x16_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance32x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance16x32_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance16x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance16x16_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance16x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance16x8_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance16x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance8x16_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance8x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance8x8_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance8x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance8x4_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance8x4_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance4x8_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance4x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance4x4_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance4x4_c, VPX_BITS_10),
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance128x128_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance128x128_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance128x64_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance128x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance64x128_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance64x128_c, VPX_BITS_12),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance64x64_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance64x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance64x32_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance64x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance32x64_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance32x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance32x32_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance32x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance32x16_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance32x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance16x32_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance16x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance16x16_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance16x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance16x8_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance16x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance8x16_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance8x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance8x8_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance8x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance8x4_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance8x4_c, VPX_BITS_12) ,
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance4x8_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance4x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance4x4_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance4x4_c, VPX_BITS_12)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #endif  // HAVE_SSSE3
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 3f0f74c..1985e18 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -59,13 +59,13 @@
     reference_data8_ = reinterpret_cast<uint8_t*>(
         vpx_memalign(kDataAlignment, kDataBufferSize));
     second_pred8_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, 64*64));
+        vpx_memalign(kDataAlignment, 128*128));
     source_data16_ = reinterpret_cast<uint16_t*>(
         vpx_memalign(kDataAlignment, kDataBlockSize*sizeof(uint16_t)));
     reference_data16_ = reinterpret_cast<uint16_t*>(
         vpx_memalign(kDataAlignment, kDataBufferSize*sizeof(uint16_t)));
     second_pred16_ = reinterpret_cast<uint16_t*>(
-        vpx_memalign(kDataAlignment, 64*64*sizeof(uint16_t)));
+        vpx_memalign(kDataAlignment, 128*128*sizeof(uint16_t)));
   }
 
   static void TearDownTestCase() {
@@ -88,9 +88,9 @@
   }
 
  protected:
-  // Handle blocks up to 4 blocks 64x64 with stride up to 128
+  // Handle up to 4 128x128 blocks, with stride up to 256
   static const int kDataAlignment = 16;
-  static const int kDataBlockSize = 64 * 128;
+  static const int kDataBlockSize = 128 * 256;
   static const int kDataBufferSize = 4 * kDataBlockSize;
 
   virtual void SetUp() {
@@ -485,6 +485,11 @@
 //------------------------------------------------------------------------------
 // C functions
 const SadMxNParam c_tests[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_sad128x128_c, -1),
+  make_tuple(128, 64, &vpx_sad128x64_c, -1),
+  make_tuple(64, 128, &vpx_sad64x128_c, -1),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_sad64x64_c, -1),
   make_tuple(64, 32, &vpx_sad64x32_c, -1),
   make_tuple(32, 64, &vpx_sad32x64_c, -1),
@@ -499,6 +504,11 @@
   make_tuple(4, 8, &vpx_sad4x8_c, -1),
   make_tuple(4, 4, &vpx_sad4x4_c, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_c, 8),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_c, 8),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_c, 8),
   make_tuple(64, 32, &vpx_highbd_sad64x32_c, 8),
   make_tuple(32, 64, &vpx_highbd_sad32x64_c, 8),
@@ -512,6 +522,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4_c, 8),
   make_tuple(4, 8, &vpx_highbd_sad4x8_c, 8),
   make_tuple(4, 4, &vpx_highbd_sad4x4_c, 8),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_c, 10),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_c, 10),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_c, 10),
   make_tuple(64, 32, &vpx_highbd_sad64x32_c, 10),
   make_tuple(32, 64, &vpx_highbd_sad32x64_c, 10),
@@ -525,6 +540,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4_c, 10),
   make_tuple(4, 8, &vpx_highbd_sad4x8_c, 10),
   make_tuple(4, 4, &vpx_highbd_sad4x4_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_c, 12),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_c, 12),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_c, 12),
   make_tuple(64, 32, &vpx_highbd_sad64x32_c, 12),
   make_tuple(32, 64, &vpx_highbd_sad32x64_c, 12),
@@ -543,6 +563,11 @@
 INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
 
 const SadMxNAvgParam avg_c_tests[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_sad128x128_avg_c, -1),
+  make_tuple(128, 64, &vpx_sad128x64_avg_c, -1),
+  make_tuple(64, 128, &vpx_sad64x128_avg_c, -1),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_sad64x64_avg_c, -1),
   make_tuple(64, 32, &vpx_sad64x32_avg_c, -1),
   make_tuple(32, 64, &vpx_sad32x64_avg_c, -1),
@@ -557,6 +582,11 @@
   make_tuple(4, 8, &vpx_sad4x8_avg_c, -1),
   make_tuple(4, 4, &vpx_sad4x4_avg_c, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_avg_c, 8),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_avg_c, 8),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_avg_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_avg_c, 8),
   make_tuple(64, 32, &vpx_highbd_sad64x32_avg_c, 8),
   make_tuple(32, 64, &vpx_highbd_sad32x64_avg_c, 8),
@@ -570,6 +600,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4_avg_c, 8),
   make_tuple(4, 8, &vpx_highbd_sad4x8_avg_c, 8),
   make_tuple(4, 4, &vpx_highbd_sad4x4_avg_c, 8),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_avg_c, 10),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_avg_c, 10),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_avg_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_avg_c, 10),
   make_tuple(64, 32, &vpx_highbd_sad64x32_avg_c, 10),
   make_tuple(32, 64, &vpx_highbd_sad32x64_avg_c, 10),
@@ -583,6 +618,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4_avg_c, 10),
   make_tuple(4, 8, &vpx_highbd_sad4x8_avg_c, 10),
   make_tuple(4, 4, &vpx_highbd_sad4x4_avg_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_avg_c, 12),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_avg_c, 12),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_avg_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_avg_c, 12),
   make_tuple(64, 32, &vpx_highbd_sad64x32_avg_c, 12),
   make_tuple(32, 64, &vpx_highbd_sad32x64_avg_c, 12),
@@ -601,6 +641,11 @@
 INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
 
 const SadMxNx4Param x4d_c_tests[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_sad128x128x4d_c, -1),
+  make_tuple(128, 64, &vpx_sad128x64x4d_c, -1),
+  make_tuple(64, 128, &vpx_sad64x128x4d_c, -1),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_sad64x64x4d_c, -1),
   make_tuple(64, 32, &vpx_sad64x32x4d_c, -1),
   make_tuple(32, 64, &vpx_sad32x64x4d_c, -1),
@@ -615,6 +660,11 @@
   make_tuple(4, 8, &vpx_sad4x8x4d_c, -1),
   make_tuple(4, 4, &vpx_sad4x4x4d_c, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128x4d_c, 8),
+  make_tuple(128, 64, &vpx_highbd_sad128x64x4d_c, 8),
+  make_tuple(64, 128, &vpx_highbd_sad64x128x4d_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64x4d_c, 8),
   make_tuple(64, 32, &vpx_highbd_sad64x32x4d_c, 8),
   make_tuple(32, 64, &vpx_highbd_sad32x64x4d_c, 8),
@@ -628,6 +678,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4x4d_c, 8),
   make_tuple(4, 8, &vpx_highbd_sad4x8x4d_c, 8),
   make_tuple(4, 4, &vpx_highbd_sad4x4x4d_c, 8),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128x4d_c, 10),
+  make_tuple(128, 64, &vpx_highbd_sad128x64x4d_c, 10),
+  make_tuple(64, 128, &vpx_highbd_sad64x128x4d_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64x4d_c, 10),
   make_tuple(64, 32, &vpx_highbd_sad64x32x4d_c, 10),
   make_tuple(32, 64, &vpx_highbd_sad32x64x4d_c, 10),
@@ -641,6 +696,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4x4d_c, 10),
   make_tuple(4, 8, &vpx_highbd_sad4x8x4d_c, 10),
   make_tuple(4, 4, &vpx_highbd_sad4x4x4d_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128x4d_c, 12),
+  make_tuple(128, 64, &vpx_highbd_sad128x64x4d_c, 12),
+  make_tuple(64, 128, &vpx_highbd_sad64x128x4d_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64x4d_c, 12),
   make_tuple(64, 32, &vpx_highbd_sad64x32x4d_c, 12),
   make_tuple(32, 64, &vpx_highbd_sad32x64x4d_c, 12),
diff --git a/test/test.mk b/test/test.mk
index 95dfa16..21b8919 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -9,6 +9,7 @@
 LIBVPX_TEST_SRCS-yes += test_vectors.h
 LIBVPX_TEST_SRCS-yes += util.h
 LIBVPX_TEST_SRCS-yes += video_source.h
+LIBVPX_TEST_SRCS-yes += transform_test_base.h
 
 ##
 ## BLACK BOX TESTS
@@ -165,6 +166,7 @@
 ifeq ($(CONFIG_VP10),yes)
 LIBVPX_TEST_SRCS-yes                    += vp10_inv_txfm_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ANS)          += vp10_ans_test.cc
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc
diff --git a/test/transform_test_base.h b/test/transform_test_base.h
new file mode 100644
index 0000000..cf2facd
--- /dev/null
+++ b/test/transform_test_base.h
@@ -0,0 +1,291 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_TRANSFORM_TEST_BASE_H_
+#define TEST_TRANSFORM_TEST_BASE_H_
+
+#include "./vpx_config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx/vpx_codec.h"
+
+
+namespace libvpx_test {
+
+//  Note:
+//   The same constant is defined in vp9/common/vp9_entropy.h and
+//   vp10/common/entropy.h.  The goal is for this base class to be
+//   usable for transform tests of future codecs, but including
+//   either of those headers would cause compile errors when unit
+//   testing another codec.  Suggestion: move the definition to a
+//   common vpx header file.
+const int kDctMaxValue = 16384;
+
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
+                        int tx_type);
+
+class TransformTestBase {
+ public:
+  virtual ~TransformTestBase() {}
+
+ protected:
+  virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;
+
+  virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;
+
+  void RunAccuracyCheck(int limit) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    uint32_t max_error = 0;
+    int64_t total_error = 0;
+    const int count_test_block = 10000;
+
+    int16_t *test_input_block = reinterpret_cast<int16_t *>
+        (vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
+    tran_low_t *test_temp_block = reinterpret_cast<tran_low_t *>
+        (vpx_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+    uint8_t *dst = reinterpret_cast<uint8_t *>
+        (vpx_memalign(16, sizeof(uint8_t) * num_coeffs_));
+    uint8_t *src = reinterpret_cast<uint8_t *>
+        (vpx_memalign(16, sizeof(uint8_t) * num_coeffs_));
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint16_t *dst16 = reinterpret_cast<uint16_t *>
+        (vpx_memalign(16, sizeof(uint16_t) * num_coeffs_));
+    uint16_t *src16 = reinterpret_cast<uint16_t *>
+        (vpx_memalign(16, sizeof(uint16_t) * num_coeffs_));
+#endif
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Test block input range: [-255, 255] (8-bit) or [-mask_, mask_].
+      for (int j = 0; j < num_coeffs_; ++j) {
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
+        }
+      }
+
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
+                                          test_temp_block, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block,
+                                            CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
+
+      for (int j = 0; j < num_coeffs_; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+        ASSERT_EQ(VPX_BITS_8, bit_depth_);
+        const uint32_t diff = dst[j] - src[j];
+#endif
+        const uint32_t error = diff * diff;
+        if (max_error < error)
+          max_error = error;
+        total_error += error;
+      }
+    }
+
+    EXPECT_GE(static_cast<uint32_t>(limit), max_error)
+        << "Error: 4x4 FHT/IHT has an individual round trip error > "
+        << limit;
+
+    EXPECT_GE(count_test_block * limit, total_error)
+        << "Error: 4x4 FHT/IHT has average round trip error > " << limit
+        << " per block";
+
+    vpx_free(test_input_block);
+    vpx_free(test_temp_block);
+    vpx_free(dst);
+    vpx_free(src);
+#if CONFIG_VP9_HIGHBITDEPTH
+    vpx_free(dst16);
+    vpx_free(src16);
+#endif
+  }
+
+  void RunCoeffCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 5000;
+
+    int16_t *input_block = reinterpret_cast<int16_t *>
+        (vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
+    tran_low_t *output_ref_block = reinterpret_cast<tran_low_t *>
+        (vpx_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+    tran_low_t *output_block = reinterpret_cast<tran_low_t *>
+        (vpx_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-mask_, mask_].
+      for (int j = 0; j < num_coeffs_; ++j)
+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+
+      fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
+
+      // The minimum quant value is 4.
+      for (int j = 0; j < num_coeffs_; ++j) {
+        EXPECT_EQ(output_block[j], output_ref_block[j])
+            << "Error: not bit-exact result at index: " << j
+            << " at test block: " << i;
+      }
+    }
+    vpx_free(input_block);
+    vpx_free(output_ref_block);
+    vpx_free(output_block);
+  }
+
+  void RunMemCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 5000;
+
+    int16_t *input_extreme_block = reinterpret_cast<int16_t *>
+        (vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
+    tran_low_t *output_ref_block = reinterpret_cast<tran_low_t *>
+        (vpx_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+    tran_low_t *output_block = reinterpret_cast<tran_low_t *>
+        (vpx_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Fill the block with extremes: every entry is either mask_ or -mask_.
+      for (int j = 0; j < num_coeffs_; ++j) {
+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
+      }
+      if (i == 0) {
+        for (int j = 0; j < num_coeffs_; ++j)
+          input_extreme_block[j] = mask_;
+      } else if (i == 1) {
+        for (int j = 0; j < num_coeffs_; ++j)
+          input_extreme_block[j] = -mask_;
+      }
+
+      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
+                                          output_block, pitch_));
+
+      int row_length = FindRowLength();
+      // The minimum quant value is 4.
+      for (int j = 0; j < num_coeffs_; ++j) {
+        EXPECT_EQ(output_block[j], output_ref_block[j]);
+        EXPECT_GE(row_length * kDctMaxValue << (bit_depth_ - 8),
+                  abs(output_block[j]))
+            << "Error: NxN FDCT has coefficient larger than N*DCT_MAX_VALUE";
+      }
+    }
+    vpx_free(input_extreme_block);
+    vpx_free(output_ref_block);
+    vpx_free(output_block);
+  }
+
+  void RunInvAccuracyCheck(int limit) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+
+    int16_t *in = reinterpret_cast<int16_t *>
+        (vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
+    tran_low_t *coeff = reinterpret_cast<tran_low_t *>
+        (vpx_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+    uint8_t *dst = reinterpret_cast<uint8_t *>
+        (vpx_memalign(16, sizeof(uint8_t) * num_coeffs_));
+    uint8_t *src = reinterpret_cast<uint8_t *>
+        (vpx_memalign(16, sizeof(uint8_t) * num_coeffs_));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint16_t *dst16 = reinterpret_cast<uint16_t *>
+        (vpx_memalign(16, sizeof(uint16_t) * num_coeffs_));
+    uint16_t *src16 = reinterpret_cast<uint16_t *>
+        (vpx_memalign(16, sizeof(uint16_t) * num_coeffs_));
+#endif
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-mask_, mask_].
+      for (int j = 0; j < num_coeffs_; ++j) {
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          in[j] = src16[j] - dst16[j];
+#endif
+        }
+      }
+
+      fwd_txfm_ref(in, coeff, pitch_, tx_type_);
+
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                            pitch_));
+#endif
+      }
+
+      for (int j = 0; j < num_coeffs_; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+        const uint32_t diff = dst[j] - src[j];
+#endif
+        const uint32_t error = diff * diff;
+        EXPECT_GE(static_cast<uint32_t>(limit), error)
+            << "Error: 4x4 IDCT has error " << error
+            << " at index " << j;
+      }
+    }
+    vpx_free(in);
+    vpx_free(coeff);
+    vpx_free(dst);
+    vpx_free(src);
+#if CONFIG_VP9_HIGHBITDEPTH
+    vpx_free(src16);
+    vpx_free(dst16);
+#endif
+  }
+
+  int pitch_;
+  int tx_type_;
+  FhtFunc fwd_txfm_ref;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
+  int num_coeffs_;
+
+ private:
+  //  Assumes a square transform (4x4, 8x8, 16x16, 32x32); defaults to 4.
+  int FindRowLength() const {
+    int row = 4;
+    if (16 == num_coeffs_) {
+      row = 4;
+    } else if (64 == num_coeffs_) {
+      row = 8;
+    } else if (256 == num_coeffs_) {
+      row = 16;
+    } else if (1024 == num_coeffs_) {
+      row = 32;
+    }
+    return row;
+  }
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_TRANSFORM_TEST_BASE_H_
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 6f50f78..97c5516 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -759,7 +759,13 @@
 
 INSTANTIATE_TEST_CASE_P(
     C, VpxVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_variance64x64_c, 0),
+    ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(7, 7, &vpx_variance128x128_c, 0),
+                      make_tuple(7, 6, &vpx_variance128x64_c, 0),
+                      make_tuple(6, 7, &vpx_variance64x128_c, 0),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(6, 6, &vpx_variance64x64_c, 0),
                       make_tuple(6, 5, &vpx_variance64x32_c, 0),
                       make_tuple(5, 6, &vpx_variance32x64_c, 0),
                       make_tuple(5, 5, &vpx_variance32x32_c, 0),
@@ -775,7 +781,13 @@
 
 INSTANTIATE_TEST_CASE_P(
     C, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
+    ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(7, 7, &vpx_sub_pixel_variance128x128_c, 0),
+                      make_tuple(7, 6, &vpx_sub_pixel_variance128x64_c, 0),
+                      make_tuple(6, 7, &vpx_sub_pixel_variance64x128_c, 0),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
                       make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0),
                       make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0),
                       make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0),
@@ -791,7 +803,13 @@
 
 INSTANTIATE_TEST_CASE_P(
     C, VpxSubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
+    ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(7, 7, &vpx_sub_pixel_avg_variance128x128_c, 0),
+                      make_tuple(7, 6, &vpx_sub_pixel_avg_variance128x64_c, 0),
+                      make_tuple(6, 7, &vpx_sub_pixel_avg_variance64x128_c, 0),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
                       make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0),
                       make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0),
                       make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0),
@@ -841,7 +859,13 @@
 
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_highbd_12_variance64x64_c, 12),
+    ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(7, 7, &vpx_highbd_12_variance128x128_c, 12),
+                      make_tuple(7, 6, &vpx_highbd_12_variance128x64_c, 12),
+                      make_tuple(6, 7, &vpx_highbd_12_variance64x128_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(6, 6, &vpx_highbd_12_variance64x64_c, 12),
                       make_tuple(6, 5, &vpx_highbd_12_variance64x32_c, 12),
                       make_tuple(5, 6, &vpx_highbd_12_variance32x64_c, 12),
                       make_tuple(5, 5, &vpx_highbd_12_variance32x32_c, 12),
@@ -854,6 +878,11 @@
                       make_tuple(3, 2, &vpx_highbd_12_variance8x4_c, 12),
                       make_tuple(2, 3, &vpx_highbd_12_variance4x8_c, 12),
                       make_tuple(2, 2, &vpx_highbd_12_variance4x4_c, 12),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(7, 7, &vpx_highbd_10_variance128x128_c, 10),
+                      make_tuple(7, 6, &vpx_highbd_10_variance128x64_c, 10),
+                      make_tuple(6, 7, &vpx_highbd_10_variance64x128_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
                       make_tuple(6, 6, &vpx_highbd_10_variance64x64_c, 10),
                       make_tuple(6, 5, &vpx_highbd_10_variance64x32_c, 10),
                       make_tuple(5, 6, &vpx_highbd_10_variance32x64_c, 10),
@@ -867,6 +896,11 @@
                       make_tuple(3, 2, &vpx_highbd_10_variance8x4_c, 10),
                       make_tuple(2, 3, &vpx_highbd_10_variance4x8_c, 10),
                       make_tuple(2, 2, &vpx_highbd_10_variance4x4_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(7, 7, &vpx_highbd_8_variance128x128_c, 8),
+                      make_tuple(7, 6, &vpx_highbd_8_variance128x64_c, 8),
+                      make_tuple(6, 7, &vpx_highbd_8_variance64x128_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
                       make_tuple(6, 6, &vpx_highbd_8_variance64x64_c, 8),
                       make_tuple(6, 5, &vpx_highbd_8_variance64x32_c, 8),
                       make_tuple(5, 6, &vpx_highbd_8_variance32x64_c, 8),
@@ -884,6 +918,11 @@
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDSubpelVarianceTest,
     ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(7, 7, &vpx_highbd_8_sub_pixel_variance128x128_c, 8),
+        make_tuple(7, 6, &vpx_highbd_8_sub_pixel_variance128x64_c, 8),
+        make_tuple(6, 7, &vpx_highbd_8_sub_pixel_variance64x128_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
         make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
         make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8),
         make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8),
@@ -897,6 +936,11 @@
         make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8),
         make_tuple(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8),
         make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(7, 7, &vpx_highbd_10_sub_pixel_variance128x128_c, 10),
+        make_tuple(7, 6, &vpx_highbd_10_sub_pixel_variance128x64_c, 10),
+        make_tuple(6, 7, &vpx_highbd_10_sub_pixel_variance64x128_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
         make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, 10),
         make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, 10),
         make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, 10),
@@ -910,6 +954,11 @@
         make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10),
         make_tuple(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10),
         make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(7, 7, &vpx_highbd_12_sub_pixel_variance128x128_c, 12),
+        make_tuple(7, 6, &vpx_highbd_12_sub_pixel_variance128x64_c, 12),
+        make_tuple(6, 7, &vpx_highbd_12_sub_pixel_variance64x128_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
         make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, 12),
         make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, 12),
         make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, 12),
@@ -927,6 +976,11 @@
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDSubpelAvgVarianceTest,
     ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(7, 7, &vpx_highbd_8_sub_pixel_avg_variance128x128_c, 8),
+        make_tuple(7, 6, &vpx_highbd_8_sub_pixel_avg_variance128x64_c, 8),
+        make_tuple(6, 7, &vpx_highbd_8_sub_pixel_avg_variance64x128_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
         make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8),
         make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8),
         make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8),
@@ -940,6 +994,11 @@
         make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, 8),
         make_tuple(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, 8),
         make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, 8),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(7, 7, &vpx_highbd_10_sub_pixel_avg_variance128x128_c, 10),
+        make_tuple(7, 6, &vpx_highbd_10_sub_pixel_avg_variance128x64_c, 10),
+        make_tuple(6, 7, &vpx_highbd_10_sub_pixel_avg_variance64x128_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
         make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_c, 10),
         make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_c, 10),
         make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_c, 10),
@@ -953,6 +1012,11 @@
         make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10),
         make_tuple(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10),
         make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(7, 7, &vpx_highbd_12_sub_pixel_avg_variance128x128_c, 12),
+        make_tuple(7, 6, &vpx_highbd_12_sub_pixel_avg_variance128x64_c, 12),
+        make_tuple(6, 7, &vpx_highbd_12_sub_pixel_avg_variance64x128_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
         make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_c, 12),
         make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_c, 12),
         make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_c, 12),
diff --git a/test/vp10_fht4x4_test.cc b/test/vp10_fht4x4_test.cc
new file mode 100644
index 0000000..d2598f9
--- /dev/null
+++ b/test/vp10_fht4x4_test.cc
@@ -0,0 +1,123 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "vpx_ports/mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                        int tx_type);
+
+using libvpx_test::FhtFunc;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t, int> Ht4x4Param;
+
+void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+                int tx_type) {
+  vp10_fht4x4_c(in, out, stride, tx_type);  // Arguments forwarded unchanged.
+}
+
+class VP10Trans4x4HT
+    : public libvpx_test::TransformTestBase,
+      public ::testing::TestWithParam<Ht4x4Param> {
+ public:
+  virtual ~VP10Trans4x4HT() {}
+
+  virtual void SetUp() {  // Unpacks the Ht4x4Param tuple into the fields.
+    fwd_txfm_ = GET_PARAM(0);  // FhtFunc: forward transform under test.
+    inv_txfm_ = GET_PARAM(1);  // IhtFunc: inverse transform under test.
+    tx_type_  = GET_PARAM(2);
+    pitch_    = 4;
+    fwd_txfm_ref = fht4x4_ref;  // Wrapper around vp10_fht4x4_c.
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;  // Low bit_depth_ bits set.
+    num_coeffs_ = GET_PARAM(4);  // 16 in every instantiation in this file.
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+    fwd_txfm_(in, out, stride, tx_type_);  // tx_type_ was set in SetUp().
+  }
+
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride, tx_type_);  // tx_type_ was set in SetUp().
+  }
+
+  FhtFunc fwd_txfm_;
+  IhtFunc inv_txfm_;
+};
+
+TEST_P(VP10Trans4x4HT, CoeffCheck) {
+  RunCoeffCheck();  // Not defined in this file; presumably in the base class.
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+// Every tuple is (forward transform, inverse transform, tx_type, bit depth,
+// number of coefficients), matching Ht4x4Param.
+//
+// The list lives in an array instead of being spelled out inside
+// INSTANTIATE_TEST_CASE_P: preprocessor directives within a macro's argument
+// list have undefined behavior, and the array also lets the tx_types 0-3,
+// which are tested with and without CONFIG_EXT_TX, be listed only once.
+const Ht4x4Param kArrayHt4x4Param_sse2[] = {
+  make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 0,
+             VPX_BITS_8, 16),
+  make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 1,
+             VPX_BITS_8, 16),
+  make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 2,
+             VPX_BITS_8, 16),
+  make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 3,
+             VPX_BITS_8, 16),
+#if CONFIG_EXT_TX
+  make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 4,
+             VPX_BITS_8, 16),
+  make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 5,
+             VPX_BITS_8, 16),
+  make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 6,
+             VPX_BITS_8, 16),
+  make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 7,
+             VPX_BITS_8, 16),
+  make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 8,
+             VPX_BITS_8, 16),
+  make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 9,
+             VPX_BITS_8, 16),
+  make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 10,
+             VPX_BITS_8, 16),
+  make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 11,
+             VPX_BITS_8, 16),
+  make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 12,
+             VPX_BITS_8, 16),
+  make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 13,
+             VPX_BITS_8, 16),
+  make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 14,
+             VPX_BITS_8, 16),
+  make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 15,
+             VPX_BITS_8, 16),
+#endif  // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP10Trans4x4HT,
+    ::testing::ValuesIn(kArrayHt4x4Param_sse2));
+#endif  // HAVE_SSE2
+
+}  // namespace
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index f96aa2e..a1b5683 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -360,7 +360,7 @@
 #define USE_MSKTX_FOR_32X32      1
 
 static const int num_ext_tx_set_inter[EXT_TX_SETS_INTER] = {
-  1, 17, 10, 2
+  1, 19, 12, 2
 };
 static const int num_ext_tx_set_intra[EXT_TX_SETS_INTRA] = {
   1, 17, 10
@@ -421,10 +421,10 @@
 
 // Transform types used in each inter set
 static const int ext_tx_used_inter[EXT_TX_SETS_INTER][TX_TYPES] = {
-  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
-  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
-  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, },
-  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, },
+  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  // 1 type
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  // 19 types
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1 },  // 12 types
+  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  // 2 types
 };
 
 static INLINE int get_ext_tx_types(TX_SIZE tx_size, BLOCK_SIZE bs,
@@ -580,17 +580,12 @@
 static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi,
                                      const struct macroblockd_plane *pd) {
 #if CONFIG_SUPERTX
-  if (!supertx_enabled(mbmi)) {
-    return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type, pd->subsampling_x,
-                               pd->subsampling_y);
-  } else {
+  if (supertx_enabled(mbmi))
     return uvsupertx_size_lookup[mbmi->tx_size][pd->subsampling_x]
                                                [pd->subsampling_y];
-  }
-#else
+#endif  // CONFIG_SUPERTX
   return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type, pd->subsampling_x,
                              pd->subsampling_y);
-#endif  // CONFIG_SUPERTX
 }
 
 static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c
index 8bb653c..e4c27a7 100644
--- a/vp10/common/entropymode.c
+++ b/vp10/common/entropymode.c
@@ -838,46 +838,52 @@
                                            [TREE_SIZE(TX_TYPES)] = {
   { // ToDo(yaowu): remove used entry 0.
     -IDTX, 2,
-    -DCT_DCT, 4,
-    -DST_DST, 6,
-    8, 18,
-    10, 12,
-    -DST_DCT, -DCT_DST,
+    -V_DCT, 4,
+    -H_DCT, 6,
+    -DCT_DCT, 8,
+    -DST_DST, 10,
+    12, 22,
     14, 16,
+    -DST_DCT, -DCT_DST,
+    18, 20,
     -ADST_DCT, -DCT_ADST,
     -FLIPADST_DCT, -DCT_FLIPADST,
-    20, 26,
-    22, 24,
+    24, 30,
+    26, 28,
     -DST_ADST, -ADST_DST,
     -DST_FLIPADST, -FLIPADST_DST,
-    28, 30,
+    32, 34,
     -ADST_ADST, -FLIPADST_FLIPADST,
     -ADST_FLIPADST, -FLIPADST_ADST,
   }, {
     -IDTX, 2,
-    -DCT_DCT, 4,
-    -DST_DST, 6,
-    8, 18,
-    10, 12,
-    -DST_DCT, -DCT_DST,
+    -V_DCT, 4,
+    -H_DCT, 6,
+    -DCT_DCT, 8,
+    -DST_DST, 10,
+    12, 22,
     14, 16,
+    -DST_DCT, -DCT_DST,
+    18, 20,
     -ADST_DCT, -DCT_ADST,
     -FLIPADST_DCT, -DCT_FLIPADST,
-    20, 26,
-    22, 24,
+    24, 30,
+    26, 28,
     -DST_ADST, -ADST_DST,
     -DST_FLIPADST, -FLIPADST_DST,
-    28, 30,
+    32, 34,
     -ADST_ADST, -FLIPADST_FLIPADST,
     -ADST_FLIPADST, -FLIPADST_ADST,
   }, {
     -IDTX, 2,
-    -DCT_DCT, 4,
-    6, 12,
-    8, 10,
+    -V_DCT, 4,
+    -H_DCT, 6,
+    -DCT_DCT, 8,
+    10, 16,
+    12, 14,
     -ADST_DCT, -DCT_ADST,
     -FLIPADST_DCT, -DCT_FLIPADST,
-    14, 16,
+    18, 20,
     -ADST_ADST, -FLIPADST_FLIPADST,
     -ADST_FLIPADST, -FLIPADST_ADST
   }, {
@@ -937,33 +943,33 @@
 static const vpx_prob
 default_inter_ext_tx_prob[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES - 1] = {
   { // ToDo(yaowu): remove unused entry 0.
-    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-    128 },
-    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-    128 },
-    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-    128 },
+    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 128 },
 #if EXT_TX_SIZES == 4
-    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-    128 },
+    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 128 },
 #endif
   }, {
-    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-      128 },
-    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-      128 },
-    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-      128 },
+    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 128 },
 #if EXT_TX_SIZES == 4
-    { 12, 160, 16, 144, 160, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-      128 },
+    { 12, 15, 15, 160, 16, 144, 160, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 128 },
 #endif
   }, {
-    { 12, 112, 128, 128, 128, 128, 128, 128, 128 },
-    { 12, 112, 128, 128, 128, 128, 128, 128, 128 },
-    { 12, 112, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 15, 15, 112, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 15, 15, 112, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 15, 15, 112, 128, 128, 128, 128, 128, 128, 128 },
 #if EXT_TX_SIZES == 4
-    { 12, 160, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 15, 15, 160, 128, 128, 128, 128, 128, 128, 128 },
 #endif
   }, {
     { 12, },
diff --git a/vp10/common/enums.h b/vp10/common/enums.h
index 2233649..4e3a5b1 100644
--- a/vp10/common/enums.h
+++ b/vp10/common/enums.h
@@ -108,6 +108,8 @@
   FLIPADST_DST = 14,
   DST_DST = 15,
   IDTX = 16,
+  V_DCT = 17,
+  H_DCT = 18,
 #endif  // CONFIG_EXT_TX
   TX_TYPES,
 } TX_TYPE;
diff --git a/vp10/common/filter.c b/vp10/common/filter.c
index 2023cd6..36a17a8 100644
--- a/vp10/common/filter.c
+++ b/vp10/common/filter.c
@@ -77,6 +77,7 @@
   {0,   1,  -3,   8, 126,  -5,   1, 0},
 };
 
+#if CONFIG_EXT_INTRA
 DECLARE_ALIGNED(256, static const InterpKernel,
                 sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
   // intfilt 0.8
@@ -97,6 +98,28 @@
   {-2,   4,  -7,  18, 124, -12,   5, -2},
   {-1,   2,  -4,   9, 127,  -6,   2, -1},
 };
+#endif  // CONFIG_EXT_INTRA
+
+DECLARE_ALIGNED(256, static const int16_t,
+                sub_pel_filters_10sharp[SUBPEL_SHIFTS][10]) = {
+  // intfilt 0.77
+  {0,   0,   0,   0, 128,   0,   0,   0,   0, 0},
+  {0,  -1,   3,  -6, 127,   8,  -4,   2,  -1, 0},
+  {1,  -2,   5, -12, 124,  18,  -7,   3,  -2, 0},
+  {1,  -3,   7, -17, 119,  28, -11,   5,  -2, 1},
+  {1,  -4,   8, -20, 114,  38, -14,   7,  -3, 1},
+  {1,  -4,   9, -22, 107,  49, -17,   8,  -4, 1},
+  {2,  -5,  10, -24,  99,  59, -20,   9,  -4, 2},
+  {2,  -5,  10, -24,  90,  70, -22,  10,  -5, 2},
+  {2,  -5,  10, -23,  80,  80, -23,  10,  -5, 2},
+  {2,  -5,  10, -22,  70,  90, -24,  10,  -5, 2},
+  {2,  -4,   9, -20,  59,  99, -24,  10,  -5, 2},
+  {1,  -4,   8, -17,  49, 107, -22,   9,  -4, 1},
+  {1,  -3,   7, -14,  38, 114, -20,   8,  -4, 1},
+  {1,  -2,   5, -11,  28, 119, -17,   7,  -3, 1},
+  {0,  -2,   3,  -7,  18, 124, -12,   5,  -2, 1},
+  {0,  -1,   2,  -4,   8, 127,  -6,   3,  -1, 0},
+};
 
 #if SWITCHABLE_FILTERS >= 4
 DECLARE_ALIGNED(256, static const InterpKernel,
@@ -145,23 +168,23 @@
 #if SWITCHABLE_FILTERS == 5
 DECLARE_ALIGNED(16, static const int16_t,
                 sub_pel_filters_12sharp[SUBPEL_SHIFTS][12]) = {
-  // intfilt 0.8
+  // intfilt 0.85
   {0,   0,   0,   0,   0, 128,   0,   0,   0,   0,   0, 0},
-  {0,   1,  -1,   3,  -7, 127,   8,  -4,   2,  -1,   0, 0},
-  {0,   1,  -3,   5, -12, 124,  18,  -8,   4,  -2,   1, 0},
-  {-1,   2,  -4,   8, -17, 120,  28, -11,   6,  -3,   1, -1},
-  {-1,   2,  -4,  10, -21, 114,  38, -15,   8,  -4,   2, -1},
-  {-1,   3,  -5,  11, -23, 107,  49, -18,   9,  -5,   2, -1},
-  {-1,   3,  -6,  12, -25,  99,  60, -21,  11,  -6,   3, -1},
-  {-1,   3,  -6,  12, -25,  90,  70, -23,  12,  -6,   3, -1},
-  {-1,   3,  -6,  12, -24,  80,  80, -24,  12,  -6,   3, -1},
-  {-1,   3,  -6,  12, -23,  70,  90, -25,  12,  -6,   3, -1},
-  {-1,   3,  -6,  11, -21,  60,  99, -25,  12,  -6,   3, -1},
-  {-1,   2,  -5,   9, -18,  49, 107, -23,  11,  -5,   3, -1},
-  {-1,   2,  -4,   8, -15,  38, 114, -21,  10,  -4,   2, -1},
-  {-1,   1,  -3,   6, -11,  28, 120, -17,   8,  -4,   2, -1},
-  {0,   1,  -2,   4,  -8,  18, 124, -12,   5,  -3,   1, 0},
-  {0,   0,  -1,   2,  -4,   8, 127,  -7,   3,  -1,   1, 0},
+  {0,   1,  -2,   3,  -7, 127,   8,  -4,   2,  -1,   1, 0},
+  {-1,   2,  -3,   6, -13, 124,  18,  -8,   4,  -2,   2, -1},
+  {-1,   3,  -4,   8, -18, 120,  28, -12,   7,  -4,   2, -1},
+  {-1,   3,  -6,  10, -21, 115,  38, -15,   8,  -5,   3, -1},
+  {-2,   4,  -6,  12, -24, 108,  49, -18,  10,  -6,   3, -2},
+  {-2,   4,  -7,  13, -25, 100,  60, -21,  11,  -7,   4, -2},
+  {-2,   4,  -7,  13, -26,  91,  71, -24,  13,  -7,   4, -2},
+  {-2,   4,  -7,  13, -25,  81,  81, -25,  13,  -7,   4, -2},
+  {-2,   4,  -7,  13, -24,  71,  91, -26,  13,  -7,   4, -2},
+  {-2,   4,  -7,  11, -21,  60, 100, -25,  13,  -7,   4, -2},
+  {-2,   3,  -6,  10, -18,  49, 108, -24,  12,  -6,   4, -2},
+  {-1,   3,  -5,   8, -15,  38, 115, -21,  10,  -6,   3, -1},
+  {-1,   2,  -4,   7, -12,  28, 120, -18,   8,  -4,   3, -1},
+  {-1,   2,  -2,   4,  -8,  18, 124, -13,   6,  -3,   2, -1},
+  {0,   1,  -1,   2,  -4,   8, 127,  -7,   3,  -2,   1, 0},
 };
 #endif
 
@@ -245,7 +268,7 @@
 vp10_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
   {(const int16_t*)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS},
   {(const int16_t*)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS},
-  {(const int16_t*)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS},
+  {(const int16_t*)sub_pel_filters_10sharp, 10, SUBPEL_SHIFTS},
 #if SWITCHABLE_FILTERS >= 4
   {(const int16_t*)sub_pel_filters_8smooth2, SUBPEL_TAPS, SUBPEL_SHIFTS},
 #endif
@@ -266,7 +289,7 @@
 
 #if USE_TEMPORALFILTER_12TAP
 static const InterpFilterParams vp10_interp_temporalfilter_12tap = {
-    (const int16_t*)sub_pel_filters_temporalfilter_12, 12, SUBPEL_SHIFTS
+  (const int16_t*)sub_pel_filters_temporalfilter_12, 12, SUBPEL_SHIFTS
 };
 #endif  // USE_TEMPORALFILTER_12TAP
 
diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index dbb50fb..a941f64 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -326,11 +326,79 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_EXT_TX
 
-// Inverse identiy transform and add.
+// Inverse identity transform and add.
 static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                           int bs) {
+                           int bs, int tx_type) {
   int r, c;
   const int shift = bs < 32 ? 3 : 2;
+
+  tran_low_t temp_in[32], temp_out[32];
+  transform_2d ht = {idct4_c, idct4_c};
+  int out_scale = 1;
+  int coeff_stride = 0;
+
+  switch (bs) {
+    case 4:
+      ht.cols = idct4_c;
+      ht.rows = idct4_c;
+      out_scale = cospi_16_64 >> 3;
+      coeff_stride = 4;
+      break;
+    case 8:
+      ht.cols = idct8_c;
+      ht.rows = idct8_c;
+      out_scale = (1 << (DCT_CONST_BITS - 4));
+      coeff_stride = 8;
+      break;
+    case 16:
+      ht.cols = idct16_c;
+      ht.rows = idct16_c;
+      out_scale = cospi_16_64 >> 4;
+      coeff_stride = 16;
+      break;
+    case 32:
+      ht.cols = idct32_c;
+      ht.rows = idct32_c;
+      out_scale = (1 << (DCT_CONST_BITS - 4));
+      coeff_stride = 32;
+      break;
+    default:
+      assert(0);
+  }
+
+  // Columns
+  if (tx_type == V_DCT) {
+    for (c = 0; c < bs; ++c) {
+      for (r = 0; r < bs; ++r)
+        temp_in[r] = input[r * coeff_stride + c];
+      ht.cols(temp_in, temp_out);
+
+      for (r = 0; r < bs; ++r) {
+        tran_high_t temp = (tran_high_t)temp_out[r] * out_scale;
+        temp >>= DCT_CONST_BITS;
+        dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
+                                              (tran_low_t)temp);
+      }
+    }
+    return;
+  }
+
+  if (tx_type == H_DCT) {
+    for (r = 0; r < bs; ++r) {
+      for (c = 0; c < bs; ++c)
+        temp_in[c] = input[r * coeff_stride + c];
+      ht.rows(temp_in, temp_out);
+
+      for (c = 0; c < bs; ++c) {
+        tran_high_t temp = (tran_high_t)temp_out[c] * out_scale;
+        temp >>= DCT_CONST_BITS;
+        dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
+                                              (tran_low_t)temp);
+      }
+    }
+    return;
+  }
+
   for (r = 0; r < bs; ++r) {
     for (c = 0; c < bs; ++c)
       dest[c] = clip_pixel_add(dest[c], input[c] >> shift);
@@ -360,6 +428,8 @@
     case DST_DCT:
     case DST_ADST:
     case ADST_DST:
+    case V_DCT:
+    case H_DCT:
       break;
     case FLIPADST_DCT:
     case FLIPADST_ADST:
@@ -1031,8 +1101,10 @@
       // Use C version since DST only exists in C code
       vp10_iht4x4_16_add_c(input, dest, stride, tx_type);
       break;
+    case H_DCT:
+    case V_DCT:
     case IDTX:
-      inv_idtx_add_c(input, dest, stride, 4);
+      inv_idtx_add_c(input, dest, stride, 4, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -1070,8 +1142,10 @@
       // Use C version since DST only exists in C code
       vp10_iht8x8_64_add_c(input, dest, stride, tx_type);
       break;
+    case H_DCT:
+    case V_DCT:
     case IDTX:
-      inv_idtx_add_c(input, dest, stride, 8);
+      inv_idtx_add_c(input, dest, stride, 8, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -1109,8 +1183,10 @@
       // Use C version since DST only exists in C code
       vp10_iht16x16_256_add_c(input, dest, stride, tx_type);
       break;
+    case H_DCT:
+    case V_DCT:
     case IDTX:
-      inv_idtx_add_c(input, dest, stride, 16);
+      inv_idtx_add_c(input, dest, stride, 16, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -1143,8 +1219,10 @@
     case DST_FLIPADST:
       vp10_iht32x32_1024_add_c(input, dest, stride, tx_type);
       break;
+    case H_DCT:
+    case V_DCT:
     case IDTX:
-      inv_idtx_add_c(input, dest, stride, 32);
+      inv_idtx_add_c(input, dest, stride, 32, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
diff --git a/vp10/common/mvref_common.c b/vp10/common/mvref_common.c
index 5a2def0..c67beed 100644
--- a/vp10/common/mvref_common.c
+++ b/vp10/common/mvref_common.c
@@ -17,6 +17,7 @@
                                     const MV_REFERENCE_FRAME rf[2],
                                     uint8_t *refmv_count,
                                     CANDIDATE_MV *ref_mv_stack,
+                                    const int use_hp,
                                     int len, int block, int col) {
   const int weight = len;
   int index = 0, ref;
@@ -28,6 +29,8 @@
       if (candidate->ref_frame[ref] == rf[0]) {
         int_mv this_refmv =
             get_sub_block_mv(candidate_mi, ref, col, block);
+        lower_mv_precision(&this_refmv.as_mv, use_hp);
+
         for (index = 0; index < *refmv_count; ++index)
           if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int)
             break;
@@ -168,8 +171,9 @@
                              num_8x8_blocks_wide_lookup[candidate->sb_type]);
 
       newmv_count += add_ref_mv_candidate(candidate_mi, candidate, rf,
-                                          refmv_count, ref_mv_stack, len,
-                                          block, mi_pos.col);
+                                          refmv_count, ref_mv_stack,
+                                          cm->allow_high_precision_mv,
+                                          len, block, mi_pos.col);
       i += len;
     } else {
       ++i;
@@ -202,8 +206,9 @@
                        num_8x8_blocks_high_lookup[candidate->sb_type]);
 
       newmv_count += add_ref_mv_candidate(candidate_mi, candidate, rf,
-                                          refmv_count, ref_mv_stack, len,
-                                          block, mi_pos.col);
+                                          refmv_count, ref_mv_stack,
+                                          cm->allow_high_precision_mv,
+                                          len, block, mi_pos.col);
       i += len;
     } else {
       ++i;
@@ -234,8 +239,9 @@
     const int len = 1;
 
     newmv_count += add_ref_mv_candidate(candidate_mi, candidate, rf,
-                                        refmv_count, ref_mv_stack, len,
-                                        block, mi_pos.col);
+                                        refmv_count, ref_mv_stack,
+                                        cm->allow_high_precision_mv,
+                                        len, block, mi_pos.col);
   }  // Analyze a single 8x8 block motion information.
   return newmv_count;
 }
@@ -355,9 +361,12 @@
 
         for (ref = 0; ref < 2; ++ref) {
           if (prev_frame_mvs->ref_frame[ref] == ref_frame) {
+            int_mv this_refmv = prev_frame_mvs->mv[ref];
+            lower_mv_precision(&this_refmv.as_mv,
+                               cm->allow_high_precision_mv);
+
             for (idx = 0; idx < *refmv_count; ++idx)
-              if (prev_frame_mvs->mv[ref].as_int ==
-                  ref_mv_stack[idx].this_mv.as_int)
+              if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int)
                 break;
 
             if (idx < *refmv_count)
@@ -365,7 +374,7 @@
 
             if (idx == *refmv_count &&
                 *refmv_count < MAX_REF_MV_STACK_SIZE) {
-              ref_mv_stack[idx].this_mv.as_int = prev_frame_mvs->mv[ref].as_int;
+              ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
               ref_mv_stack[idx].weight = 2;
               ++(*refmv_count);
 
diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index 97046bb..72e6ae0 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c
@@ -792,7 +792,7 @@
 
         for (row = 0; row < bh; ++row) {
           for (col = 0; col < bw; ++col)
-            dst16[col] = (mask[0][row] * dst16[col] + mask[1][row] * tmp16[col]
+            dst16[col] = (mask[0][col] * dst16[col] + mask[1][col] * tmp16[col]
                           + 32) >> 6;
           dst16 += dst_stride;
           tmp16 += tmp_stride;
diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index 76d50c6..e28f01c 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -1345,7 +1345,7 @@
   const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const int right_available =
-      mi_col + (bw >> !pd->subsampling_x) < xd->tile.mi_col_end;
+      mi_col + (1 << mi_width_log2_lookup[bsize]) < xd->tile.mi_col_end;
   const int have_right = vp10_has_right(bsize, mi_row, mi_col,
                                         right_available,
                                         tx_size, row_off, col_off,
diff --git a/vp10/common/scan.c b/vp10/common/scan.c
index 672ac1d..21d291f 100644
--- a/vp10/common/scan.c
+++ b/vp10/common/scan.c
@@ -1790,6 +1790,8 @@
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {row_scan_4x4,     vp10_row_iscan_4x4,     row_scan_4x4_neighbors},
+    {col_scan_4x4,     vp10_col_iscan_4x4,     col_scan_4x4_neighbors},
   }, {  // TX_8X8
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {row_scan_8x8,     vp10_row_iscan_8x8,     row_scan_8x8_neighbors},
@@ -1808,6 +1810,8 @@
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {row_scan_8x8,     vp10_row_iscan_8x8,     row_scan_8x8_neighbors},
+    {col_scan_8x8,     vp10_col_iscan_8x8,     col_scan_8x8_neighbors},
   }, {  // TX_16X16
     {default_scan_16x16, vp10_default_iscan_16x16,
      default_scan_16x16_neighbors},
@@ -1841,6 +1845,8 @@
      default_scan_16x16_neighbors},
     {default_scan_16x16, vp10_default_iscan_16x16,
      default_scan_16x16_neighbors},
+     {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
+     {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
   }, {  // TX_32X32
     {default_scan_32x32, vp10_default_iscan_32x32,
      default_scan_32x32_neighbors},
@@ -1876,6 +1882,10 @@
      qtr_scan_32x32_neighbors},
     {default_scan_32x32, vp10_default_iscan_32x32,
      default_scan_32x32_neighbors},
+     {h2_scan_32x32, vp10_h2_iscan_32x32,
+      h2_scan_32x32_neighbors},
+     {v2_scan_32x32, vp10_v2_iscan_32x32,
+      v2_scan_32x32_neighbors},
   }
 };
 
@@ -1898,6 +1908,8 @@
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {row_scan_4x4,     vp10_row_iscan_4x4,     row_scan_4x4_neighbors},
+    {col_scan_4x4,     vp10_col_iscan_4x4,     col_scan_4x4_neighbors},
   }, {  // TX_8X8
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
@@ -1916,6 +1928,8 @@
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {row_scan_8x8,     vp10_row_iscan_8x8,     row_scan_8x8_neighbors},
+    {col_scan_8x8,     vp10_col_iscan_8x8,     col_scan_8x8_neighbors},
   }, {  // TX_16X16
     {default_scan_16x16, vp10_default_iscan_16x16,
      default_scan_16x16_neighbors},
@@ -1951,6 +1965,8 @@
      default_scan_16x16_neighbors},
     {default_scan_16x16, vp10_default_iscan_16x16,
      default_scan_16x16_neighbors},
+     {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
+     {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
   }, {  // TX_32X32
     {default_scan_32x32, vp10_default_iscan_32x32,
      default_scan_32x32_neighbors},
@@ -1986,6 +2002,10 @@
      qtr_scan_32x32_neighbors},
     {default_scan_32x32, vp10_default_iscan_32x32,
      default_scan_32x32_neighbors},
+     {h2_scan_32x32, vp10_h2_iscan_32x32,
+      h2_scan_32x32_neighbors},
+     {v2_scan_32x32, vp10_v2_iscan_32x32,
+      v2_scan_32x32_neighbors},
   }
 };
 
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index c9f0295..2344ce2 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -426,6 +426,9 @@
   specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
 }
 
+add_proto qw/void vp10_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type";
+  specialize qw/vp10_fwd_idtx/;
+
 # Inverse transform
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Note as optimized versions of these functions are added we need to add a check to ensure
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index af6016a..64ac3cc 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -811,7 +811,9 @@
                   subpel_y, sf, w, h, ref, interp_filter, xs, ys);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }
+#endif  // (CONFIG_SUPERTX || CONFIG_OBMC)
 
+#if CONFIG_SUPERTX
 static void dec_build_inter_predictors_sb(VP10Decoder *const pbi,
                                           MACROBLOCKD *xd,
                                           int mi_row, int mi_col) {
@@ -881,9 +883,7 @@
                                      sb_type);
 #endif  // CONFIG_EXT_INTER
 }
-#endif  // (CONFIG_SUPERTX || CONFIG_OBMC)
 
-#if CONFIG_SUPERTX
 static void dec_build_inter_predictors_sb_sub8x8(VP10Decoder *const pbi,
                                                  MACROBLOCKD *xd,
                                                  int mi_row, int mi_col,
diff --git a/vp10/encoder/block.h b/vp10/encoder/block.h
index 7921300..ce650b1 100644
--- a/vp10/encoder/block.h
+++ b/vp10/encoder/block.h
@@ -143,6 +143,9 @@
   uint8_t zcoeff_blk[TX_SIZES][256];
 #if CONFIG_VAR_TX
   uint8_t blk_skip[MAX_MB_PLANE][256];
+#if CONFIG_REF_MV
+  uint8_t blk_skip_drl[MAX_MB_PLANE][256];
+#endif
 #endif
 
   int skip;
diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index 333adbb..31a4c87 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -1315,6 +1315,8 @@
     case DST_DCT:
     case DST_ADST:
     case ADST_DST:
+    case H_DCT:
+    case V_DCT:
       break;
     case FLIPADST_DCT:
     case FLIPADST_ADST:
@@ -1758,6 +1760,95 @@
   }
 }
 
+// Forward identity-family transform of a bs x bs residual block.
+//   src_diff : residual samples, 'stride' elements between rows.
+//   coeff    : output, written densely with bs elements per row.
+//   bs       : block size; 4, 8, 16 and 32 are supported.
+//   tx_type  : V_DCT runs a 1-D DCT down every column, H_DCT runs it along
+//              every row; any other value (IDTX) only scales the samples.
+void vp10_fwd_idtx_c(const int16_t *src_diff,
+                     tran_low_t *coeff, int stride,
+                     int bs, int tx_type) {
+  const int shift = bs < 32 ? 3 : 2;
+  tran_low_t temp_in[32], temp_out[32];
+  transform_2d ht = {fdct4, fdct4};
+  int in_scale = 1;
+  int out_scale = 1;
+  int r, c;
+
+  assert(bs == 4 || bs == 8 || bs == 16 || bs == 32);
+
+  if (tx_type != V_DCT && tx_type != H_DCT) {
+    // Plain identity: scale each sample by 2^shift. Multiplying instead of
+    // left-shifting keeps negative residuals well defined in C.
+    for (r = 0; r < bs; ++r) {
+      for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] * (1 << shift);
+      src_diff += stride;
+      coeff += bs;
+    }
+    return;
+  }
+
+  // Pick the 1-D DCT and the fixed-point scaling for this block size.
+  switch (bs) {
+    case 4:
+      ht.cols = fdct4;
+      ht.rows = fdct4;
+      in_scale = 16;
+      out_scale = cospi_16_64 >> 1;
+      break;
+    case 8:
+      ht.cols = fdct8;
+      ht.rows = fdct8;
+      in_scale = 4;
+      out_scale = (1 << DCT_CONST_BITS);
+      break;
+    case 16:
+      ht.cols = fdct16;
+      ht.rows = fdct16;
+      in_scale = 4;
+      out_scale = cospi_16_64;
+      break;
+    case 32:
+      ht.cols = fdct32;
+      ht.rows = fdct32;
+      in_scale = 4;
+      out_scale = (1 << (DCT_CONST_BITS - 2));
+      break;
+    default:
+      // Unsupported size: bail out rather than overrun temp_in/temp_out.
+      return;
+  }
+
+  if (tx_type == V_DCT) {
+    // Columns: transform down each column; nothing is applied across rows.
+    for (c = 0; c < bs; ++c) {
+      for (r = 0; r < bs; ++r)
+        temp_in[r] = src_diff[r * stride + c] * in_scale;
+      ht.cols(temp_in, temp_out);
+
+      for (r = 0; r < bs; ++r) {
+        tran_high_t temp = (tran_high_t)temp_out[r] * out_scale;
+        temp >>= DCT_CONST_BITS;
+        coeff[r * bs + c] = (tran_low_t)temp;
+      }
+    }
+  } else {
+    // Rows (H_DCT): transform along each row; nothing is applied down columns.
+    for (r = 0; r < bs; ++r) {
+      for (c = 0; c < bs; ++c)
+        temp_in[c] = src_diff[r * stride + c] * in_scale;
+      ht.rows(temp_in, temp_out);
+
+      for (c = 0; c < bs; ++c) {
+        tran_high_t temp = (tran_high_t)temp_out[c] * out_scale;
+        temp >>= DCT_CONST_BITS;
+        coeff[r * bs + c] = (tran_low_t)temp;
+      }
+    }
+  }
+}
+
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp10_highbd_fht32x32_c(const int16_t *input, tran_low_t *output,
                             int stride, int tx_type) {
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index d3ea94b..8c7af63 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -3987,6 +3987,9 @@
   vp10_zero(x->skip_txfm);
 #if CONFIG_VAR_TX
   vp10_zero(x->blk_skip);
+#if CONFIG_REF_MV
+  vp10_zero(x->blk_skip_drl);
+#endif
 #endif
 
   {
diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
index 03d9c6d..029240f 100644
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -33,21 +33,6 @@
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if CONFIG_EXT_TX
-// Forward identity transform.
-static void fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
-                       int bs) {
-  int r, c;
-  const int shift = bs < 32 ? 3 : 2;
-
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] << shift;
-    src_diff += stride;
-    coeff += bs;
-  }
-}
-#endif  // CONFIG_EXT_TX
-
 void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
                        int diff_stride, TX_TYPE tx_type, int lossless) {
   if (lossless) {
@@ -78,8 +63,10 @@
     case FLIPADST_DST:
       vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
       break;
+    case H_DCT:
+    case V_DCT:
     case IDTX:
-      fwd_idtx_c(src_diff, coeff, diff_stride, 4);
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -116,8 +103,10 @@
     case FLIPADST_DST:
       vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
       break;
+    case H_DCT:
+    case V_DCT:
     case IDTX:
-      fwd_idtx_c(src_diff, coeff, diff_stride, 8);
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -157,8 +146,10 @@
       // Use C version since DST exists only in C
       vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
       break;
+    case H_DCT:
+    case V_DCT:
     case IDTX:
-      fwd_idtx_c(src_diff, coeff, diff_stride, 16);
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -195,8 +186,10 @@
     case FLIPADST_DST:
       vp10_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
       break;
+    case H_DCT:
+    case V_DCT:
     case IDTX:
-      fwd_idtx_c(src_diff, coeff, diff_stride, 32);
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -240,7 +233,7 @@
       vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
-      fwd_idtx_c(src_diff, coeff, diff_stride, 4);
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -282,7 +275,7 @@
       vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
-      fwd_idtx_c(src_diff, coeff, diff_stride, 8);
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -324,7 +317,7 @@
       vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
-      fwd_idtx_c(src_diff, coeff, diff_stride, 16);
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
@@ -362,7 +355,7 @@
       vp10_highbd_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
-      fwd_idtx_c(src_diff, coeff, diff_stride, 32);
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default:
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 96edc0f..b86f6c0 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -354,6 +354,126 @@
   }
 }
 
+#if CONFIG_EXT_TX
+typedef enum {  // 1-D transform kinds; used as bit indices into 'prune'
+  DCT_1D = 0,
+  ADST_1D = 1,
+  FLIPADST_1D = 2,
+  DST_1D = 3,
+  TX_TYPES_1D = 4,  // count; offset added to htx_tab entries for 'prune'
+} TX_TYPE_1D;
+
+// Currently ignores every argument (each is only cast to void) and always
+// returns 3.
+static int prune_two_for_sby(const VP10_COMP *cpi, BLOCK_SIZE bsize,
+                             MACROBLOCK *x, MACROBLOCKD *xd) {
+  (void) xd;
+  (void) x;
+  (void) bsize;
+  (void) cpi;
+  return 3;
+}
+
+// Currently ignores every argument (each is only cast to void) and always
+// returns 7.
+static int prune_three_for_sby(const VP10_COMP *cpi, BLOCK_SIZE bsize,
+                               MACROBLOCK *x, MACROBLOCKD *xd) {
+  (void) xd;
+  (void) x;
+  (void) bsize;
+  (void) cpi;
+  return 7;
+}
+
+#endif  // CONFIG_EXT_TX
+
+// Currently ignores every argument (each is only cast to void) and always
+// returns 1.
+static int prune_one_for_sby(const VP10_COMP *cpi, BLOCK_SIZE bsize,
+                             MACROBLOCK *x, MACROBLOCKD *xd) {
+  (void) xd;
+  (void) x;
+  (void) bsize;
+  (void) cpi;
+  return 1;
+}
+
+// Returns the prune value for the current block by dispatching on the
+// encoder speed feature cpi->sf.tx_type_search: NO_PRUNE gives 0, every
+// other level forwards to its prune_*_for_sby() helper. A level that is not
+// listed trips the assert (and gives 0 when asserts are compiled out).
+static int prune_tx_types(const VP10_COMP *cpi, BLOCK_SIZE bsize,
+                          MACROBLOCK *x, MACROBLOCKD *xd) {
+  int prune_mask = 0;
+  switch (cpi->sf.tx_type_search) {
+    case NO_PRUNE:
+      return prune_mask;
+    case PRUNE_ONE:
+      return prune_one_for_sby(cpi, bsize, x, xd);
+#if CONFIG_EXT_TX
+    case PRUNE_TWO:
+      return prune_two_for_sby(cpi, bsize, x, xd);
+    case PRUNE_THREE:
+      return prune_three_for_sby(cpi, bsize, x, xd);
+#endif  // CONFIG_EXT_TX
+  }
+  // Not reached for any TX_TYPE_SEARCH value handled above.
+  assert(0);
+  return prune_mask;
+}
+
+// Decides whether 'tx_type' is still evaluated under the bitmask 'prune'.
+// Returns 1 to search the type and 0 to skip it. A type is skipped when the
+// 'prune' bit selected by vtx_tab, or the one selected by htx_tab (offset by
+// TX_TYPES_1D), is set. IDTX is always searched, and without CONFIG_EXT_TX
+// every type is searched.
+static int do_tx_type_search(TX_TYPE tx_type, int prune) {
+// TODO(sarahparker) implement for non ext tx
+#if CONFIG_EXT_TX
+  // Lookup tables indexed by TX_TYPE; they are only read, hence const.
+  // vtx_tab selects the bit tested in prune[0 .. TX_TYPES_1D - 1].
+  // NOTE(review): only 16 initializers are listed, so any entries past the
+  // 16th are zero-initialized, i.e. DCT_1D. Confirm this is intended if
+  // TX_TYPES exceeds 16.
+  static const TX_TYPE_1D vtx_tab[TX_TYPES] = {
+    DCT_1D,      ADST_1D,
+    DCT_1D,      ADST_1D,
+    FLIPADST_1D, DCT_1D,
+    FLIPADST_1D, ADST_1D,
+    FLIPADST_1D, DST_1D,
+    DCT_1D,      DST_1D,
+    ADST_1D,     DST_1D,
+    FLIPADST_1D, DST_1D,
+  };
+
+  // htx_tab selects the bit tested in the next TX_TYPES_1D bits of 'prune'.
+  static const TX_TYPE_1D htx_tab[TX_TYPES] = {
+    DCT_1D,      DCT_1D,
+    ADST_1D,     ADST_1D,
+    DCT_1D,      FLIPADST_1D,
+    FLIPADST_1D, FLIPADST_1D,
+    ADST_1D,     DCT_1D,
+    DST_1D,      ADST_1D,
+    DST_1D,      FLIPADST_1D,
+    DST_1D,      DST_1D,
+  };
+
+  if (tx_type == IDTX)
+    return 1;
+  {
+    // The type survives only if neither of its two flag bits is set.
+    const int v_pruned = (prune >> vtx_tab[tx_type]) & 1;
+    const int h_pruned = (prune >> (htx_tab[tx_type] + TX_TYPES_1D)) & 1;
+    return !(v_pruned | h_pruned);
+  }
+#else
+  // temporary to avoid compiler warnings
+  (void) tx_type;
+  (void) prune;
+  return 1;
+#endif
+}
+
 static void model_rd_for_sb(VP10_COMP *cpi, BLOCK_SIZE bsize,
                             MACROBLOCK *x, MACROBLOCKD *xd,
                             int *out_rate_sum, int64_t *out_dist_sum,
@@ -394,8 +514,10 @@
     // low enough so that we can skip the mode search.
     const int64_t low_dc_thr = VPXMIN(50, dc_thr >> 2);
     const int64_t low_ac_thr = VPXMIN(80, ac_thr >> 2);
-    int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
-    int bh = 1 << (b_height_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
+    int bw_shift = (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
+    int bh_shift = (b_height_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
+    int bw = 1 << bw_shift;
+    int bh = 1 << bh_shift;
     int idx, idy;
     int lw = b_width_log2_lookup[unit_size] + 2;
     int lh = b_height_log2_lookup[unit_size] + 2;
@@ -406,7 +528,7 @@
       for (idx = 0; idx < bw; ++idx) {
         uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
         uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lh);
-        int block_idx = (idy << 1) + idx;
+        int block_idx = (idy << bw_shift) + idx;
         int low_err_skip = 0;
 
         var = cpi->fn_ptr[unit_size].vf(src, p->src.stride,
@@ -988,11 +1110,14 @@
   vpx_prob skip_prob = vp10_get_skip_prob(cm, xd);
   int  s0 = vp10_cost_bit(skip_prob, 0);
   int  s1 = vp10_cost_bit(skip_prob, 1);
+  const int is_inter = is_inter_block(mbmi);
+  int prune = 0;
 #if CONFIG_EXT_TX
   int ext_tx_set;
 #endif  // CONFIG_EXT_TX
-  const int is_inter = is_inter_block(mbmi);
 
+  if (is_inter && cpi->sf.tx_type_search > 0)
+    prune = prune_tx_types(cpi, bs, x, xd);
   mbmi->tx_size = VPXMIN(max_tx_size, largest_tx_size);
 
 #if CONFIG_EXT_TX
@@ -1004,6 +1129,15 @@
       if (is_inter) {
         if (!ext_tx_used_inter[ext_tx_set][tx_type])
           continue;
+        if (cpi->sf.tx_type_search > 0) {
+          if (!do_tx_type_search(tx_type, prune))
+            continue;
+        } else if (ext_tx_set == 1 &&
+                   tx_type >= DST_ADST && tx_type < IDTX &&
+                   best_tx_type == DCT_DCT) {
+          tx_type = IDTX - 1;
+          continue;
+        }
       } else {
         if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
           if (tx_type != intra_mode_to_tx_type_context[mbmi->mode])
@@ -1011,15 +1145,15 @@
         }
         if (!ext_tx_used_intra[ext_tx_set][tx_type])
           continue;
+        if (ext_tx_set == 1 &&
+            tx_type >= DST_ADST && tx_type < IDTX &&
+            best_tx_type == DCT_DCT) {
+          tx_type = IDTX - 1;
+          continue;
+        }
       }
 
       mbmi->tx_type = tx_type;
-      if (ext_tx_set == 1 &&
-          mbmi->tx_type >= DST_ADST && mbmi->tx_type < IDTX &&
-          best_tx_type == DCT_DCT) {
-        tx_type = IDTX - 1;
-        continue;
-      }
 
       txfm_rd_in_plane(x,
                        cpi,
@@ -1067,12 +1201,15 @@
                        cpi->sf.use_fast_coef_costing);
       if (r == INT_MAX)
         continue;
-      if (is_inter)
+      if (is_inter) {
         r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
-      else
+        if (cpi->sf.tx_type_search > 0 && !do_tx_type_search(tx_type, prune))
+            continue;
+      } else {
         r += cpi->intra_tx_type_costs[mbmi->tx_size]
                                      [intra_mode_to_tx_type_context[mbmi->mode]]
                                      [mbmi->tx_type];
+      }
       if (s)
         this_rd = RDCOST(x->rdmult, x->rddiv, s1, psse);
       else
@@ -1150,13 +1287,17 @@
   TX_SIZE best_tx = max_tx_size;
   int start_tx, end_tx;
   const int tx_select = cm->tx_mode == TX_MODE_SELECT;
-  TX_TYPE tx_type, best_tx_type = DCT_DCT;
   const int is_inter = is_inter_block(mbmi);
+  const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
+  TX_TYPE tx_type, best_tx_type = DCT_DCT;
+  int prune = 0;
 #if CONFIG_EXT_TX
   int ext_tx_set;
 #endif  // CONFIG_EXT_TX
 
-  const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
+  if (is_inter && cpi->sf.tx_type_search > 0)
+    prune = prune_tx_types(cpi, bs, x, xd);
+
   assert(skip_prob > 0);
   s0 = vp10_cost_bit(skip_prob, 0);
   s1 = vp10_cost_bit(skip_prob, 1);
@@ -1187,6 +1328,15 @@
       if (is_inter) {
         if (!ext_tx_used_inter[ext_tx_set][tx_type])
           continue;
+        if (cpi->sf.tx_type_search > 0) {
+          if (!do_tx_type_search(tx_type, prune))
+            continue;
+        } else if (ext_tx_set == 1 &&
+                   tx_type >= DST_ADST && tx_type < IDTX &&
+                   best_tx_type == DCT_DCT) {
+          tx_type = IDTX - 1;
+          continue;
+        }
       } else {
         if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
           if (tx_type != intra_mode_to_tx_type_context[mbmi->mode])
@@ -1194,14 +1344,14 @@
         }
         if (!ext_tx_used_intra[ext_tx_set][tx_type])
           continue;
+        if (ext_tx_set == 1 &&
+            tx_type >= DST_ADST && tx_type < IDTX &&
+            best_tx_type == DCT_DCT) {
+          tx_type = IDTX - 1;
+          break;
+        }
       }
       mbmi->tx_type = tx_type;
-      if (ext_tx_set == 1 &&
-          mbmi->tx_type >= DST_ADST && mbmi->tx_type < IDTX &&
-          best_tx_type == DCT_DCT) {
-        tx_type = IDTX - 1;
-        break;
-      }
       txfm_rd_in_plane(x,
                        cpi,
                        &r, &d, &s,
@@ -1233,12 +1383,15 @@
       if (n < TX_32X32 &&
           !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
           r != INT_MAX && !FIXED_TX_TYPE) {
-        if (is_inter)
+        if (is_inter) {
           r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
-        else
+          if (cpi->sf.tx_type_search > 0 && !do_tx_type_search(tx_type, prune))
+              continue;
+        } else {
           r += cpi->intra_tx_type_costs[mbmi->tx_size]
               [intra_mode_to_tx_type_context[mbmi->mode]]
               [mbmi->tx_type];
+        }
       }
 #endif  // CONFIG_EXT_TX
 
@@ -2803,9 +2956,13 @@
   uint8_t best_blk_skip[256];
   const int n4 = 1 << (num_pels_log2_lookup[bsize] - 4);
   int idx, idy;
+  int prune = 0;
 #if CONFIG_EXT_TX
   int ext_tx_set = get_ext_tx_set(max_tx_size, bsize, is_inter);
-#endif
+#endif  // CONFIG_EXT_TX
+
+  if (is_inter && cpi->sf.tx_type_search > 0)
+    prune = prune_tx_types(cpi, bsize, x, xd);
 
   *distortion = INT64_MAX;
   *rate       = INT_MAX;
@@ -2821,6 +2978,15 @@
     if (is_inter) {
       if (!ext_tx_used_inter[ext_tx_set][tx_type])
         continue;
+      if (cpi->sf.tx_type_search > 0) {
+        if (!do_tx_type_search(tx_type, prune))
+          continue;
+      } else if (ext_tx_set == 1 &&
+                 tx_type >= DST_ADST && tx_type < IDTX &&
+                 best_tx_type == DCT_DCT) {
+        tx_type = IDTX - 1;
+        continue;
+      }
     } else {
       if (!ALLOW_INTRA_EXT_TX && bsize >= BLOCK_8X8) {
         if (tx_type != intra_mode_to_tx_type_context[mbmi->mode])
@@ -2828,17 +2994,16 @@
       }
       if (!ext_tx_used_intra[ext_tx_set][tx_type])
         continue;
+      if (ext_tx_set == 1 &&
+          tx_type >= DST_ADST && tx_type < IDTX &&
+          best_tx_type == DCT_DCT) {
+        tx_type = IDTX - 1;
+        break;
+      }
     }
 
     mbmi->tx_type = tx_type;
 
-    if (ext_tx_set == 1 &&
-        mbmi->tx_type >= DST_ADST && mbmi->tx_type < IDTX &&
-        best_tx_type == DCT_DCT) {
-      tx_type = IDTX - 1;
-      break;
-    }
-
     inter_block_yrd(cpi, x, &this_rate, &this_dist, &this_skip, &this_sse,
                     bsize, ref_best_rd);
 
@@ -2867,12 +3032,15 @@
       if (max_tx_size < TX_32X32 &&
           !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
           this_rate != INT_MAX) {
-        if (is_inter)
+        if (is_inter) {
           this_rate += cpi->inter_tx_type_costs[max_tx_size][mbmi->tx_type];
-        else
+          if (cpi->sf.tx_type_search > 0 && !do_tx_type_search(tx_type, prune))
+              continue;
+        } else {
           this_rate += cpi->intra_tx_type_costs[max_tx_size]
               [intra_mode_to_tx_type_context[mbmi->mode]]
               [mbmi->tx_type];
+        }
       }
 #endif  // CONFIG_EXT_TX
 
@@ -7208,6 +7376,11 @@
                 rate_y - rate_uv,
                 total_sse);
         }
+#if CONFIG_VAR_TX
+        for (i = 0; i < MAX_MB_PLANE; ++i)
+          memcpy(x->blk_skip_drl[i], x->blk_skip[i],
+                 sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif
 
         for (ref_idx = 0; ref_idx < ref_set; ++ref_idx) {
           int64_t tmp_alt_rd = INT64_MAX;
@@ -7235,8 +7408,6 @@
 #else
             int_mv dummy_single_newmv[MAX_REF_FRAMES] = { { 0 } };
 #endif
-
-
             mbmi->ref_mv_idx = 1 + ref_idx;
 
             frame_mv[NEARMV][ref_frame] = cur_mv;
@@ -7299,12 +7470,22 @@
             this_rd = tmp_alt_rd;
             tmp_ref_rd = tmp_alt_rd;
             backup_mbmi = *mbmi;
+#if CONFIG_VAR_TX
+            for (i = 0; i < MAX_MB_PLANE; ++i)
+              memcpy(x->blk_skip_drl[i], x->blk_skip[i],
+                     sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif
           } else {
             *mbmi = backup_mbmi;
           }
         }
 
         frame_mv[NEARMV][ref_frame] = backup_mv;
+#if CONFIG_VAR_TX
+        for (i = 0; i < MAX_MB_PLANE; ++i)
+          memcpy(x->blk_skip[i], x->blk_skip_drl[i],
+                 sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif
       }
 #endif  // CONFIG_REF_MV
 
diff --git a/vp10/encoder/speed_features.c b/vp10/encoder/speed_features.c
index 1f70dcb..ec8acda 100644
--- a/vp10/encoder/speed_features.c
+++ b/vp10/encoder/speed_features.c
@@ -159,7 +159,7 @@
 
     sf->tx_size_search_breakout = 1;
     sf->partition_search_breakout_rate_thr = 80;
-
+    sf->tx_type_search = PRUNE_ONE;
     // Use transform domain distortion.
     // Note var-tx expt always uses pixel domain distortion.
     sf->use_transform_domain_distortion = 1;
@@ -177,6 +177,9 @@
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
     sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->allow_partition_search_skip = 1;
+#if CONFIG_EXT_TX
+    sf->tx_type_search = PRUNE_TWO;
+#endif
   }
 
   if (speed >= 3) {
@@ -195,6 +198,9 @@
     sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
     sf->adaptive_interp_filter_search = 1;
+#if CONFIG_EXT_TX
+    sf->tx_type_search = PRUNE_THREE;
+#endif
   }
 
   if (speed >= 4) {
@@ -473,6 +479,7 @@
   sf->alt_ref_search_fp = 0;
   sf->use_quant_fp = 0;
   sf->partition_search_type = SEARCH_PARTITION;
+  sf->tx_type_search = NO_PRUNE;
   sf->less_rectangular_check = 0;
   sf->use_square_partition_only = 0;
   sf->auto_min_max_partition_size = NOT_IN_USE;
diff --git a/vp10/encoder/speed_features.h b/vp10/encoder/speed_features.h
index 4f931d8..fbb6988 100644
--- a/vp10/encoder/speed_features.h
+++ b/vp10/encoder/speed_features.h
@@ -182,6 +182,18 @@
 } INTERP_FILTER_MASK;
 
 typedef enum {
+  NO_PRUNE = 0,
+  // eliminates one tx type in vertical and horizontal direction
+  PRUNE_ONE = 1,
+#if CONFIG_EXT_TX
+  // eliminates two tx types in each direction
+  PRUNE_TWO = 2,
+  // eliminates three tx types in each direction
+  PRUNE_THREE = 3,
+#endif
+} TX_TYPE_SEARCH;
+
+typedef enum {
   // Search partitions using RD criterion
   SEARCH_PARTITION,
 
@@ -298,6 +310,7 @@
 
   PARTITION_SEARCH_TYPE partition_search_type;
 
+  TX_TYPE_SEARCH tx_type_search;
   // Used if partition_search_type = FIXED_SIZE_PARTITION
   BLOCK_SIZE always_this_block_size;
 
diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_sse2.c
index 79d1e88..aaf1e6a 100644
--- a/vp10/encoder/x86/dct_sse2.c
+++ b/vp10/encoder/x86/dct_sse2.c
@@ -1635,7 +1635,7 @@
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
@@ -1839,10 +1839,10 @@
 
   v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
   v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
-  v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
-  v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
   v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
   v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
 
@@ -1872,10 +1872,10 @@
   // stage 5
   s[0] = _mm_add_epi16(p[0], t[1]);
   s[1] = _mm_sub_epi16(p[0], t[1]);
-  s[2] = _mm_add_epi16(p[3], t[2]);
-  s[3] = _mm_sub_epi16(p[3], t[2]);
-  s[4] = _mm_sub_epi16(p[4], t[5]);
-  s[5] = _mm_add_epi16(p[4], t[5]);
+  s[2] = _mm_sub_epi16(p[3], t[2]);
+  s[3] = _mm_add_epi16(p[3], t[2]);
+  s[4] = _mm_add_epi16(p[4], t[5]);
+  s[5] = _mm_sub_epi16(p[4], t[5]);
   s[6] = _mm_sub_epi16(p[7], t[6]);
   s[7] = _mm_add_epi16(p[7], t[6]);
 
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index 204cede..c500206 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -108,6 +108,22 @@
     sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
 }
 
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+// 128x128
+sadMxN(128, 128)
+sadMxNxK(128, 128, 3)
+sadMxNxK(128, 128, 8)
+sadMxNx4D(128, 128)
+
+// 128x64
+sadMxN(128, 64)
+sadMxNx4D(128, 64)
+
+// 64x128
+sadMxN(64, 128)
+sadMxNx4D(64, 128)
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 // 64x64
 sadMxN(64, 64)
 sadMxNxK(64, 64, 3)
@@ -247,6 +263,22 @@
   } \
 }
 
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+// 128x128
+highbd_sadMxN(128, 128)
+highbd_sadMxNxK(128, 128, 3)
+highbd_sadMxNxK(128, 128, 8)
+highbd_sadMxNx4D(128, 128)
+
+// 128x64
+highbd_sadMxN(128, 64)
+highbd_sadMxNx4D(128, 64)
+
+// 64x128
+highbd_sadMxN(64, 128)
+highbd_sadMxNx4D(64, 128)
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 // 64x64
 highbd_sadMxN(64, 64)
 highbd_sadMxNxK(64, 64, 3)
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index 14d7f99..169769a 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -17,17 +17,6 @@
 #include "vpx_dsp/variance.h"
 #include "vpx_dsp/vpx_filter.h"
 
-const uint8_t vpx_bilinear_filters[BIL_SUBPEL_SHIFTS][2] = {
-  { 128,   0  },
-  { 112,  16  },
-  {  96,  32  },
-  {  80,  48  },
-  {  64,  64  },
-  {  48,  80  },
-  {  32,  96  },
-  {  16, 112  },
-};
-
 uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int  a_stride,
                             const uint8_t *b, int  b_stride) {
   int distortion = 0;
@@ -176,9 +165,9 @@
   uint8_t temp2[H * W]; \
 \
   var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
-                                    vpx_bilinear_filters[xoffset]); \
+                                    bilinear_filters_2t[xoffset]); \
   var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                     vpx_bilinear_filters[yoffset]); \
+                                     bilinear_filters_2t[yoffset]); \
 \
   return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
 }
@@ -196,9 +185,9 @@
   DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
 \
   var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
-                                    vpx_bilinear_filters[xoffset]); \
+                                    bilinear_filters_2t[xoffset]); \
   var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                     vpx_bilinear_filters[yoffset]); \
+                                     bilinear_filters_2t[yoffset]); \
 \
   vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
 \
@@ -235,6 +224,11 @@
     SUBPIX_VAR(W, H) \
     SUBPIX_AVG_VAR(W, H)
 
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+VARIANCES(128, 128)
+VARIANCES(128, 64)
+VARIANCES(64, 128)
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
 VARIANCES(64, 64)
 VARIANCES(64, 32)
 VARIANCES(32, 64)
@@ -501,9 +495,9 @@
   uint16_t temp2[H * W]; \
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, vpx_bilinear_filters[xoffset]); \
+                                           W, bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
                                           dst_stride, sse); \
@@ -518,9 +512,9 @@
   uint16_t temp2[H * W]; \
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, vpx_bilinear_filters[xoffset]); \
+                                           W, bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                              W, dst, dst_stride, sse); \
@@ -535,9 +529,9 @@
   uint16_t temp2[H * W]; \
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, vpx_bilinear_filters[xoffset]); \
+                                           W, bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                              W, dst, dst_stride, sse); \
@@ -555,9 +549,9 @@
   DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, vpx_bilinear_filters[xoffset]); \
+                                           W, bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                            CONVERT_TO_BYTEPTR(temp2), W); \
@@ -577,9 +571,9 @@
   DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, vpx_bilinear_filters[xoffset]); \
+                                           W, bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                            CONVERT_TO_BYTEPTR(temp2), W); \
@@ -599,9 +593,9 @@
   DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, vpx_bilinear_filters[xoffset]); \
+                                           W, bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                            CONVERT_TO_BYTEPTR(temp2), W); \
@@ -616,6 +610,11 @@
     HIGHBD_SUBPIX_VAR(W, H) \
     HIGHBD_SUBPIX_AVG_VAR(W, H)
 
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+HIGHBD_VARIANCES(128, 128)
+HIGHBD_VARIANCES(128, 64)
+HIGHBD_VARIANCES(64, 128)
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
 HIGHBD_VARIANCES(64, 64)
 HIGHBD_VARIANCES(64, 32)
 HIGHBD_VARIANCES(32, 64)
@@ -677,8 +676,9 @@
     b += b_stride;
     m += m_stride;
   }
-  *sum = (sum64 >= 0) ? ((sum64 + 31) >> 6) : -((-sum64 + 31) >> 6);
-  *sse = (sse64 + 2047) >> 12;
+  sum64 = (sum64 >= 0) ? sum64  : -sum64;
+  *sum = ROUND_POWER_OF_TWO(sum64, 6);
+  *sse = ROUND_POWER_OF_TWO(sse64, 12);
 }
 
 #define MASK_VAR(W, H) \
@@ -702,9 +702,9 @@
   uint8_t temp2[H * W]; \
 \
   var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
-                                    vpx_bilinear_filters[xoffset]); \
+                                    bilinear_filters_2t[xoffset]); \
   var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                     vpx_bilinear_filters[yoffset]); \
+                                     bilinear_filters_2t[yoffset]); \
 \
   return vpx_masked_variance##W##x##H##_c(temp2, W, dst, dst_stride, \
                                           msk, msk_stride, sse); \
@@ -765,27 +765,28 @@
                               const uint8_t *b8, int  b_stride,
                               const uint8_t *m, int  m_stride,
                               int  w, int  h,
-                              uint64_t *sse64, int *sum) {
+                              uint64_t *sse, int64_t *sum) {
   int i, j;
   uint16_t *a = CONVERT_TO_SHORTPTR(a8);
   uint16_t *b = CONVERT_TO_SHORTPTR(b8);
 
-  int64_t sum64 = 0;
-  *sse64 = 0;
+  *sum = 0;
+  *sse = 0;
 
   for (i = 0; i < h; i++) {
     for (j = 0; j < w; j++) {
       const int diff = (a[j] - b[j]) * (m[j]);
-      sum64 += diff;
-      *sse64 += (int64_t)diff * diff;
+      *sum += (int64_t)diff;
+      *sse += (int64_t)diff * diff;
     }
 
     a += a_stride;
     b += b_stride;
     m += m_stride;
   }
-  *sum = (sum64 >= 0) ? ((sum64 + 31) >> 6) : -((-sum64 + 31) >> 6);
-  *sse64 = (*sse64 + 2047) >> 12;
+  *sum = (*sum >= 0) ? *sum  : -*sum;
+  *sum = ROUND_POWER_OF_TWO(*sum, 6);
+  *sse = ROUND_POWER_OF_TWO(*sse, 12);
 }
 
 void highbd_masked_variance(const uint8_t *a8, int  a_stride,
@@ -793,9 +794,11 @@
                             const uint8_t *m, int  m_stride,
                             int  w, int  h,
                             unsigned int *sse, int *sum) {
+  int64_t sum64;
   uint64_t sse64;
   highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride,
-                           w, h, &sse64, sum);
+                           w, h, &sse64, &sum64);
+  *sum = (int)sum64;
   *sse = (unsigned int)sse64;
 }
 
@@ -804,10 +807,11 @@
                                const uint8_t *m, int  m_stride,
                                int  w, int  h,
                                unsigned int *sse, int *sum) {
+  int64_t sum64;
   uint64_t sse64;
   highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride,
-                           w, h, &sse64, sum);
-  *sum = ROUND_POWER_OF_TWO(*sum, 2);
+                           w, h, &sse64, &sum64);
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
 }
 
@@ -816,10 +820,11 @@
                                const uint8_t *m, int  m_stride,
                                int  w, int  h,
                                unsigned int *sse, int *sum) {
+  int64_t sum64;
   uint64_t sse64;
   highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride,
-                           w, h, &sse64, sum);
-  *sum = ROUND_POWER_OF_TWO(*sum, 4);
+                           w, h, &sse64, &sum64);
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
 }
 
@@ -875,9 +880,9 @@
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
                                            H + 1, W, \
-                                           vpx_bilinear_filters[xoffset]); \
+                                           bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                  W, dst, dst_stride, \
@@ -895,9 +900,9 @@
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
                                            H + 1, W, \
-                                           vpx_bilinear_filters[xoffset]); \
+                                           bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_10_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                     W, dst, dst_stride, \
@@ -915,9 +920,9 @@
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
                                            H + 1, W, \
-                                           vpx_bilinear_filters[xoffset]); \
+                                           bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_12_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                     W, dst, dst_stride, \
diff --git a/vpx_dsp/vpx_convolve.c b/vpx_dsp/vpx_convolve.c
index 2d1c927..2e85ed4 100644
--- a/vpx_dsp/vpx_convolve.c
+++ b/vpx_dsp/vpx_convolve.c
@@ -130,18 +130,21 @@
   // --Must round-up because block may be located at sub-pixel position.
   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint8_t temp[135 * 64];
+  uint8_t temp[MAX_EXT_SIZE * MAX_CU_SIZE];
   int intermediate_height =
           (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
-  assert(w <= 64);
-  assert(h <= 64);
+  assert(w <= MAX_CU_SIZE);
+  assert(h <= MAX_CU_SIZE);
+
   assert(y_step_q4 <= 32);
   assert(x_step_q4 <= 32);
 
-  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+                 temp, MAX_CU_SIZE,
                  x_filters, x0_q4, x_step_q4, w, intermediate_height);
-  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+  convolve_vert(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_CU_SIZE,
+                dst, dst_stride,
                 y_filters, y0_q4, y_step_q4, w, h);
 }
 
@@ -237,13 +240,14 @@
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
   /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
-  assert(w <= 64);
-  assert(h <= 64);
+  DECLARE_ALIGNED(16, uint8_t, temp[MAX_CU_SIZE * MAX_CU_SIZE]);
+  assert(w <= MAX_CU_SIZE);
+  assert(h <= MAX_CU_SIZE);
 
-  vpx_convolve8_c(src, src_stride, temp, 64,
+  vpx_convolve8_c(src, src_stride, temp, MAX_CU_SIZE,
                   filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+  vpx_convolve_avg_c(temp, MAX_CU_SIZE, dst, dst_stride,
+                     NULL, 0, NULL, 0, w, h);
 }
 
 void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -459,22 +463,23 @@
   // --Must round-up because block may be located at sub-pixel position.
   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint16_t temp[64 * 135];
+  uint16_t temp[MAX_EXT_SIZE * MAX_CU_SIZE];
   int intermediate_height =
           (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
-  assert(w <= 64);
-  assert(h <= 64);
+  assert(w <= MAX_CU_SIZE);
+  assert(h <= MAX_CU_SIZE);
   assert(y_step_q4 <= 32);
   assert(x_step_q4 <= 32);
 
-  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                        src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+                        CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE,
                         x_filters, x0_q4, x_step_q4, w,
                         intermediate_height, bd);
-  highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
-                       64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
-                       w, h, bd);
+  highbd_convolve_vert(
+    CONVERT_TO_BYTEPTR(temp) + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_CU_SIZE,
+    dst, dst_stride,
+    y_filters, y0_q4, y_step_q4, w, h, bd);
 }
 
 
@@ -556,13 +561,15 @@
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h, int bd) {
   // Fixed size intermediate buffer places limits on parameters.
-  DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
-  assert(w <= 64);
-  assert(h <= 64);
+  DECLARE_ALIGNED(16, uint16_t, temp[MAX_CU_SIZE * MAX_CU_SIZE]);
+  assert(w <= MAX_CU_SIZE);
+  assert(h <= MAX_CU_SIZE);
 
-  vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+  vpx_highbd_convolve8_c(src, src_stride,
+                         CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE,
                          filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
-  vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
+  vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE,
+                            dst, dst_stride,
                             NULL, 0, NULL, 0, w, h, bd);
 }
 
diff --git a/vpx_dsp/vpx_convolve.h b/vpx_dsp/vpx_convolve.h
index 9ed3f17..bd8679d 100644
--- a/vpx_dsp/vpx_convolve.h
+++ b/vpx_dsp/vpx_convolve.h
@@ -17,6 +17,24 @@
 extern "C" {
 #endif
 
+// Note: Fixed-size intermediate buffers place limits on the parameters
+// of some functions. 2D filtering proceeds in two steps:
+//   (1) Interpolate horizontally into an intermediate buffer, temp.
+//   (2) Interpolate temp vertically to derive the sub-pixel result.
+// Deriving the maximum number of rows in the temp buffer (135):
+// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+// --Largest block size is 64x64 pixels (128x128 with VP10 EXT_PARTITION).
+// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+//   original frame (in 1/16th pixel units).
+// --Must round-up because block may be located at sub-pixel position.
+// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+# define MAX_EXT_SIZE 263  // temp-buffer rows when the largest block is 128x128
+#else
+# define MAX_EXT_SIZE 135  // temp-buffer rows for 64x64 blocks (derived above)
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
index b4e6f4c..8d9bf55 100644
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -20,6 +20,12 @@
 extern "C" {
 #endif
 
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+# define MAX_CU_SIZE 128  // largest block width/height, extended partitions
+#else
+# define MAX_CU_SIZE 64  // largest block width/height otherwise
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 #define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))
 #define VPXMAX(x, y) (((x) > (y)) ? (x) : (y))
 
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index fdfd20c..583d9fa 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -50,6 +50,19 @@
   $avx2_x86_64 = 'avx2';
 }
 
+if (vpx_config("CONFIG_VP10") eq "yes" && vpx_config("CONFIG_EXT_PARTITION") eq "yes") {
+  @block_widths = (4, 8, 16, 32, 64, 128);  # 128 is gated on both flags, matching the C #if guards
+} else {
+  @block_widths = (4, 8, 16, 32, 64);
+}
+
+@block_sizes = ();  # list of [width, height] pairs built from @block_widths
+foreach $w (@block_widths) {
+  foreach $h (@block_widths) {
+    push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w);  # keep each side within 2x of the other
+  }
+}
+
 #
 # Intra prediction
 #
@@ -453,52 +466,44 @@
 #
 # Sub Pixel Filters
 #
-add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa/;
-
+add_proto qw/void vpx_convolve_copy/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve_avg/,        "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_horiz/,     "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_vert/,      "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_avg/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/;
+add_proto qw/void vpx_convolve8_avg_vert/,  "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_2d/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_horiz/,        "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_vert/,         "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_2d/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_horiz/,    "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_vert/,     "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 
-add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/;
+specialize qw/vpx_convolve_copy                 /, "$sse2_x86inc";
+specialize qw/vpx_convolve_avg                  /, "$sse2_x86inc";
+specialize qw/vpx_convolve8           sse2 ssse3/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_horiz     sse2 ssse3/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_vert      sse2 ssse3/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_avg       sse2 ssse3/;
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3/;
+specialize qw/vpx_convolve8_avg_vert  sse2 ssse3/;
+specialize qw/vpx_scaled_2d                ssse3/;
 
-add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_2d ssse3/;
-
-add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_horiz/;
-
-add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_vert/;
-
-add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_avg_2d/;
-
-add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_avg_horiz/;
-
-add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_avg_vert/;
+# TODO(any): These need to be extended to up to 128x128 block sizes
+if (!(vpx_config("CONFIG_VP10") eq "yes" && vpx_config("CONFIG_EXT_PARTITION") eq "yes")) {
+  specialize qw/vpx_convolve_copy       neon dspr2 msa/;
+  specialize qw/vpx_convolve_avg        neon dspr2 msa/;
+  specialize qw/vpx_convolve8           neon dspr2 msa/;
+  specialize qw/vpx_convolve8_horiz     neon dspr2 msa/;
+  specialize qw/vpx_convolve8_vert      neon dspr2 msa/;
+  specialize qw/vpx_convolve8_avg       neon dspr2 msa/;
+  specialize qw/vpx_convolve8_avg_horiz neon dspr2 msa/;
+  specialize qw/vpx_convolve8_avg_vert  neon dspr2 msa/;
+}
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  #
-  # Sub Pixel Filters
-  #
   add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
   specialize qw/vpx_highbd_convolve_copy/, "$sse2_x86inc";
 
@@ -960,69 +965,43 @@
 #
 add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
 specialize qw/vpx_subtract_block neon msa/, "$sse2_x86inc";
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  specialize qw/vpx_highbd_subtract_block/;
+}
 
 if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
-#
-# Sum of Squares
-#
+  #
+  # Sum of Squares
+  #
   add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
   specialize qw/vpx_sum_squares_2d_i16 sse2/;
 }
 
-#
-# Single block SAD
-#
-add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x64 avx2 neon msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x32 avx2 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x64 avx2 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x32 avx2 neon msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x16 avx2 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x32 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x16 mmx media neon msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x8 mmx neon msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x16 mmx neon msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x8 mmx neon msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x4 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x8 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x4 mmx neon msa/, "$sse2_x86inc";
-
-#
-# Avg
-#
 if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
+  #
+  # Avg
+  #
   add_proto qw/unsigned int vpx_avg_8x8/, "const uint8_t *, int p";
   specialize qw/vpx_avg_8x8 sse2 neon msa/;
-
   add_proto qw/unsigned int vpx_avg_4x4/, "const uint8_t *, int p";
   specialize qw/vpx_avg_4x4 sse2 neon msa/;
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";
+    specialize qw/vpx_highbd_avg_8x8/;
+    add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
+    specialize qw/vpx_highbd_avg_4x4/;
+  }
 
+  #
+  # Minmax
+  #
   add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
   specialize qw/vpx_minmax_8x8 sse2/;
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+    specialize qw/vpx_highbd_minmax_8x8/;
+  }
 
   add_proto qw/void vpx_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
   specialize qw/vpx_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
@@ -1043,575 +1022,217 @@
   specialize qw/vpx_vector_var neon sse2/;
 }  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 
-add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+#
+# Single block SAD / Single block Avg SAD
+#
+foreach (@block_sizes) {
+  ($w, $h) = @$_;
+  add_proto qw/unsigned int/, "vpx_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  add_proto qw/unsigned int/, "vpx_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+}
+
+specialize qw/vpx_sad64x64    avx2       neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad64x32    avx2            msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x64    avx2            msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x32    avx2       neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x16    avx2            msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x32                    msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x16 mmx     media neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x8  mmx           neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x16  mmx           neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x8   mmx           neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x4                      msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x8                      msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x4   mmx           neon msa/, "$sse2_x86inc";
+
 specialize qw/vpx_sad64x64_avg avx2 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vpx_sad64x32_avg avx2 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vpx_sad32x64_avg avx2 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vpx_sad32x32_avg avx2 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vpx_sad32x16_avg avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x32_avg      msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x16_avg      msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x8_avg       msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x16_avg       msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x8_avg        msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x4_avg        msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x8_avg        msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x4_avg        msa/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x32_avg msa/, "$sse2_x86inc";
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "vpx_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+    add_proto qw/unsigned int/, "vpx_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+    if ($w != 128 && $h != 128 && $w != 4) {
+      specialize "vpx_highbd_sad${w}x${h}", "$sse2_x86inc";
+      specialize "vpx_highbd_sad${w}x${h}_avg", "$sse2_x86inc";
+    }
+  }
+}
 
-add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x16_avg msa/, "$sse2_x86inc";
+#
+# Masked SAD
+#
+if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "vpx_masked_sad${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
+    specialize "vpx_masked_sad${w}x${h}", qw/ssse3/;
+  }
 
-add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x8_avg msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x16_avg msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x8_avg msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x4_avg msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x8_avg msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x4_avg msa/, "$sse2_x86inc";
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    foreach (@block_sizes) {
+      ($w, $h) = @$_;
+      add_proto qw/unsigned int/, "vpx_highbd_masked_sad${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
+      specialize "vpx_highbd_masked_sad${w}x${h}", qw/ssse3/;
+    }
+  }
+}
 
 #
 # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
 #
 # Blocks of 3
-add_proto qw/void vpx_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x3 msa/;
-
-add_proto qw/void vpx_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x3 msa/;
-
-add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+foreach $s (@block_widths) {
+  add_proto qw/void/, "vpx_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+}
+specialize qw/vpx_sad64x64x3            msa/;
+specialize qw/vpx_sad32x32x3            msa/;
 specialize qw/vpx_sad16x16x3 sse3 ssse3 msa/;
+specialize qw/vpx_sad8x8x3   sse3       msa/;
+specialize qw/vpx_sad4x4x3   sse3       msa/;
 
-add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad16x8x3 sse3 ssse3 msa/;
-
-add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x16x3 sse3 msa/;
 
-add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x3 sse3 msa/;
-
-add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x3 sse3 msa/;
-
 # Blocks of 8
-add_proto qw/void vpx_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x8 msa/;
-
-add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x8 msa/;
-
-add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+foreach $s (@block_widths) {
+  add_proto qw/void/, "vpx_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+}
+specialize qw/vpx_sad64x64x8        msa/;
+specialize qw/vpx_sad32x32x8        msa/;
 specialize qw/vpx_sad16x16x8 sse4_1 msa/;
+specialize qw/vpx_sad8x8x8   sse4_1 msa/;
+specialize qw/vpx_sad4x4x8   sse4_1 msa/;
 
-add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad16x8x8 sse4_1 msa/;
-
-add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x16x8 sse4_1 msa/;
-
-add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x8 sse4_1 msa/;
-
-add_proto qw/void vpx_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x4x8 msa/;
-
-add_proto qw/void vpx_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad4x8x8 msa/;
 
-add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x8 sse4_1 msa/;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  foreach $s (@block_widths) {
+    # Blocks of 3
+    add_proto qw/void/, "vpx_highbd_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+    # Blocks of 8
+    add_proto qw/void/, "vpx_highbd_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  }
+  # Blocks of 3
+  add_proto qw/void/, "vpx_highbd_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  add_proto qw/void/, "vpx_highbd_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  # Blocks of 8
+  add_proto qw/void/, "vpx_highbd_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  add_proto qw/void/, "vpx_highbd_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  add_proto qw/void/, "vpx_highbd_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  add_proto qw/void/, "vpx_highbd_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+}
 
 #
 # Multi-block SAD, comparing a reference to N independent blocks
 #
-add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+foreach (@block_sizes) {
+  ($w, $h) = @$_;
+  add_proto qw/void/, "vpx_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+}
 specialize qw/vpx_sad64x64x4d avx2 neon msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x32x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x64x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad64x32x4d           msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x64x4d           msa/, "$sse2_x86inc";
 specialize qw/vpx_sad32x32x4d avx2 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x16x4d           msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x32x4d           msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x16x4d      neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x8x4d            msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x16x4d            msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x8x4d             msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x4x4d             msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x8x4d             msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x4x4d             msa/, "$sse2_x86inc";
 
-add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x16x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x32x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x4d neon msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x4x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x8x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x4d msa/, "$sse2_x86inc";
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  #
+  # Multi-block SAD, comparing a reference to N independent blocks
+  #
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/void/, "vpx_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+    if ($w != 128 && $h != 128) {
+      specialize "vpx_highbd_sad${w}x${h}x4d", "$sse2_x86inc";
+    }
+  }
+}
 
 #
 # Structured Similarity (SSIM)
 #
 if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
-    add_proto qw/void vpx_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/vpx_ssim_parms_8x8/, "$sse2_x86_64";
+  add_proto qw/void vpx_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+  specialize qw/vpx_ssim_parms_8x8/, "$sse2_x86_64";
 
-    add_proto qw/void vpx_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/vpx_ssim_parms_16x16/, "$sse2_x86_64";
-}
+  add_proto qw/void vpx_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+  specialize qw/vpx_ssim_parms_16x16/, "$sse2_x86_64";
 
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  #
-  # Block subtraction
-  #
-  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
-  specialize qw/vpx_highbd_subtract_block/;
-
-  #
-  # Single block SAD
-  #
-  add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad64x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad64x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x4/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad4x8/;
-
-  add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad4x4/;
-
-  #
-  # Avg
-  #
-  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";
-  specialize qw/vpx_highbd_avg_8x8/;
-  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
-  specialize qw/vpx_highbd_avg_4x4/;
-  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-  specialize qw/vpx_highbd_minmax_8x8/;
-
-  add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad64x64_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad64x32_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x64_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x32_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x16_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x32_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x16_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x8_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x16_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x8_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x4_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad4x8_avg/;
-
-  add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad4x4_avg/;
-
-  #
-  # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
-  #
-  # Blocks of 3
-  add_proto qw/void vpx_highbd_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x64x3/;
-
-  add_proto qw/void vpx_highbd_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x32x3/;
-
-  add_proto qw/void vpx_highbd_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x16x3/;
-
-  add_proto qw/void vpx_highbd_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x8x3/;
-
-  add_proto qw/void vpx_highbd_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x16x3/;
-
-  add_proto qw/void vpx_highbd_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x8x3/;
-
-  add_proto qw/void vpx_highbd_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x4x3/;
-
-  # Blocks of 8
-  add_proto qw/void vpx_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x64x8/;
-
-  add_proto qw/void vpx_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x32x8/;
-
-  add_proto qw/void vpx_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x16x8/;
-
-  add_proto qw/void vpx_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x8x8/;
-
-  add_proto qw/void vpx_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x16x8/;
-
-  add_proto qw/void vpx_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x8x8/;
-
-  add_proto qw/void vpx_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x4x8/;
-
-  add_proto qw/void vpx_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x8x8/;
-
-  add_proto qw/void vpx_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x4x8/;
-
-  #
-  # Multi-block SAD, comparing a reference to N independent blocks
-  #
-  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x64x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x32x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x64x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x32x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x16x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x32x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x16x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x8x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x16x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x8x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x4x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x8x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x4x4d/, "$sse2_x86inc";
-
-  #
-  # Structured Similarity (SSIM)
-  #
-  if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void vpx_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/vpx_highbd_ssim_parms_8x8/;
   }
-}  # CONFIG_VP9_HIGHBITDEPTH
+}
 }  # CONFIG_ENCODERS
 
 if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
 
 #
-# Variance
-#
-add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance64x64 sse2 avx2 neon msa/;
-
-add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance64x32 sse2 avx2 neon msa/;
-
-add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x64 sse2 neon msa/;
-
-add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x32 sse2 avx2 neon msa/;
-
-add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x16 sse2 avx2 msa/;
-
-add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x32 sse2 msa/;
-
-add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon msa/;
-
-add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x8 mmx sse2 neon msa/;
-
-add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x16 mmx sse2 neon msa/;
-
-add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x8 mmx sse2 media neon msa/;
-
-add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x4 sse2 msa/;
-
-add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance4x8 sse2 msa/;
-
-add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance4x4 mmx sse2 msa/;
-
-#
 # Specialty Variance
 #
 add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
-
 add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_get8x8var mmx sse2 neon msa/;
+
+specialize qw/vpx_get16x16var     avx2 sse2 neon msa/;
+specialize qw/vpx_get8x8var   mmx      sse2 neon msa/;
 
 add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon msa/;
-
 add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x8 sse2 msa/;
-
 add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x16 sse2 msa/;
-
 add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x8 sse2 msa/;
 
-add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
-  specialize qw/vpx_get_mb_ss mmx sse2 msa/;
+specialize qw/vpx_mse16x16 mmx avx2 sse2 media neon msa/;
+specialize qw/vpx_mse16x8           sse2            msa/;
+specialize qw/vpx_mse8x16           sse2            msa/;
+specialize qw/vpx_mse8x8            sse2            msa/;
 
-add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride";
-  specialize qw/vpx_get4x4sse_cs neon msa/;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  foreach $bd (8, 10, 12) {
+    add_proto qw/void/, "vpx_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+    add_proto qw/void/, "vpx_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
 
-add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+    add_proto qw/unsigned int/, "vpx_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+    add_proto qw/unsigned int/, "vpx_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+    add_proto qw/unsigned int/, "vpx_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+    add_proto qw/unsigned int/, "vpx_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
 
-if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
-  add_proto qw/unsigned int vpx_masked_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance32x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance16x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance64x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance32x64 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance32x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance64x64 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance16x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance16x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance8x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance8x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance8x4 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance4x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance4x4 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance64x64 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance32x64 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance64x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance32x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance16x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance32x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance16x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance8x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance16x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance8x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance8x4 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance4x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance4x4 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad64x64 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad32x64 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad64x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad32x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad16x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad32x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad16x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad16x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad8x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad8x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad8x4 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad4x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad4x4 ssse3/;
-
-  if (vpx_config("CONFIG_EXT_PARTITION") eq "yes") {
-    add_proto qw/unsigned int vpx_masked_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_masked_variance128x128/;
-
-    add_proto qw/unsigned int vpx_masked_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_masdctked_variance128x64/;
-
-    add_proto qw/unsigned int vpx_masked_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_masked_variance64x128/;
-
-    add_proto qw/unsigned int vpx_masked_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_masked_sub_pixel_variance128x128/;
-
-    add_proto qw/unsigned int vpx_masked_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_masked_sub_pixel_variance128x64/;
-
-    add_proto qw/unsigned int vpx_masked_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_masked_sub_pixel_variance64x128/;
-
-    add_proto qw/unsigned int vpx_masked_sad128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_masked_sad128x128/;
-
-    add_proto qw/unsigned int vpx_masked_sad128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_masked_sad128x64/;
-
-    add_proto qw/unsigned int vpx_masked_sad64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_masked_sad64x128/;
+    specialize "vpx_highbd_${bd}_mse16x16", qw/sse2/;
+    specialize "vpx_highbd_${bd}_mse8x8", qw/sse2/;
   }
 }
 
+#
+# ...
+#
 if (vpx_config("CONFIG_AFFINE_MOTION") eq "yes") {
   add_proto qw/void vpx_upsampled_pred/, "uint8_t *comp_pred, int width, int height, const uint8_t *ref, int ref_stride";
     specialize qw/vpx_upsampled_pred sse2/;
@@ -1620,796 +1241,129 @@
 }
 
 #
-# Subpixel Variance
+# ...
 #
-add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
+add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
 
-add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_get_mb_ss mmx sse2 msa/;
+specialize qw/vpx_get4x4sse_cs neon msa/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+#
+# Variance / Subpixel Variance / Subpixel Avg Variance
+#
+foreach (@block_sizes) {
+  ($w, $h) = @$_;
+  add_proto qw/unsigned int/, "vpx_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/uint32_t/, "vpx_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t/, "vpx_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+}
 
-add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_variance64x64     sse2 avx2       neon msa/;
+specialize qw/vpx_variance64x32     sse2 avx2       neon msa/;
+specialize qw/vpx_variance32x64     sse2            neon msa/;
+specialize qw/vpx_variance32x32     sse2 avx2       neon msa/;
+specialize qw/vpx_variance32x16     sse2 avx2            msa/;
+specialize qw/vpx_variance16x32     sse2                 msa/;
+specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon msa/;
+specialize qw/vpx_variance16x8  mmx sse2            neon msa/;
+specialize qw/vpx_variance8x16  mmx sse2            neon msa/;
+specialize qw/vpx_variance8x8   mmx sse2      media neon msa/;
+specialize qw/vpx_variance8x4       sse2                 msa/;
+specialize qw/vpx_variance4x8       sse2                 msa/;
+specialize qw/vpx_variance4x4   mmx sse2                 msa/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance64x64     avx2       neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance64x32                     msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance32x64                     msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance32x32     avx2       neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance32x16                     msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance16x32                     msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance16x16 mmx      media neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance16x8  mmx                 msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance8x16  mmx                 msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance8x8   mmx      media neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance8x4                       msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance4x8                       msa/, "$sse_x86inc",                  "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance4x4   mmx                 msa/, "$sse_x86inc",                  "$ssse3_x86inc";
 
-add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance64x32      msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance32x64      msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance32x16      msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance16x32      msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance16x16      msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance16x8       msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance8x16       msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance8x8        msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance8x4        msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance4x8        msa/, "$sse_x86inc",                 "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance4x4        msa/, "$sse_x86inc",                 "$ssse3_x86inc";
 
-add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x16 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  foreach $bd (8, 10, 12) {
+    foreach (@block_sizes) {
+      ($w, $h) = @$_;
+      add_proto qw/unsigned int/, "vpx_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      add_proto qw/uint32_t/, "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      add_proto qw/uint32_t/, "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
+        specialize "vpx_highbd_${bd}_variance${w}x${h}", "sse2";
+      }
+      if ($w != 128 && $h != 128 && $w != 4) {
+        specialize "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", $sse2_x86inc;
+        specialize "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", $sse2_x86inc;
+      }
+    }
+  }
+}  # CONFIG_VP9_HIGHBITDEPTH
 
-add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x8 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+#
+# Masked Variance / Masked Subpixel Variance
+#
+if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "vpx_masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
+    add_proto qw/unsigned int/, "vpx_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
+    specialize "vpx_masked_variance${w}x${h}", qw/ssse3/;
+    specialize "vpx_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+  }
 
-add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x16 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x8 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    foreach $bd ("_", "_10_", "_12_") {
+      foreach (@block_sizes) {
+        ($w, $h) = @$_;
+        add_proto qw/unsigned int/, "vpx_highbd${bd}masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
+        add_proto qw/unsigned int/, "vpx_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
+        specialize "vpx_highbd${bd}masked_variance${w}x${h}", qw/ssse3/;
+        specialize "vpx_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+      }
+    }
+  }
+}
 
 #
 # Specialty Subpixel
 #
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_h mmx sse2 media/;
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
+specialize qw/vpx_variance_halfpixvar16x16_h mmx sse2 media/;
 
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_v mmx sse2 media/;
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
+specialize qw/vpx_variance_halfpixvar16x16_v mmx sse2 media/;
 
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_hv mmx sse2 media/;
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
+specialize qw/vpx_variance_halfpixvar16x16_hv mmx sse2 media/;
 
+#
+# Comp Avg
+#
+add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance64x64 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance64x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance32x64 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance32x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance32x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance16x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance16x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance16x8 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance8x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance8x8 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-  add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance64x64 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance64x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance32x64 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance32x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance32x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance16x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance16x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance16x8 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance8x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance8x8 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-  add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance64x64 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance64x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance32x64 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance32x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance32x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance16x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance16x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance16x8 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance8x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance8x8 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-  add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
-  add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
-  add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
-  add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse16x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse8x8 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_mse16x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_mse8x8 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_mse16x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_mse8x8 sse2/;
-
-  if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
-    add_proto qw/unsigned int vpx_highbd_masked_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance32x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance16x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance64x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance32x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance32x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance64x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance16x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance16x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance8x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance8x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance8x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance4x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance4x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance32x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance16x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance64x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance32x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance32x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance64x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance16x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance16x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance8x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance8x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance8x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance4x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance4x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance32x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance16x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance64x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance32x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance32x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance64x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance16x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance16x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance8x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance8x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance8x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance4x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance4x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance64x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance64x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance32x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance32x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance32x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance16x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance16x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance16x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance8x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance8x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance8x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance4x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance4x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance64x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance64x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance32x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance32x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance32x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance16x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance16x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance16x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance8x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance8x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance8x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance4x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance4x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance64x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance64x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance32x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance32x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance32x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance16x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance16x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance16x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance8x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance8x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance8x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance4x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance4x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad64x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad32x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad64x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad32x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad16x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad32x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad16x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad16x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad8x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad8x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad8x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad4x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad4x4 ssse3/;
-
-    if (vpx_config("CONFIG_EXT_PARTITION") eq "yes") {
-      add_proto qw/unsigned int vpx_highbd_masked_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_masked_variance128x128/;
-
-      add_proto qw/unsigned int vpx_highbd_masked_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_masked_variance128x64/;
-
-      add_proto qw/unsigned int vpx_highbd_masked_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_masked_variance64x128/;
-
-      add_proto qw/unsigned int vpx_highbd_10_masked_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_variance128x128/;
-
-      add_proto qw/unsigned int vpx_highbd_10_masked_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_variance128x64/;
-
-      add_proto qw/unsigned int vpx_highbd_10_masked_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_variance64x128/;
-
-      add_proto qw/unsigned int vpx_highbd_12_masked_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_variance128x128/;
-
-      add_proto qw/unsigned int vpx_highbd_12_masked_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_variance128x64/;
-
-      add_proto qw/unsigned int vpx_highbd_12_masked_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_variance64x128/;
-
-      add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_masked_sub_pixel_variance128x128/;
-
-      add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_masked_sub_pixel_variance128x64/;
-
-      add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_masked_sub_pixel_variance64x128/;
-
-      add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_sub_pixel_variance128x128/;
-
-      add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_sub_pixel_variance128x64/;
-
-      add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_sub_pixel_variance64x128/;
-
-      add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_12_masked_sub_pixel_variance128x128/;
-
-      add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_12_masked_sub_pixel_variance128x64/;
-
-      add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_12_masked_sub_pixel_variance64x128/;
-
-      add_proto qw/unsigned int vpx_highbd_masked_sad128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-      specialize qw/vpx_highbd_masked_sad128x128/;
-
-      add_proto qw/unsigned int vpx_highbd_masked_sad128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-      specialize qw/vpx_highbd_masked_sad128x64/;
-
-      add_proto qw/unsigned int vpx_highbd_masked_sad64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-      specialize qw/vpx_highbd_masked_sad64x128/;
-    }
-  }
-
   add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
-
-  #
-  # Subpixel Variance
-  #
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+}
 
-}  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
 
 1;
diff --git a/vpx_dsp/vpx_filter.h b/vpx_dsp/vpx_filter.h
index e049f74..cfe8161 100644
--- a/vpx_dsp/vpx_filter.h
+++ b/vpx_dsp/vpx_filter.h
@@ -29,7 +29,18 @@
 
 #define BIL_SUBPEL_BITS    3
 #define BIL_SUBPEL_SHIFTS  (1 << BIL_SUBPEL_BITS)
-extern const uint8_t vpx_bilinear_filters[BIL_SUBPEL_SHIFTS][2];
+
+// 2-tap bilinear filters: one {tap0, tap1} pair per subpel shift, sum is 128
+static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
+  { 128,   0  },
+  { 112,  16  },
+  {  96,  32  },
+  {  80,  48  },
+  {  64,  64  },
+  {  48,  80  },
+  {  32,  96  },
+  {  16, 112  },
+};
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h
index b6fbfcf..95aa790 100644
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -15,6 +15,7 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_convolve.h"
 
 typedef void filter8_1dfunction (
   const uint8_t *src_ptr,
@@ -112,25 +113,27 @@
                               int w, int h) { \
   assert(filter_x[3] != 128); \
   assert(filter_y[3] != 128); \
-  assert(w <= 64); \
-  assert(h <= 64); \
+  assert(w <= MAX_CU_SIZE); \
+  assert(h <= MAX_CU_SIZE); \
   assert(x_step_q4 == 16); \
   assert(y_step_q4 == 16); \
   if (filter_x[0] || filter_x[1] || filter_x[2]|| \
       filter_y[0] || filter_y[1] || filter_y[2]) { \
-    DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
-    vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+    DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+7)]); \
+    vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+                              fdata2, MAX_CU_SIZE, \
                               filter_x, x_step_q4, filter_y, y_step_q4, \
                               w, h + 7); \
-    vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+    vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_CU_SIZE, MAX_CU_SIZE, \
+                                    dst, dst_stride, \
                                     filter_x, x_step_q4, filter_y, \
                                     y_step_q4, w, h); \
   } else { \
-    DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
-    vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
+    DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+1)]); \
+    vpx_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_CU_SIZE, \
                               filter_x, x_step_q4, filter_y, y_step_q4, \
                               w, h + 1); \
-    vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+    vpx_convolve8_##avg##vert_##opt(fdata2, MAX_CU_SIZE, dst, dst_stride, \
                                     filter_x, x_step_q4, filter_y, \
                                     y_step_q4, w, h); \
   } \
@@ -250,31 +253,40 @@
                                      const int16_t *filter_x, int x_step_q4, \
                                      const int16_t *filter_y, int y_step_q4, \
                                      int w, int h, int bd) { \
-  assert(w <= 64); \
-  assert(h <= 64); \
+  assert(w <= MAX_CU_SIZE); \
+  assert(h <= MAX_CU_SIZE); \
   if (x_step_q4 == 16 && y_step_q4 == 16) { \
     if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
         filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
-      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
-      vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
-                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+7)]); \
+      vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, \
+                                       src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), \
+                                       MAX_CU_SIZE, \
                                        filter_x, x_step_q4, \
                                        filter_y, y_step_q4, \
                                        w, h + 7, bd); \
-      vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
-                                             64, dst, dst_stride, \
-                                             filter_x, x_step_q4, \
-                                             filter_y, y_step_q4, \
-                                             w, h, bd); \
+      vpx_highbd_convolve8_##avg##vert_##opt( \
+        CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_CU_SIZE, \
+        MAX_CU_SIZE, \
+        dst, \
+        dst_stride, \
+        filter_x, x_step_q4, \
+        filter_y, y_step_q4, \
+        w, h, bd); \
     } else { \
-      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
-      vpx_highbd_convolve8_horiz_##opt(src, src_stride, \
-                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+1)]); \
+      vpx_highbd_convolve8_horiz_##opt(src, \
+                                       src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), \
+                                       MAX_CU_SIZE, \
                                        filter_x, x_step_q4, \
                                        filter_y, y_step_q4, \
                                        w, h + 1, bd); \
-      vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
-                                             dst, dst_stride, \
+      vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), \
+                                             MAX_CU_SIZE, \
+                                             dst, \
+                                             dst_stride, \
                                              filter_x, x_step_q4, \
                                              filter_y, y_step_q4, \
                                              w, h, bd); \
diff --git a/vpx_dsp/x86/masked_sad_intrin_ssse3.c b/vpx_dsp/x86/masked_sad_intrin_ssse3.c
index 384f89b..8b9ff10 100644
--- a/vpx_dsp/x86/masked_sad_intrin_ssse3.c
+++ b/vpx_dsp/x86/masked_sad_intrin_ssse3.c
@@ -64,6 +64,11 @@
                           m, n); \
 }
 
+#if CONFIG_EXT_PARTITION
+MASKSADMXN_SSSE3(128, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(64, 128)
+#endif  // CONFIG_EXT_PARTITION
 MASKSADMXN_SSSE3(64, 64)
 MASKSADMXN_SSSE3(64, 32)
 MASKSADMXN_SSSE3(32, 64)
@@ -100,7 +105,7 @@
 MASKSAD4XN_SSSE3(4)
 
 // For width a multiple of 16
-// Assumes values in m are <=64 and w = 16, 32, or 64
+// Assumes values in m are <=64
 static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                             const uint8_t *b_ptr, int b_stride,
                                             const uint8_t *m_ptr, int m_stride,
@@ -255,6 +260,11 @@
                                  msk_stride, m, n); \
 }
 
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN_SSSE3(128, 128)
+HIGHBD_MASKSADMXN_SSSE3(128, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 128)
+#endif  // CONFIG_EXT_PARTITION
 HIGHBD_MASKSADMXN_SSSE3(64, 64)
 HIGHBD_MASKSADMXN_SSSE3(64, 32)
 HIGHBD_MASKSADMXN_SSSE3(32, 64)
diff --git a/vpx_dsp/x86/masked_variance_intrin_ssse3.c b/vpx_dsp/x86/masked_variance_intrin_ssse3.c
index 96af421..ca4f6fc 100644
--- a/vpx_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/vpx_dsp/x86/masked_variance_intrin_ssse3.c
@@ -18,17 +18,63 @@
 #include "vpx_ports/mem.h"
 #include "vpx_dsp/vpx_filter.h"
 
-// Assumes mask values are <= 64
 
-// Log 2 of powers of 2 as an expression
-#define LOG2_P2(n)  ((n) ==   1 ? 0 :       \
-                     (n) ==   2 ? 1 :       \
-                     (n) ==   4 ? 2 :       \
-                     (n) ==   8 ? 3 :       \
-                     (n) ==  16 ? 4 :       \
-                     (n) ==  32 ? 5 :       \
-                     (n) ==  64 ? 6 :       \
-                     (n) == 128 ? 7 :  -1)
+// Subpel offset that corresponds to a half pixel shift
+#define HALF_PIXEL_OFFSET (BIL_SUBPEL_SHIFTS/2)
+
+/*****************************************************************************
+ * Horizontal additions
+ *****************************************************************************/
+
+static INLINE int32_t hsum_epi32_si32(__m128i v_d) {
+  v_d = _mm_hadd_epi32(v_d, v_d);
+  v_d = _mm_hadd_epi32(v_d, v_d);
+  return _mm_cvtsi128_si32(v_d);
+}
+
+static INLINE int64_t hsum_epi64_si64(__m128i v_q) {
+  v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
+#if ARCH_X86_64
+  return _mm_cvtsi128_si64(v_q);
+#else
+  {
+    int64_t tmp;
+    _mm_storel_epi64((__m128i*)&tmp, v_q);
+    return tmp;
+  }
+#endif
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int64_t hsum_epi32_si64(__m128i v_d) {
+  const __m128i v_sign_d =  _mm_cmplt_epi32(v_d, _mm_setzero_si128());
+  const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
+  const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
+  return hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE int calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q,
+                                       unsigned int* sse,
+                                       const int w, const int h) {
+  int64_t sum64;
+  uint64_t sse64;
+
+  // Horizontal sum of the per-lane sum and SSE accumulators
+  sum64 = hsum_epi32_si32(v_sum_d);
+  sse64 = hsum_epi64_si64(v_sse_q);
+
+  sum64 = (sum64 >= 0) ? sum64 : -sum64;
+
+  // Round: shift |sum| down by 6 bits and the SSE down by 12 bits
+  sum64 = ROUND_POWER_OF_TWO(sum64, 6);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 12);
+
+  // Store the SSE
+  *sse = (unsigned int)sse64;
+  // Compute the variance: SSE - sum^2 / (w * h)
+  return  *sse - ((sum64 * sum64) / (w * h));
+}
 
 /*****************************************************************************
  * n*16 Wide versions
@@ -98,30 +144,7 @@
     m += m_stride;
   }
 
-  // Horizontal sum
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_srli_si128(v_sse_q, 8));
-
-  // Round
-  v_sum_d = _mm_sub_epi32(v_sum_d, _mm_cmplt_epi32(v_sum_d, v_zero));
-  v_sum_d = _mm_add_epi32(v_sum_d, _mm_set_epi32(0, 0, 0, 31));
-  v_sum_d = _mm_srai_epi32(v_sum_d, 6);
-
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_set_epi32(0, 0, 0, 2047));
-  v_sse_q = _mm_srli_epi64(v_sse_q, 12);
-
-  // Store the SSE
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
-  // Compute the variance
-  v_sum_d = _mm_abs_epi32(v_sum_d);
-  v_sum_d = _mm_mul_epu32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_srl_epi64(v_sum_d,
-                          _mm_set_epi32(0, 0, 0, LOG2_P2(w) + LOG2_P2(h)));
-  v_sse_q = _mm_sub_epi64(v_sse_q, v_sum_d);
-
-  return _mm_cvtsi128_si32(v_sse_q);
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
 }
 
 #define MASKED_VARWXH(W, H)                                               \
@@ -144,6 +167,11 @@
 MASKED_VARWXH(32, 64)
 MASKED_VARWXH(64, 32)
 MASKED_VARWXH(64, 64)
+#if CONFIG_EXT_PARTITION
+MASKED_VARWXH(64, 128)
+MASKED_VARWXH(128, 64)
+MASKED_VARWXH(128, 128)
+#endif  // CONFIG_EXT_PARTITION
 
 /*****************************************************************************
  * 8 Wide versions
@@ -198,29 +226,7 @@
     m += m_stride;
   }
 
-  // Horizontal sum
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_srli_si128(v_sse_q, 8));
-
-  // Round
-  v_sum_d = _mm_sub_epi32(v_sum_d, _mm_cmplt_epi32(v_sum_d, v_zero));
-  v_sum_d = _mm_add_epi32(v_sum_d, _mm_set_epi32(0, 0, 0, 31));
-  v_sum_d = _mm_srai_epi32(v_sum_d, 6);
-
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_set_epi32(0, 0, 0, 2047));
-  v_sse_q = _mm_srli_epi64(v_sse_q, 12);
-
-  // Store the SSE
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
-  // Compute the variance
-  v_sum_d = _mm_abs_epi32(v_sum_d);
-  v_sum_d = _mm_mul_epu32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_srl_epi64(v_sum_d, _mm_set_epi32(0, 0, 0, LOG2_P2(h) + 3));
-  v_sse_q = _mm_sub_epi64(v_sse_q, v_sum_d);
-
-  return _mm_cvtsi128_si32(v_sse_q);
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
 }
 
 #define MASKED_VAR8XH(H)                                                  \
@@ -302,29 +308,7 @@
     m += m_stride * 2;
   }
 
-  // Horizontal sum
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_srli_si128(v_sse_q, 8));
-
-  // Round
-  v_sum_d = _mm_sub_epi32(v_sum_d, _mm_cmplt_epi32(v_sum_d, v_zero));
-  v_sum_d = _mm_add_epi32(v_sum_d, _mm_set_epi32(0, 0, 0, 31));
-  v_sum_d = _mm_srai_epi32(v_sum_d, 6);
-
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_set_epi32(0, 0, 0, 2047));
-  v_sse_q = _mm_srli_epi64(v_sse_q, 12);
-
-  // Store the SSE
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
-  // Compute the variance
-  v_sum_d = _mm_abs_epi32(v_sum_d);
-  v_sum_d = _mm_mul_epu32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_srl_epi64(v_sum_d, _mm_set_epi32(0, 0, 0, LOG2_P2(h) + 2));
-  v_sse_q = _mm_sub_epi64(v_sse_q, v_sum_d);
-
-  return _mm_cvtsi128_si32(v_sse_q);
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
 }
 
 #define MASKED_VAR4XH(H)                                                  \
@@ -350,13 +334,13 @@
     const uint16_t *b, int  b_stride,
     const uint8_t *m, int  m_stride,
     int w, int  h,
-    __m128i* v_sum_d, __m128i* v_sse_q) {
+    int64_t *sum, uint64_t *sse) {
   int ii, jj;
 
   const __m128i v_zero = _mm_setzero_si128();
 
-  *v_sum_d = _mm_setzero_si128();
-  *v_sse_q = _mm_setzero_si128();
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
 
   assert((w % 8) == 0);
 
@@ -373,7 +357,7 @@
       // Difference: [-4095, 4095]
       const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
 
-      // Error - [-4095, 4095] * [0, 64] => fits in 19 bits (incld sign bit)
+      // Error - [-4095, 4095] * [0, 64] => sum of 2 of these fits in 19 bits
       const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
 
       // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
@@ -397,8 +381,8 @@
       v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
 
       // Accumulate
-      *v_sum_d = _mm_add_epi32(*v_sum_d, v_e_d);
-      *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_q);
+      v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
+      v_sse_q = _mm_add_epi64(v_sse_q, v_se_q);
     }
 
     // Move on to next row
@@ -408,17 +392,13 @@
   }
 
   // Horizontal sum
-  *v_sum_d = _mm_hadd_epi32(*v_sum_d, *v_sum_d);
-  *v_sum_d = _mm_hadd_epi32(*v_sum_d, *v_sum_d);
-  *v_sse_q = _mm_add_epi64(*v_sse_q, _mm_srli_si128(*v_sse_q, 8));
+  *sum = hsum_epi32_si64(v_sum_d);
+  *sse = hsum_epi64_si64(v_sse_q);
 
   // Round
-  *v_sum_d = _mm_sub_epi32(*v_sum_d, _mm_cmplt_epi32(*v_sum_d, v_zero));
-  *v_sum_d = _mm_add_epi32(*v_sum_d, _mm_set_epi32(0, 0, 0, 31));
-  *v_sum_d = _mm_srai_epi32(*v_sum_d, 6);
-
-  *v_sse_q = _mm_add_epi64(*v_sse_q, _mm_set_epi32(0, 0, 0, 2047));
-  *v_sse_q = _mm_srli_epi64(*v_sse_q, 12);
+  *sum = (*sum >= 0) ? *sum  : -*sum;
+  *sum = ROUND_POWER_OF_TWO(*sum, 6);
+  *sse = ROUND_POWER_OF_TWO(*sse, 12);
 }
 
 // Main calculation for 4 wide blocks
@@ -427,13 +407,13 @@
     const uint16_t *b, int  b_stride,
     const uint8_t *m, int  m_stride,
     int  h,
-    __m128i* v_sum_d, __m128i* v_sse_q) {
+    int64_t *sum, uint64_t *sse) {
   int ii;
 
   const __m128i v_zero = _mm_setzero_si128();
 
-  *v_sum_d = _mm_setzero_si128();
-  *v_sse_q = _mm_setzero_si128();
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
 
   assert((h % 2) == 0);
 
@@ -481,8 +461,8 @@
     v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
 
     // Accumulate
-    *v_sum_d = _mm_add_epi32(*v_sum_d, v_e_d);
-    *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_q);
+    v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
+    v_sse_q = _mm_add_epi64(v_sse_q, v_se_q);
 
     // Move on to next row
     a += a_stride * 2;
@@ -491,17 +471,13 @@
   }
 
   // Horizontal sum
-  *v_sum_d = _mm_hadd_epi32(*v_sum_d, *v_sum_d);
-  *v_sum_d = _mm_hadd_epi32(*v_sum_d, *v_sum_d);
-  *v_sse_q = _mm_add_epi64(*v_sse_q, _mm_srli_si128(*v_sse_q, 8));
+  *sum = hsum_epi32_si32(v_sum_d);
+  *sse = hsum_epi64_si64(v_sse_q);
 
   // Round
-  *v_sum_d = _mm_sub_epi32(*v_sum_d, _mm_cmplt_epi32(*v_sum_d, v_zero));
-  *v_sum_d = _mm_add_epi32(*v_sum_d, _mm_set_epi32(0, 0, 0, 31));
-  *v_sum_d = _mm_srai_epi32(*v_sum_d, 6);
-
-  *v_sse_q = _mm_add_epi64(*v_sse_q, _mm_set_epi32(0, 0, 0, 2047));
-  *v_sse_q = _mm_srli_epi64(*v_sse_q, 12);
+  *sum = (*sum >= 0) ? *sum  : -*sum;
+  *sum = ROUND_POWER_OF_TWO(*sum, 6);
+  *sse = ROUND_POWER_OF_TWO(*sse, 12);
 }
 
 static INLINE unsigned int highbd_masked_variancewxh_ssse3(
@@ -510,26 +486,20 @@
     const uint8_t *m, int  m_stride,
     int w, int  h,
     unsigned int *sse) {
-  __m128i v_sum_d, v_sse_q;
+  uint64_t sse64;
+  int64_t sum64;
 
   if (w == 4)
     highbd_masked_variance64_4wide_ssse3(a, a_stride, b,  b_stride, m, m_stride,
-            h, &v_sum_d, &v_sse_q);
+            h, &sum64, &sse64);
   else
     highbd_masked_variance64_ssse3(a, a_stride, b,  b_stride, m, m_stride, w, h,
-            &v_sum_d, &v_sse_q);
+            &sum64, &sse64);
 
   // Store the SSE
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
-  // Compute the variance
-  v_sum_d = _mm_abs_epi32(v_sum_d);
-  v_sum_d = _mm_mul_epu32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_srl_epi64(v_sum_d,
-                          _mm_set_epi32(0, 0, 0, LOG2_P2(w) + LOG2_P2(h)));
-  v_sse_q = _mm_sub_epi64(v_sse_q, v_sum_d);
-
-  return _mm_cvtsi128_si32(v_sse_q);
+  *sse = (unsigned int)sse64;
+  // Compute and return variance
+  return *sse - ((sum64 * sum64) / (w * h));
 }
 
 static INLINE unsigned int highbd_10_masked_variancewxh_ssse3(
@@ -538,32 +508,24 @@
     const uint8_t *m, int  m_stride,
     int w, int  h,
     unsigned int *sse) {
-  __m128i v_sum_d, v_sse_q;
+  uint64_t sse64;
+  int64_t sum64;
 
   if (w == 4)
     highbd_masked_variance64_4wide_ssse3(a, a_stride, b,  b_stride, m, m_stride,
-            h, &v_sum_d, &v_sse_q);
+            h, &sum64, &sse64);
   else
     highbd_masked_variance64_ssse3(a, a_stride, b,  b_stride, m, m_stride, w, h,
-            &v_sum_d, &v_sse_q);
+            &sum64, &sse64);
 
-  // Round sum and sse
-  v_sum_d = _mm_srai_epi32(_mm_add_epi32(v_sum_d,
-          _mm_set_epi32(0, 0, 0, 1 << 1)), 2);
-  v_sse_q = _mm_srli_epi64(_mm_add_epi64(v_sse_q,
-          _mm_set_epi32(0, 0, 0, 1 << 3)), 4);
+  // Normalise: rounding shift of the sum by 2 bits and the SSE by 4 bits
+  sum64 = ROUND_POWER_OF_TWO(sum64, 2);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 4);
 
   // Store the SSE
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
-  // Compute the variance
-  v_sum_d = _mm_abs_epi32(v_sum_d);
-  v_sum_d = _mm_mul_epu32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_srl_epi64(v_sum_d,
-                          _mm_set_epi32(0, 0, 0, LOG2_P2(w) + LOG2_P2(h)));
-  v_sse_q = _mm_sub_epi64(v_sse_q, v_sum_d);
-
-  return _mm_cvtsi128_si32(v_sse_q);
+  *sse = (unsigned int)sse64;
+  // Compute and return variance
+  return *sse - ((sum64 * sum64) / (w * h));
 }
 
 static INLINE unsigned int highbd_12_masked_variancewxh_ssse3(
@@ -572,32 +534,23 @@
     const uint8_t *m, int  m_stride,
     int w, int  h,
     unsigned int *sse) {
-  __m128i v_sum_d, v_sse_q;
+  uint64_t sse64;
+  int64_t sum64;
 
   if (w == 4)
     highbd_masked_variance64_4wide_ssse3(a, a_stride, b,  b_stride, m, m_stride,
-            h, &v_sum_d, &v_sse_q);
+            h, &sum64, &sse64);
   else
     highbd_masked_variance64_ssse3(a, a_stride, b,  b_stride, m, m_stride, w, h,
-            &v_sum_d, &v_sse_q);
+            &sum64, &sse64);
 
-  // Round sum and sse
-  v_sum_d = _mm_srai_epi32(_mm_add_epi32(v_sum_d,
-          _mm_set_epi32(0, 0, 0, 1 << 3)), 4);
-  v_sse_q = _mm_srli_epi64(_mm_add_epi64(v_sse_q,
-          _mm_set_epi32(0, 0, 0, 1 << 7)), 8);
+  sum64 = ROUND_POWER_OF_TWO(sum64, 4);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 8);
 
   // Store the SSE
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
-  // Compute the variance
-  v_sum_d = _mm_abs_epi32(v_sum_d);
-  v_sum_d = _mm_mul_epu32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_srl_epi64(v_sum_d,
-                          _mm_set_epi32(0, 0, 0, LOG2_P2(w) + LOG2_P2(h)));
-  v_sse_q = _mm_sub_epi64(v_sse_q, v_sum_d);
-
-  return _mm_cvtsi128_si32(v_sse_q);
+  *sse = (unsigned int)sse64;
+  // Compute and return variance
+  return *sse - ((sum64 * sum64) / (w * h));
 }
 
 #define HIGHBD_MASKED_VARWXH(W, H)                                             \
@@ -653,6 +606,11 @@
 HIGHBD_MASKED_VARWXH(32, 64)
 HIGHBD_MASKED_VARWXH(64, 32)
 HIGHBD_MASKED_VARWXH(64, 64)
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASKED_VARWXH(64, 128)
+HIGHBD_MASKED_VARWXH(128, 64)
+HIGHBD_MASKED_VARWXH(128, 128)
+#endif  // CONFIG_EXT_PARTITION
 
 #endif
 
@@ -663,8 +621,8 @@
 typedef __m128i (*filter_fn_t)(__m128i v_a_b, __m128i v_b_b,
                                     __m128i v_filter_b);
 
-static INLINE __m128i apply_filter8(const __m128i v_a_b, const __m128i v_b_b,
-                                    const __m128i v_filter_b) {
+static INLINE __m128i apply_filter_avg(const __m128i v_a_b, const __m128i v_b_b,
+                                       const __m128i v_filter_b) {
   (void) v_filter_b;
   return _mm_avg_epu8(v_a_b, v_b_b);
 }
@@ -735,31 +693,6 @@
   *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_hi_q);
 }
 
-static INLINE int calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q,
-                                       unsigned int* sse,
-                                       const int w, const int h) {
-  int sum;
-
-  // Horizontal sum
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_srli_si128(v_sse_q, 8));
-
-  // Round
-  sum = _mm_cvtsi128_si32(v_sum_d);
-  sum = (sum >= 0) ? ((sum + 31) >> 6) : -((-sum + 31) >> 6);
-
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_set_epi32(0, 0, 0, 2047));
-  v_sse_q = _mm_srli_epi64(v_sse_q, 12);
-
-  // Store the SSE
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
-  // Compute the variance
-  return  *sse - (((int64_t)sum * sum) >> (LOG2_P2(h) + LOG2_P2(w)));
-}
-
-
 // Functions for width (W) >= 16
 unsigned int vpx_masked_subpel_varWxH_xzero(
         const uint8_t *src, int src_stride, int yoffset,
@@ -770,9 +703,9 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   const __m128i v_filter_b = _mm_set1_epi16((
-        vpx_bilinear_filters[yoffset][1] << 8) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(yoffset < 8);
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   for (j = 0; j < w; j += 16) {
     // Load the first row ready
     v_src0_b = _mm_loadu_si128((const __m128i*)(src + j));
@@ -814,9 +747,9 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   const __m128i v_filter_b = _mm_set1_epi16((
-        vpx_bilinear_filters[xoffset][1] << 8) +
-        vpx_bilinear_filters[xoffset][0]);
-  assert(xoffset < 8);
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
   for (i = 0; i < h; i++) {
     for (j = 0; j < w; j += 16) {
       // Load this row and one below & apply the filter to them
@@ -846,13 +779,13 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   const __m128i v_filterx_b = _mm_set1_epi16((
-        vpx_bilinear_filters[xoffset][1] << 8) +
-        vpx_bilinear_filters[xoffset][0]);
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
   const __m128i v_filtery_b = _mm_set1_epi16((
-        vpx_bilinear_filters[yoffset][1] << 8) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(yoffset < 8);
-  assert(xoffset < 8);
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
   for (j = 0; j < w; j += 16) {
     // Load the first row ready
     v_src0_b = _mm_loadu_si128((const __m128i*)(src + j));
@@ -908,9 +841,9 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filter_b = _mm_set1_epi16((
-        vpx_bilinear_filters[yoffset][1] << 8) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(yoffset < 8);
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   // Load the first row of src data ready
   v_src0_b = _mm_loadl_epi64((const __m128i*)src);
   for (i = 0; i < h; i += 4) {
@@ -938,7 +871,7 @@
     v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
     v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
     // Apply the y filter
-    if (yoffset == 8) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
       v_src1_b = _mm_unpacklo_epi64(v_src3_b, v_src1_b);
       v_src2_b = _mm_or_si128(_mm_slli_si128(v_src1_b, 4),
             _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0)));
@@ -974,13 +907,13 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filter_b = _mm_set1_epi16((
-        vpx_bilinear_filters[yoffset][1] << 8) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(yoffset < 8);
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   // Load the first row of src data ready
   v_src0_b = _mm_loadl_epi64((const __m128i*)src);
   for (i = 0; i < h; i += 2) {
-    if (yoffset == 8) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
       // Load the rest of the source data for these rows
       v_src1_b = _mm_or_si128(
             _mm_slli_si128(v_src0_b, 8),
@@ -1030,9 +963,9 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filter_b = _mm_set1_epi16((
-        vpx_bilinear_filters[xoffset][1] << 8) +
-        vpx_bilinear_filters[xoffset][0]);
-  assert(xoffset < 8);
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
   for (i = 0; i < h; i += 4) {
     // Load the src data
     v_src0_b = _mm_loadl_epi64((const __m128i*)src);
@@ -1064,7 +997,7 @@
     v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
     v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
     // Apply the x filter
-    if (xoffset == 8) {
+    if (xoffset == HALF_PIXEL_OFFSET) {
       v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b);
       v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b);
       v_res_b = _mm_avg_epu8(v_src0_b, v_src0_shift_b);
@@ -1093,9 +1026,9 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filter_b = _mm_set1_epi16((
-        vpx_bilinear_filters[xoffset][1] << 8) +
-        vpx_bilinear_filters[xoffset][0]);
-  assert(xoffset < 8);
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
   for (i = 0; i < h; i += 2) {
     // Load the src data
     v_src0_b = _mm_loadu_si128((const __m128i*)(src));
@@ -1103,7 +1036,7 @@
     v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride));
     v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
     // Apply the x filter
-    if (xoffset == 8) {
+    if (xoffset == HALF_PIXEL_OFFSET) {
       v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
       v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
       v_res_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
@@ -1145,13 +1078,13 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filterx_b = _mm_set1_epi16((
-        vpx_bilinear_filters[xoffset][1] << 8) +
-        vpx_bilinear_filters[xoffset][0]);
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
   __m128i v_filtery_b = _mm_set1_epi16((
-        vpx_bilinear_filters[yoffset][1] << 8) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(xoffset < 8);
-  assert(yoffset < 8);
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   for (i = 0; i < h; i += 4) {
     // Load the src data
     v_src0_b = _mm_loadl_epi64((const __m128i*)src);
@@ -1167,7 +1100,7 @@
     v_src3_shift_b = _mm_srli_si128(v_src3_b, 1);
     v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b);
     // Apply the x filter
-    if (xoffset == 8) {
+    if (xoffset == HALF_PIXEL_OFFSET) {
       v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b);
       v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b);
       v_xres_b[i == 0 ? 0 : 1] = _mm_avg_epu8(v_src0_b, v_src0_shift_b);
@@ -1183,7 +1116,7 @@
   v_src0_b = _mm_loadl_epi64((const __m128i*)src);
   v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
   // Apply the x filter
-  if (xoffset == 8) {
+  if (xoffset == HALF_PIXEL_OFFSET) {
     v_extra_row_b = _mm_and_si128(
             _mm_avg_epu8(v_src0_b, v_src0_shift_b),
             _mm_setr_epi32(-1, 0, 0, 0));
@@ -1203,7 +1136,7 @@
                               v_extra_row_b);
     }
     // Apply the y filter
-    if (yoffset == 8) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
       v_res_b = _mm_avg_epu8(v_xres_b[i == 0 ? 0 : 1], v_temp_b);
     } else {
       v_res_b = apply_filter(v_xres_b[i == 0 ? 0 : 1], v_temp_b, v_filtery_b);
@@ -1245,21 +1178,20 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filterx_b = _mm_set1_epi16((
-        vpx_bilinear_filters[xoffset][1] << 8) +
-        vpx_bilinear_filters[xoffset][0]);
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
   __m128i v_filtery_b = _mm_set1_epi16((
-        vpx_bilinear_filters[yoffset][1] << 8) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(xoffset < 8);
-  assert(yoffset < 8);
-
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   // Load the first block of src data
   v_src0_b = _mm_loadu_si128((const __m128i*)(src));
   v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
   v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride));
   v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
   // Apply the x filter
-  if (xoffset == 8) {
+  if (xoffset == HALF_PIXEL_OFFSET) {
     v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
     v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
     v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
@@ -1275,7 +1207,7 @@
     v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 3));
     v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
     // Apply the x filter
-    if (xoffset == 8) {
+    if (xoffset == HALF_PIXEL_OFFSET) {
       v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
       v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
       v_xres1_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
@@ -1287,7 +1219,7 @@
     // Apply the y filter to the previous block
     v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres0_b, 8),
                             _mm_slli_si128(v_xres1_b, 8));
-    if (yoffset == 8) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
       v_res_b = _mm_avg_epu8(v_xres0_b, v_temp_b);
     } else {
       v_res_b = apply_filter(v_xres0_b, v_temp_b, v_filtery_b);
@@ -1309,7 +1241,7 @@
     v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 5));
     v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
     // Apply the x filter
-    if (xoffset == 8) {
+    if (xoffset == HALF_PIXEL_OFFSET) {
       v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
       v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
       v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
@@ -1321,7 +1253,7 @@
     // Apply the y filter to the previous block
     v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres1_b, 8),
                             _mm_slli_si128(v_xres0_b, 8));
-    if (yoffset == 8) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
       v_res_b = _mm_avg_epu8(v_xres1_b, v_temp_b);
     } else {
       v_res_b = apply_filter(v_xres1_b, v_temp_b, v_filtery_b);
@@ -1359,41 +1291,45 @@
       return vpx_masked_variance##W##x##H##_ssse3(src, src_stride,             \
                                                   dst, dst_stride,             \
                                                   msk, msk_stride, sse);       \
-    else if (yoffset == 8)                                                     \
-      return vpx_masked_subpel_varWxH_xzero(src, src_stride, 8,                \
+    else if (yoffset == HALF_PIXEL_OFFSET)                                     \
+      return vpx_masked_subpel_varWxH_xzero(src, src_stride,                   \
+                                            HALF_PIXEL_OFFSET,                 \
                                             dst, dst_stride, msk, msk_stride,  \
-                                            sse, W, H, apply_filter8);         \
+                                            sse, W, H, apply_filter_avg);      \
     else                                                                       \
-      return vpx_masked_subpel_varWxH_xzero(src, src_stride, yoffset,          \
+      return vpx_masked_subpel_varWxH_xzero(src, src_stride,                   \
+                                            yoffset,                           \
                                             dst, dst_stride, msk, msk_stride,  \
                                             sse, W, H, apply_filter);          \
   } else if (yoffset == 0) {                                                   \
-    if (xoffset == 8)                                                          \
-      return vpx_masked_subpel_varWxH_yzero(src, src_stride, 8,                \
+    if (xoffset == HALF_PIXEL_OFFSET)                                          \
+      return vpx_masked_subpel_varWxH_yzero(src, src_stride,                   \
+                                            HALF_PIXEL_OFFSET,                 \
                                             dst, dst_stride, msk, msk_stride,  \
-                                            sse, W, H, apply_filter8);         \
+                                            sse, W, H, apply_filter_avg);      \
     else                                                                       \
-      return vpx_masked_subpel_varWxH_yzero(src, src_stride, xoffset,          \
+      return vpx_masked_subpel_varWxH_yzero(src, src_stride,                   \
+                                            xoffset,                           \
                                             dst, dst_stride, msk, msk_stride,  \
                                             sse, W, H, apply_filter);          \
-  } else if (xoffset == 8) {                                                   \
-    if (yoffset == 8)                                                          \
+  } else if (xoffset == HALF_PIXEL_OFFSET) {                                   \
+    if (yoffset == HALF_PIXEL_OFFSET)                                          \
       return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride,       \
-              8, 8, dst, dst_stride, msk, msk_stride, sse, W, H,               \
-              apply_filter8, apply_filter8);                                   \
+              HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, dst_stride, msk,      \
+              msk_stride, sse, W, H, apply_filter_avg, apply_filter_avg);      \
     else                                                                       \
       return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride,       \
-              8, yoffset, dst, dst_stride, msk, msk_stride, sse, W, H,         \
-              apply_filter8, apply_filter);                                    \
+              HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk,                \
+              msk_stride, sse, W, H, apply_filter_avg, apply_filter);          \
   } else {                                                                     \
-    if (yoffset == 8)                                                          \
+    if (yoffset == HALF_PIXEL_OFFSET)                                          \
       return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride,       \
-              xoffset, 8, dst, dst_stride, msk, msk_stride, sse, W, H,         \
-              apply_filter, apply_filter8);                                    \
+              xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk,                \
+              msk_stride, sse, W, H, apply_filter, apply_filter_avg);          \
     else                                                                       \
       return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride,       \
-              xoffset, yoffset, dst, dst_stride, msk, msk_stride, sse, W, H,   \
-              apply_filter, apply_filter);                                     \
+              xoffset, yoffset, dst, dst_stride, msk,                          \
+              msk_stride, sse, W, H, apply_filter, apply_filter);              \
   }                                                                            \
 }
 
@@ -1437,6 +1373,11 @@
 MASK_SUBPIX_VAR_LARGE(32, 64)
 MASK_SUBPIX_VAR_LARGE(64, 32)
 MASK_SUBPIX_VAR_LARGE(64, 64)
+#if CONFIG_EXT_PARTITION
+MASK_SUBPIX_VAR_LARGE(64, 128)
+MASK_SUBPIX_VAR_LARGE(128, 64)
+MASK_SUBPIX_VAR_LARGE(128, 128)
+#endif  // CONFIG_EXT_PARTITION
 
 #if CONFIG_VP9_HIGHBITDEPTH
 typedef int (*highbd_calc_masked_var_t)(__m128i v_sum_d, __m128i v_sse_q,
@@ -1449,9 +1390,9 @@
 typedef __m128i (*highbd_filter_fn_t)(__m128i v_a_w, __m128i v_b_w,
                                     __m128i v_filter_w);
 
-static INLINE __m128i highbd_apply_filter8(const __m128i v_a_w,
-                                           const __m128i v_b_w,
-                                           const __m128i v_filter_w) {
+static INLINE __m128i highbd_apply_filter_avg(const __m128i v_a_w,
+                                              const __m128i v_b_w,
+                                              const __m128i v_filter_w) {
   (void) v_filter_w;
   return _mm_avg_epu16(v_a_w, v_b_w);
 }
@@ -1523,55 +1464,53 @@
                                                  __m128i v_sse_q,
                                                  unsigned int* sse,
                                                  const int w, const int h) {
-  int sum;
+  int64_t sum64;
+  uint64_t sse64;
 
   // Horizontal sum
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_srli_si128(v_sse_q, 8));
+  sum64 = hsum_epi32_si32(v_sum_d);
+  sse64 = hsum_epi64_si64(v_sse_q);
+
+  sum64 = (sum64 >= 0) ? sum64 : -sum64;
 
   // Round
-  sum = _mm_cvtsi128_si32(v_sum_d);
-  sum = (sum >= 0) ? ((sum + 31) >> 6) : -((-sum + 31) >> 6);
-  sum = ROUND_POWER_OF_TWO(sum, 2);
+  sum64 = ROUND_POWER_OF_TWO(sum64, 6);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 12);
 
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_set_epi32(0, 0, 0, 2047));
-  v_sse_q = _mm_srli_epi64(v_sse_q, 12);
+  // Normalise
+  sum64 = ROUND_POWER_OF_TWO(sum64, 2);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 4);
 
   // Store the SSE
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_set_epi32(0, 0, 0, 0x8));
-  v_sse_q = _mm_srli_epi64(v_sse_q, 4);
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
+  *sse = (unsigned int)sse64;
   // Compute the variance
-  return  *sse - (((int64_t)sum * sum) >> (LOG2_P2(h) + LOG2_P2(w)));
+  return  *sse - ((sum64 * sum64) / (w * h));
 }
 static INLINE int highbd_12_calc_masked_variance(__m128i v_sum_d,
                                                  __m128i v_sse_q,
                                                  unsigned int* sse,
                                                  const int w, const int h) {
-  int sum;
+  int64_t sum64;
+  uint64_t sse64;
 
   // Horizontal sum
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_srli_si128(v_sse_q, 8));
+  sum64 = hsum_epi32_si64(v_sum_d);
+  sse64 = hsum_epi64_si64(v_sse_q);
+
+  sum64 = (sum64 >= 0) ? sum64 : -sum64;
 
   // Round
-  sum = _mm_cvtsi128_si32(v_sum_d);
-  sum = (sum >= 0) ? ((sum + 31) >> 6) : -((-sum + 31) >> 6);
-  sum = ROUND_POWER_OF_TWO(sum, 4);
+  sum64 = ROUND_POWER_OF_TWO(sum64, 6);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 12);
 
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_set_epi32(0, 0, 0, 2047));
-  v_sse_q = _mm_srli_epi64(v_sse_q, 12);
+  // Normalise
+  sum64 = ROUND_POWER_OF_TWO(sum64, 4);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 8);
 
   // Store the SSE
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_set_epi32(0, 0, 0, 0x80));
-  v_sse_q = _mm_srli_epi64(v_sse_q, 8);
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
+  *sse = (unsigned int)sse64;
   // Compute the variance
-  return  *sse - (((int64_t)sum * sum) >> (LOG2_P2(h) + LOG2_P2(w)));
+  return  *sse - ((sum64 * sum64) / (w * h));
 }
 
 
@@ -1586,9 +1525,9 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   const __m128i v_filter_w = _mm_set1_epi32((
-        vpx_bilinear_filters[yoffset][1] << 16) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(yoffset < 8);
+        bilinear_filters_2t[yoffset][1] << 16) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   for (j = 0; j < w; j += 8) {
     // Load the first row ready
     v_src0_w = _mm_loadu_si128((const __m128i*)(src + j));
@@ -1631,9 +1570,9 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   const __m128i v_filter_w = _mm_set1_epi32((
-        vpx_bilinear_filters[xoffset][1] << 16) +
-        vpx_bilinear_filters[xoffset][0]);
-  assert(xoffset < 8);
+        bilinear_filters_2t[xoffset][1] << 16) +
+        bilinear_filters_2t[xoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
   for (i = 0; i < h; i++) {
     for (j = 0; j < w; j += 8) {
       // Load this row & apply the filter to them
@@ -1664,13 +1603,13 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   const __m128i v_filterx_w = _mm_set1_epi32((
-        vpx_bilinear_filters[xoffset][1] << 16) +
-        vpx_bilinear_filters[xoffset][0]);
+        bilinear_filters_2t[xoffset][1] << 16) +
+        bilinear_filters_2t[xoffset][0]);
   const __m128i v_filtery_w = _mm_set1_epi32((
-        vpx_bilinear_filters[yoffset][1] << 16) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(xoffset < 8);
-  assert(yoffset < 8);
+        bilinear_filters_2t[yoffset][1] << 16) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   for (j = 0; j < w; j += 8) {
     // Load the first row ready
     v_src0_w = _mm_loadu_si128((const __m128i*)(src + j));
@@ -1724,13 +1663,13 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filter_w = _mm_set1_epi32((
-        vpx_bilinear_filters[yoffset][1] << 16) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(yoffset < 8);
+        bilinear_filters_2t[yoffset][1] << 16) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   // Load the first row of src data ready
   v_src0_w = _mm_loadl_epi64((const __m128i*)src);
   for (i = 0; i < h; i += 2) {
-    if (yoffset == 8) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
       // Load the rest of the source data for these rows
       v_src1_w = _mm_or_si128(
             _mm_slli_si128(v_src0_w, 8),
@@ -1776,9 +1715,9 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filter_w = _mm_set1_epi32((
-        vpx_bilinear_filters[xoffset][1] << 16) +
-        vpx_bilinear_filters[xoffset][0]);
-  assert(xoffset < 8);
+        bilinear_filters_2t[xoffset][1] << 16) +
+        bilinear_filters_2t[xoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
   for (i = 0; i < h; i += 2) {
     // Load the src data
     v_src0_w = _mm_loadu_si128((const __m128i*)(src));
@@ -1786,7 +1725,7 @@
     v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride));
     v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
     // Apply the x filter
-    if (xoffset == 8) {
+    if (xoffset == HALF_PIXEL_OFFSET) {
       v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
       v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
       v_res_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
@@ -1826,21 +1765,20 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filterx_w = _mm_set1_epi32((
-        vpx_bilinear_filters[xoffset][1] << 16) +
-        vpx_bilinear_filters[xoffset][0]);
+        bilinear_filters_2t[xoffset][1] << 16) +
+        bilinear_filters_2t[xoffset][0]);
   __m128i v_filtery_w = _mm_set1_epi32((
-        vpx_bilinear_filters[yoffset][1] << 16) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(xoffset < 8);
-  assert(yoffset < 8);
-
+        bilinear_filters_2t[yoffset][1] << 16) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   // Load the first block of src data
   v_src0_w = _mm_loadu_si128((const __m128i*)(src));
   v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
   v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride));
   v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
   // Apply the x filter
-  if (xoffset == 8) {
+  if (xoffset == HALF_PIXEL_OFFSET) {
     v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
     v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
     v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
@@ -1858,7 +1796,7 @@
     v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 3));
     v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
     // Apply the x filter
-    if (xoffset == 8) {
+    if (xoffset == HALF_PIXEL_OFFSET) {
       v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
       v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
       v_xres1_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
@@ -1872,7 +1810,7 @@
     // Apply the y filter to the previous block
     v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres0_w, 8),
                             _mm_slli_si128(v_xres1_w, 8));
-    if (yoffset == 8) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
       v_res_w = _mm_avg_epu16(v_xres0_w, v_temp_w);
     } else {
       v_res_w = highbd_apply_filter(v_xres0_w, v_temp_w, v_filtery_w);
@@ -1894,7 +1832,7 @@
     v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 5));
     v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
     // Apply the x filter
-    if (xoffset == 8) {
+    if (xoffset == HALF_PIXEL_OFFSET) {
       v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
       v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
       v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
@@ -1908,7 +1846,7 @@
     // Apply the y filter to the previous block
     v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres1_w, 8),
                             _mm_slli_si128(v_xres0_w, 8));
-    if (yoffset == 8) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
       v_res_w = _mm_avg_epu16(v_xres1_w, v_temp_w);
     } else {
       v_res_w = highbd_apply_filter(v_xres1_w, v_temp_w, v_filtery_w);
@@ -1948,55 +1886,61 @@
     if (yoffset == 0)                                                          \
       return full_variance_function(src8, src_stride, dst8, dst_stride,        \
                                     msk, msk_stride, sse);                     \
-    else if (yoffset == 8)                                                     \
-      return vpx_highbd_masked_subpel_varWxH_xzero(src, src_stride, 8,         \
+    else if (yoffset == HALF_PIXEL_OFFSET)                                     \
+      return vpx_highbd_masked_subpel_varWxH_xzero(src, src_stride,            \
+                                                   HALF_PIXEL_OFFSET,          \
                                                    dst, dst_stride,            \
                                                    msk, msk_stride,            \
                                                    sse, W, H,                  \
-                                                   highbd_apply_filter8,       \
+                                                   highbd_apply_filter_avg,    \
                                                    calc_var);                  \
     else                                                                       \
-      return vpx_highbd_masked_subpel_varWxH_xzero(src, src_stride, yoffset,   \
+      return vpx_highbd_masked_subpel_varWxH_xzero(src, src_stride,            \
+                                                   yoffset,                    \
                                                    dst, dst_stride,            \
                                                    msk, msk_stride,            \
                                                    sse, W, H,                  \
                                                    highbd_apply_filter,        \
                                                    calc_var);                  \
   } else if (yoffset == 0) {                                                   \
-    if (xoffset == 8)                                                          \
-      return vpx_highbd_masked_subpel_varWxH_yzero(src, src_stride, 8,         \
+    if (xoffset == HALF_PIXEL_OFFSET)                                          \
+      return vpx_highbd_masked_subpel_varWxH_yzero(src, src_stride,            \
+                                                   HALF_PIXEL_OFFSET,          \
                                                    dst, dst_stride,            \
                                                    msk, msk_stride,            \
                                                    sse, W, H,                  \
-                                                   highbd_apply_filter8,       \
+                                                   highbd_apply_filter_avg,    \
                                                    calc_var);                  \
     else                                                                       \
-      return vpx_highbd_masked_subpel_varWxH_yzero(src, src_stride, xoffset,   \
+      return vpx_highbd_masked_subpel_varWxH_yzero(src, src_stride,            \
+                                                   xoffset,                    \
                                                    dst, dst_stride,            \
                                                    msk, msk_stride,            \
                                                    sse, W, H,                  \
                                                    highbd_apply_filter,        \
                                                    calc_var);                  \
-  } else if (xoffset == 8) {                                                   \
-    if (yoffset == 8)                                                          \
+  } else if (xoffset == HALF_PIXEL_OFFSET) {                                   \
+    if (yoffset == HALF_PIXEL_OFFSET)                                          \
       return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero(                \
-              src, src_stride, 8, 8, dst, dst_stride, msk, msk_stride,         \
-              sse, W, H, highbd_apply_filter8, highbd_apply_filter8, calc_var);\
+              src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET,           \
+              dst, dst_stride, msk, msk_stride, sse, W, H,                     \
+              highbd_apply_filter_avg, highbd_apply_filter_avg, calc_var);     \
     else                                                                       \
       return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero(                \
-              src, src_stride, 8, yoffset, dst, dst_stride,                    \
-              msk, msk_stride, sse, W, H, highbd_apply_filter8,                \
+              src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride,    \
+              msk, msk_stride, sse, W, H, highbd_apply_filter_avg,             \
               highbd_apply_filter, calc_var);                                  \
   } else {                                                                     \
-    if (yoffset == 8)                                                          \
+    if (yoffset == HALF_PIXEL_OFFSET)                                          \
       return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero(                \
-              src, src_stride, xoffset, 8, dst, dst_stride, msk, msk_stride,   \
-              sse, W, H, highbd_apply_filter, highbd_apply_filter8, calc_var); \
+              src, src_stride, xoffset, HALF_PIXEL_OFFSET,                     \
+              dst, dst_stride, msk, msk_stride, sse, W, H,                     \
+              highbd_apply_filter, highbd_apply_filter_avg, calc_var);         \
     else                                                                       \
       return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero(                \
-              src, src_stride, xoffset, yoffset, dst, dst_stride,              \
-               msk, msk_stride, sse, W, H, highbd_apply_filter,                \
-               highbd_apply_filter, calc_var);                                 \
+              src, src_stride, xoffset, yoffset,                               \
+              dst, dst_stride, msk, msk_stride, sse, W, H,                     \
+              highbd_apply_filter, highbd_apply_filter, calc_var);             \
   }                                                                            \
 }
 
@@ -2093,4 +2037,12 @@
 HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 32)
 HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 64)
 HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 64)
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 128)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 128)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 64)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 64)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 128)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 128)
+#endif  // CONFIG_EXT_PARTITION
 #endif
diff --git a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
index abc0270..6d43fc1 100644
--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -46,6 +46,119 @@
   je .w16
   cmp r4d, 32
   je .w32
+
+%if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  cmp r4d, 64
+  je .w64
+%ifidn %2, highbd
+  cmp r4d, 128
+  je .w128
+
+.w256:
+  mov                    r4d, dword hm
+.loop256:
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+16]
+  movu                    m2, [srcq+32]
+  movu                    m3, [srcq+48]
+%ifidn %1, avg
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq+16]
+  pavg                    m2, [dstq+32]
+  pavg                    m3, [dstq+48]
+%endif
+  mova             [dstq   ], m0
+  mova             [dstq+16], m1
+  mova             [dstq+32], m2
+  mova             [dstq+48], m3
+  movu                    m0, [srcq+64]
+  movu                    m1, [srcq+80]
+  movu                    m2, [srcq+96]
+  movu                    m3, [srcq+112]
+%ifidn %1, avg
+  pavg                    m0, [dstq+64]
+  pavg                    m1, [dstq+80]
+  pavg                    m2, [dstq+96]
+  pavg                    m3, [dstq+112]
+%endif
+  mova             [dstq+64], m0
+  mova             [dstq+80], m1
+  mova             [dstq+96], m2
+  mova            [dstq+112], m3
+  movu                    m0, [srcq+128]
+  movu                    m1, [srcq+128+16]
+  movu                    m2, [srcq+128+32]
+  movu                    m3, [srcq+128+48]
+%ifidn %1, avg
+  pavg                    m0, [dstq+128]
+  pavg                    m1, [dstq+128+16]
+  pavg                    m2, [dstq+128+32]
+  pavg                    m3, [dstq+128+48]
+%endif
+  mova         [dstq+128   ], m0
+  mova         [dstq+128+16], m1
+  mova         [dstq+128+32], m2
+  mova         [dstq+128+48], m3
+  movu                    m0, [srcq+128+64]
+  movu                    m1, [srcq+128+80]
+  movu                    m2, [srcq+128+96]
+  movu                    m3, [srcq+128+112]
+  add                   srcq, src_strideq
+%ifidn %1, avg
+  pavg                    m0, [dstq+128+64]
+  pavg                    m1, [dstq+128+80]
+  pavg                    m2, [dstq+128+96]
+  pavg                    m3, [dstq+128+112]
+%endif
+  mova         [dstq+128+64], m0
+  mova         [dstq+128+80], m1
+  mova         [dstq+128+96], m2
+  mova        [dstq+128+112], m3
+  add                   dstq, dst_strideq
+  sub                    r4d, 1
+  jnz .loop256
+  RET
+%endif
+
+.w128:
+  mov                    r4d, dword hm
+.loop128:
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+16]
+  movu                    m2, [srcq+32]
+  movu                    m3, [srcq+48]
+%ifidn %1, avg
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq+16]
+  pavg                    m2, [dstq+32]
+  pavg                    m3, [dstq+48]
+%endif
+  mova             [dstq   ], m0
+  mova             [dstq+16], m1
+  mova             [dstq+32], m2
+  mova             [dstq+48], m3
+  movu                    m0, [srcq+64]
+  movu                    m1, [srcq+80]
+  movu                    m2, [srcq+96]
+  movu                    m3, [srcq+112]
+  add                   srcq, src_strideq
+%ifidn %1, avg
+  pavg                    m0, [dstq+64]
+  pavg                    m1, [dstq+80]
+  pavg                    m2, [dstq+96]
+  pavg                    m3, [dstq+112]
+%endif
+  mova             [dstq+64], m0
+  mova             [dstq+80], m1
+  mova             [dstq+96], m2
+  mova            [dstq+112], m3
+  add                   dstq, dst_strideq
+  sub                    r4d, 1
+  jnz .loop128
+  RET
+
+%else  ; CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 %ifidn %2, highbd
   cmp r4d, 64
   je .w64
@@ -82,10 +195,11 @@
   mova             [dstq+96], m2
   mova            [dstq+112], m3
   add                   dstq, dst_strideq
-  dec                    r4d
+  sub                    r4d, 1
   jnz .loop128
   RET
 %endif
+%endif  ; CONFIG_VP10 && CONFIG_EXT_PARTITION
 
 .w64
   mov                    r4d, dword hm
@@ -106,7 +220,7 @@
   mova             [dstq+32], m2
   mova             [dstq+48], m3
   add                   dstq, dst_strideq
-  dec                    r4d
+  sub                    r4d, 1
   jnz .loop64
   RET
 
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 6fd5208..6c59918 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -844,34 +844,49 @@
   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
   // --Require an additional 8 rows for the horiz_w8 transpose tail.
-  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+  DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_CU_SIZE]);
   const int intermediate_height =
       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
-  assert(w <= 64);
-  assert(h <= 64);
+  assert(w <= MAX_CU_SIZE);
+  assert(h <= MAX_CU_SIZE);
   assert(y_step_q4 <= 32);
   assert(x_step_q4 <= 32);
 
   if (w >= 8) {
     scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
+                            src_stride,
+                            temp,
+                            MAX_CU_SIZE,
+                            x_filters, x0_q4, x_step_q4,
                             w, intermediate_height);
   } else {
     scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
+                            src_stride,
+                            temp,
+                            MAX_CU_SIZE,
+                            x_filters, x0_q4, x_step_q4,
                             w, intermediate_height);
   }
 
   if (w >= 16) {
-    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+    scaledconvolve_vert_w16(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1),
+                            MAX_CU_SIZE,
+                            dst,
+                            dst_stride,
+                            y_filters, y0_q4, y_step_q4, w, h);
   } else if (w == 8) {
-    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+    scaledconvolve_vert_w8(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1),
+                           MAX_CU_SIZE,
+                           dst,
+                           dst_stride,
+                           y_filters, y0_q4, y_step_q4, w, h);
   } else {
-    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+    scaledconvolve_vert_w4(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1),
+                           MAX_CU_SIZE,
+                           dst,
+                           dst_stride,
+                           y_filters, y0_q4, y_step_q4, w, h);
   }
 }