Add 128-pixel variance and SAD functions

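These cover the 128x128, 128x64 and 64x128 block sizes used by
CONFIG_EXT_PARTITION. The tests compare each SSSE3 implementation against
its C reference over buffers sized by MAX_CU_SIZE (now 128); in
variance_test.cc the first two tuple arguments are log2(width) and
log2(height), so (7, 7) selects the new 128x128 kernels.

As a rough sketch of the operation the masked SAD tests exercise (the names
and rounding below are illustrative assumptions, not the library's
definition; the authoritative behaviour is the vpx_masked_sad*_c reference
the tests compare against), each absolute difference is weighted by a mask
value in the range [0, 64]:

  #include <stdint.h>
  #include <stdlib.h>

  /* Hypothetical illustration only. The [0, 64] mask range and the
   * round-to-nearest shift are assumptions based on the test inputs
   * (rnd(65) mask values and the constant 64 in the extreme-value tests). */
  static unsigned int masked_sad_sketch(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride,
                                        const uint8_t *msk, int msk_stride,
                                        int width, int height) {
    unsigned int sad = 0;
    for (int r = 0; r < height; ++r) {
      for (int c = 0; c < width; ++c) {
        /* Weight each absolute difference by the per-pixel mask. */
        sad += (msk[c] * abs(src[c] - ref[c]) + 32) >> 6;
      }
      src += src_stride;
      ref += ref_stride;
      msk += msk_stride;
    }
    return sad;
  }
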
Change-Id: I8fde245b32c9e586683a28aa6925da0b83850b39
diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc
index c09104c..d7c6fce 100644
--- a/test/masked_sad_test.cc
+++ b/test/masked_sad_test.cc
@@ -22,6 +22,8 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 
+#define MAX_CU_SIZE 128
+
 using libvpx_test::ACMRandom;
 
 namespace {
@@ -50,16 +52,16 @@
 TEST_P(MaskedSADTest, OperationCheck) {
   unsigned int ref_ret, ret;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint8_t,  src_ptr[4096]);
-  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[4096]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[4096]);
+  DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = 64;
-  int ref_stride = 64;
-  int msk_stride = 64;
+  int src_stride = MAX_CU_SIZE;
+  int ref_stride = MAX_CU_SIZE;
+  int msk_stride = MAX_CU_SIZE;
   for (int i = 0; i < number_of_iterations; ++i) {
-    for (int j = 0; j < 4096; j++) {
+    for (int j = 0; j < MAX_CU_SIZE*MAX_CU_SIZE; j++) {
       src_ptr[j] = rnd.Rand8();
       ref_ptr[j] = rnd.Rand8();
       msk_ptr[j] = ((rnd.Rand8()&0x7f) > 64) ? rnd.Rand8()&0x3f : 64;
@@ -108,18 +110,18 @@
 TEST_P(HighbdMaskedSADTest, OperationCheck) {
   unsigned int ref_ret, ret;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint16_t,  src_ptr[4096]);
-  DECLARE_ALIGNED(16, uint16_t,  ref_ptr[4096]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[4096]);
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
   uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
   uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = 64;
-  int ref_stride = 64;
-  int msk_stride = 64;
+  int src_stride = MAX_CU_SIZE;
+  int ref_stride = MAX_CU_SIZE;
+  int msk_stride = MAX_CU_SIZE;
   for (int i = 0; i < number_of_iterations; ++i) {
-    for (int j = 0; j < 4096; j++) {
+    for (int j = 0; j < MAX_CU_SIZE*MAX_CU_SIZE; j++) {
       src_ptr[j] = rnd.Rand16()&0xfff;
       ref_ptr[j] = rnd.Rand16()&0xfff;
       msk_ptr[j] = ((rnd.Rand8()&0x7f) > 64) ? rnd.Rand8()&0x3f : 64;
@@ -148,6 +150,14 @@
 INSTANTIATE_TEST_CASE_P(
   SSSE3_C_COMPARE, MaskedSADTest,
   ::testing::Values(
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_masked_sad128x128_ssse3,
+               &vpx_masked_sad128x128_c),
+    make_tuple(&vpx_masked_sad128x64_ssse3,
+               &vpx_masked_sad128x64_c),
+    make_tuple(&vpx_masked_sad64x128_ssse3,
+               &vpx_masked_sad64x128_c),
+#endif  // CONFIG_EXT_PARTITION
     make_tuple(&vpx_masked_sad64x64_ssse3,
                &vpx_masked_sad64x64_c),
     make_tuple(&vpx_masked_sad64x32_ssse3,
@@ -178,32 +188,40 @@
 INSTANTIATE_TEST_CASE_P(
   SSSE3_C_COMPARE, HighbdMaskedSADTest,
   ::testing::Values(
-    make_tuple(&vp9_highbd_masked_sad64x64_ssse3,
-               &vp9_highbd_masked_sad64x64_c),
-    make_tuple(&vp9_highbd_masked_sad64x32_ssse3,
-               &vp9_highbd_masked_sad64x32_c),
-    make_tuple(&vp9_highbd_masked_sad32x64_ssse3,
-               &vp9_highbd_masked_sad32x64_c),
-    make_tuple(&vp9_highbd_masked_sad32x32_ssse3,
-               &vp9_highbd_masked_sad32x32_c),
-    make_tuple(&vp9_highbd_masked_sad32x16_ssse3,
-               &vp9_highbd_masked_sad32x16_c),
-    make_tuple(&vp9_highbd_masked_sad16x32_ssse3,
-               &vp9_highbd_masked_sad16x32_c),
-    make_tuple(&vp9_highbd_masked_sad16x16_ssse3,
-               &vp9_highbd_masked_sad16x16_c),
-    make_tuple(&vp9_highbd_masked_sad16x8_ssse3,
-               &vp9_highbd_masked_sad16x8_c),
-    make_tuple(&vp9_highbd_masked_sad8x16_ssse3,
-               &vp9_highbd_masked_sad8x16_c),
-    make_tuple(&vp9_highbd_masked_sad8x8_ssse3,
-               &vp9_highbd_masked_sad8x8_c),
-    make_tuple(&vp9_highbd_masked_sad8x4_ssse3,
-               &vp9_highbd_masked_sad8x4_c),
-    make_tuple(&vp9_highbd_masked_sad4x8_ssse3,
-               &vp9_highbd_masked_sad4x8_c),
-    make_tuple(&vp9_highbd_masked_sad4x4_ssse3,
-               &vp9_highbd_masked_sad4x4_c)));
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_sad128x128_ssse3,
+               &vpx_highbd_masked_sad128x128_c),
+    make_tuple(&vpx_highbd_masked_sad128x64_ssse3,
+               &vpx_highbd_masked_sad128x64_c),
+    make_tuple(&vpx_highbd_masked_sad64x128_ssse3,
+               &vpx_highbd_masked_sad64x128_c),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_sad64x64_ssse3,
+               &vpx_highbd_masked_sad64x64_c),
+    make_tuple(&vpx_highbd_masked_sad64x32_ssse3,
+               &vpx_highbd_masked_sad64x32_c),
+    make_tuple(&vpx_highbd_masked_sad32x64_ssse3,
+               &vpx_highbd_masked_sad32x64_c),
+    make_tuple(&vpx_highbd_masked_sad32x32_ssse3,
+               &vpx_highbd_masked_sad32x32_c),
+    make_tuple(&vpx_highbd_masked_sad32x16_ssse3,
+               &vpx_highbd_masked_sad32x16_c),
+    make_tuple(&vpx_highbd_masked_sad16x32_ssse3,
+               &vpx_highbd_masked_sad16x32_c),
+    make_tuple(&vpx_highbd_masked_sad16x16_ssse3,
+               &vpx_highbd_masked_sad16x16_c),
+    make_tuple(&vpx_highbd_masked_sad16x8_ssse3,
+               &vpx_highbd_masked_sad16x8_c),
+    make_tuple(&vpx_highbd_masked_sad8x16_ssse3,
+               &vpx_highbd_masked_sad8x16_c),
+    make_tuple(&vpx_highbd_masked_sad8x8_ssse3,
+               &vpx_highbd_masked_sad8x8_c),
+    make_tuple(&vpx_highbd_masked_sad8x4_ssse3,
+               &vpx_highbd_masked_sad8x4_c),
+    make_tuple(&vpx_highbd_masked_sad4x8_ssse3,
+               &vpx_highbd_masked_sad4x8_c),
+    make_tuple(&vpx_highbd_masked_sad4x4_ssse3,
+               &vpx_highbd_masked_sad4x4_c)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSSE3
 }  // namespace
diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc
index fc37759..c312899 100644
--- a/test/masked_variance_test.cc
+++ b/test/masked_variance_test.cc
@@ -20,10 +20,12 @@
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_filter.h"
+#include "vpx_mem/vpx_mem.h"
 
-#define MAX_SIZE 64
+#define MAX_CU_SIZE 128
 
 using libvpx_test::ACMRandom;
 
@@ -58,17 +60,17 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint8_t,  src_ptr[MAX_SIZE*MAX_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[MAX_SIZE*MAX_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_SIZE*MAX_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = MAX_SIZE;
-  int ref_stride = MAX_SIZE;
-  int msk_stride = MAX_SIZE;
+  int src_stride = MAX_CU_SIZE;
+  int ref_stride = MAX_CU_SIZE;
+  int msk_stride = MAX_CU_SIZE;
 
   for (int i = 0; i < number_of_iterations; ++i) {
-    for (int j = 0; j < MAX_SIZE*MAX_SIZE; j++) {
+    for (int j = 0; j < MAX_CU_SIZE*MAX_CU_SIZE; j++) {
       src_ptr[j] = rnd.Rand8();
       ref_ptr[j] = rnd.Rand8();
       msk_ptr[j] = rnd(65);
@@ -100,19 +102,19 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint8_t,  src_ptr[MAX_SIZE*MAX_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[MAX_SIZE*MAX_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_SIZE*MAX_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = MAX_SIZE;
-  int ref_stride = MAX_SIZE;
-  int msk_stride = MAX_SIZE;
+  int src_stride = MAX_CU_SIZE;
+  int ref_stride = MAX_CU_SIZE;
+  int msk_stride = MAX_CU_SIZE;
 
   for (int i = 0; i < 8; ++i) {
-    memset(src_ptr, (i & 0x1) ? 255 : 0, MAX_SIZE*MAX_SIZE);
-    memset(ref_ptr, (i & 0x2) ? 255 : 0, MAX_SIZE*MAX_SIZE);
-    memset(msk_ptr, (i & 0x4) ?  64 : 0, MAX_SIZE*MAX_SIZE);
+    memset(src_ptr, (i & 0x1) ? 255 : 0, MAX_CU_SIZE*MAX_CU_SIZE);
+    memset(ref_ptr, (i & 0x2) ? 255 : 0, MAX_CU_SIZE*MAX_CU_SIZE);
+    memset(msk_ptr, (i & 0x4) ?  64 : 0, MAX_CU_SIZE*MAX_CU_SIZE);
 
     ref_ret = ref_func_(src_ptr, src_stride,
                         ref_ptr, ref_stride,
@@ -166,21 +168,21 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint8_t,  src_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
-  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  src_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = (MAX_SIZE+1);
-  int ref_stride = (MAX_SIZE+1);
-  int msk_stride = (MAX_SIZE+1);
+  int src_stride = (MAX_CU_SIZE+1);
+  int ref_stride = (MAX_CU_SIZE+1);
+  int msk_stride = (MAX_CU_SIZE+1);
   int xoffset;
   int yoffset;
 
   for (int i = 0; i < number_of_iterations; ++i) {
     int xoffsets[] = {0, 4, rnd(BIL_SUBPEL_SHIFTS)};
     int yoffsets[] = {0, 4, rnd(BIL_SUBPEL_SHIFTS)};
-    for (int j = 0; j < (MAX_SIZE+1)*(MAX_SIZE+1); j++) {
+    for (int j = 0; j < (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1); j++) {
       src_ptr[j] = rnd.Rand8();
       ref_ptr[j] = rnd.Rand8();
       msk_ptr[j] = rnd(65);
@@ -221,23 +223,23 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint8_t,  src_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
-  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  src_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
   int first_failure_x = -1;
   int first_failure_y = -1;
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = (MAX_SIZE+1);
-  int ref_stride = (MAX_SIZE+1);
-  int msk_stride = (MAX_SIZE+1);
+  int src_stride = (MAX_CU_SIZE+1);
+  int ref_stride = (MAX_CU_SIZE+1);
+  int msk_stride = (MAX_CU_SIZE+1);
 
   for (int xoffset = 0 ; xoffset < BIL_SUBPEL_SHIFTS ; xoffset++) {
     for (int yoffset = 0 ; yoffset < BIL_SUBPEL_SHIFTS ; yoffset++) {
       for (int i = 0; i < 8; ++i) {
-        memset(src_ptr, (i & 0x1) ? 255 : 0, (MAX_SIZE+1)*(MAX_SIZE+1));
-        memset(ref_ptr, (i & 0x2) ? 255 : 0, (MAX_SIZE+1)*(MAX_SIZE+1));
-        memset(msk_ptr, (i & 0x4) ?  64 : 0, (MAX_SIZE+1)*(MAX_SIZE+1));
+        memset(src_ptr, (i & 0x1) ? 255 : 0, (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1));
+        memset(ref_ptr, (i & 0x2) ? 255 : 0, (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1));
+        memset(msk_ptr, (i & 0x4) ?  64 : 0, (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1));
 
         ref_ret = ref_func_(src_ptr, src_stride,
                             xoffset, yoffset,
@@ -297,19 +299,19 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SIZE*MAX_SIZE]);
-  DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SIZE*MAX_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_SIZE*MAX_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
   uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
   uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = MAX_SIZE;
-  int ref_stride = MAX_SIZE;
-  int msk_stride = MAX_SIZE;
+  int src_stride = MAX_CU_SIZE;
+  int ref_stride = MAX_CU_SIZE;
+  int msk_stride = MAX_CU_SIZE;
 
   for (int i = 0; i < number_of_iterations; ++i) {
-    for (int j = 0; j < MAX_SIZE*MAX_SIZE; j++) {
+    for (int j = 0; j < MAX_CU_SIZE*MAX_CU_SIZE; j++) {
       src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
       ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
       msk_ptr[j] = rnd(65);
@@ -341,23 +343,23 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SIZE*MAX_SIZE]);
-  DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SIZE*MAX_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_SIZE*MAX_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
   uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
   uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = MAX_SIZE;
-  int ref_stride = MAX_SIZE;
-  int msk_stride = MAX_SIZE;
+  int src_stride = MAX_CU_SIZE;
+  int ref_stride = MAX_CU_SIZE;
+  int msk_stride = MAX_CU_SIZE;
 
   for (int i = 0; i < 8; ++i) {
     vpx_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
-                 MAX_SIZE*MAX_SIZE);
+                 MAX_CU_SIZE*MAX_CU_SIZE);
     vpx_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
-                 MAX_SIZE*MAX_SIZE);
-    memset(msk_ptr, (i & 0x4) ?  64 : 0, MAX_SIZE*MAX_SIZE);
+                 MAX_CU_SIZE*MAX_CU_SIZE);
+    memset(msk_ptr, (i & 0x4) ?  64 : 0, MAX_CU_SIZE*MAX_CU_SIZE);
 
     ref_ret = ref_func_(src8_ptr, src_stride,
                         ref8_ptr, ref_stride,
@@ -407,24 +409,24 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
-  DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
   uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
   uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
   int err_count = 0;
   int first_failure = -1;
   int first_failure_x = -1;
   int first_failure_y = -1;
-  int src_stride = (MAX_SIZE+1);
-  int ref_stride = (MAX_SIZE+1);
-  int msk_stride = (MAX_SIZE+1);
+  int src_stride = (MAX_CU_SIZE+1);
+  int ref_stride = (MAX_CU_SIZE+1);
+  int msk_stride = (MAX_CU_SIZE+1);
   int xoffset, yoffset;
 
   for (int i = 0; i < number_of_iterations; ++i) {
     for (xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
       for (yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
-        for (int j = 0; j < (MAX_SIZE+1)*(MAX_SIZE+1); j++) {
+        for (int j = 0; j < (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1); j++) {
           src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
           ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
           msk_ptr[j] = rnd(65);
@@ -465,27 +467,27 @@
   unsigned int ref_ret, opt_ret;
   unsigned int ref_sse, opt_sse;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
-  DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
-  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_SIZE+1)*(MAX_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]);
   uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
   uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
   int first_failure_x = -1;
   int first_failure_y = -1;
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = (MAX_SIZE+1);
-  int ref_stride = (MAX_SIZE+1);
-  int msk_stride = (MAX_SIZE+1);
+  int src_stride = (MAX_CU_SIZE+1);
+  int ref_stride = (MAX_CU_SIZE+1);
+  int msk_stride = (MAX_CU_SIZE+1);
 
   for (int xoffset = 0 ; xoffset < BIL_SUBPEL_SHIFTS ; xoffset++) {
     for (int yoffset = 0 ; yoffset < BIL_SUBPEL_SHIFTS ; yoffset++) {
       for (int i = 0; i < 8; ++i) {
         vpx_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
-                     (MAX_SIZE+1)*(MAX_SIZE+1));
+                     (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1));
         vpx_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
-                     (MAX_SIZE+1)*(MAX_SIZE+1));
-        memset(msk_ptr, (i & 0x4) ?   64 : 0, (MAX_SIZE+1)*(MAX_SIZE+1));
+                     (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1));
+        memset(msk_ptr, (i & 0x4) ?   64 : 0, (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1));
 
         ref_ret = ref_func_(src8_ptr, src_stride,
                             xoffset, yoffset,
@@ -525,6 +527,14 @@
 INSTANTIATE_TEST_CASE_P(
   SSSE3_C_COMPARE, MaskedVarianceTest,
   ::testing::Values(
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_masked_variance128x128_ssse3,
+               &vpx_masked_variance128x128_c),
+    make_tuple(&vpx_masked_variance128x64_ssse3,
+               &vpx_masked_variance128x64_c),
+    make_tuple(&vpx_masked_variance64x128_ssse3,
+               &vpx_masked_variance64x128_c),
+#endif  // CONFIG_EXT_PARTITION
     make_tuple(&vpx_masked_variance64x64_ssse3,
                &vpx_masked_variance64x64_c),
     make_tuple(&vpx_masked_variance64x32_ssse3,
@@ -555,197 +565,253 @@
 INSTANTIATE_TEST_CASE_P(
   SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
   ::testing::Values(
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_masked_sub_pixel_variance128x128_ssse3,
+               &vpx_masked_sub_pixel_variance128x128_c),
+    make_tuple(&vpx_masked_sub_pixel_variance128x64_ssse3,
+               &vpx_masked_sub_pixel_variance128x64_c),
+    make_tuple(&vpx_masked_sub_pixel_variance64x128_ssse3,
+               &vpx_masked_sub_pixel_variance64x128_c),
+#endif  // CONFIG_EXT_PARTITION
     make_tuple(&vpx_masked_sub_pixel_variance64x64_ssse3,
-              &vpx_masked_sub_pixel_variance64x64_c),
+               &vpx_masked_sub_pixel_variance64x64_c),
     make_tuple(&vpx_masked_sub_pixel_variance64x32_ssse3,
-              &vpx_masked_sub_pixel_variance64x32_c),
+               &vpx_masked_sub_pixel_variance64x32_c),
     make_tuple(&vpx_masked_sub_pixel_variance32x64_ssse3,
-              &vpx_masked_sub_pixel_variance32x64_c),
+               &vpx_masked_sub_pixel_variance32x64_c),
     make_tuple(&vpx_masked_sub_pixel_variance32x32_ssse3,
-              &vpx_masked_sub_pixel_variance32x32_c),
+               &vpx_masked_sub_pixel_variance32x32_c),
     make_tuple(&vpx_masked_sub_pixel_variance32x16_ssse3,
-              &vpx_masked_sub_pixel_variance32x16_c),
+               &vpx_masked_sub_pixel_variance32x16_c),
     make_tuple(&vpx_masked_sub_pixel_variance16x32_ssse3,
-              &vpx_masked_sub_pixel_variance16x32_c),
+               &vpx_masked_sub_pixel_variance16x32_c),
     make_tuple(&vpx_masked_sub_pixel_variance16x16_ssse3,
-              &vpx_masked_sub_pixel_variance16x16_c),
+               &vpx_masked_sub_pixel_variance16x16_c),
     make_tuple(&vpx_masked_sub_pixel_variance16x8_ssse3,
-              &vpx_masked_sub_pixel_variance16x8_c),
+               &vpx_masked_sub_pixel_variance16x8_c),
     make_tuple(&vpx_masked_sub_pixel_variance8x16_ssse3,
-              &vpx_masked_sub_pixel_variance8x16_c),
+               &vpx_masked_sub_pixel_variance8x16_c),
     make_tuple(&vpx_masked_sub_pixel_variance8x8_ssse3,
-              &vpx_masked_sub_pixel_variance8x8_c),
+               &vpx_masked_sub_pixel_variance8x8_c),
     make_tuple(&vpx_masked_sub_pixel_variance8x4_ssse3,
-              &vpx_masked_sub_pixel_variance8x4_c),
+               &vpx_masked_sub_pixel_variance8x4_c),
     make_tuple(&vpx_masked_sub_pixel_variance4x8_ssse3,
-              &vpx_masked_sub_pixel_variance4x8_c),
+               &vpx_masked_sub_pixel_variance4x8_c),
     make_tuple(&vpx_masked_sub_pixel_variance4x4_ssse3,
-              &vpx_masked_sub_pixel_variance4x4_c)));
+               &vpx_masked_sub_pixel_variance4x4_c)));
 
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
   SSSE3_C_COMPARE, HighbdMaskedVarianceTest,
   ::testing::Values(
-    make_tuple(&vp9_highbd_masked_variance64x64_ssse3,
-               &vp9_highbd_masked_variance64x64_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance64x32_ssse3,
-               &vp9_highbd_masked_variance64x32_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance32x64_ssse3,
-               &vp9_highbd_masked_variance32x64_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance32x32_ssse3,
-               &vp9_highbd_masked_variance32x32_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance32x16_ssse3,
-               &vp9_highbd_masked_variance32x16_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance16x32_ssse3,
-               &vp9_highbd_masked_variance16x32_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance16x16_ssse3,
-               &vp9_highbd_masked_variance16x16_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance16x8_ssse3,
-               &vp9_highbd_masked_variance16x8_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance8x16_ssse3,
-               &vp9_highbd_masked_variance8x16_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance8x8_ssse3,
-               &vp9_highbd_masked_variance8x8_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance8x4_ssse3,
-               &vp9_highbd_masked_variance8x4_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance4x8_ssse3,
-               &vp9_highbd_masked_variance4x8_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_variance4x4_ssse3,
-               &vp9_highbd_masked_variance4x4_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_10_masked_variance64x64_ssse3,
-               &vp9_highbd_10_masked_variance64x64_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance64x32_ssse3,
-               &vp9_highbd_10_masked_variance64x32_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance32x64_ssse3,
-               &vp9_highbd_10_masked_variance32x64_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance32x32_ssse3,
-               &vp9_highbd_10_masked_variance32x32_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance32x16_ssse3,
-               &vp9_highbd_10_masked_variance32x16_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance16x32_ssse3,
-               &vp9_highbd_10_masked_variance16x32_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance16x16_ssse3,
-               &vp9_highbd_10_masked_variance16x16_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance16x8_ssse3,
-               &vp9_highbd_10_masked_variance16x8_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance8x16_ssse3,
-               &vp9_highbd_10_masked_variance8x16_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance8x8_ssse3,
-               &vp9_highbd_10_masked_variance8x8_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance8x4_ssse3,
-               &vp9_highbd_10_masked_variance8x4_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance4x8_ssse3,
-               &vp9_highbd_10_masked_variance4x8_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_variance4x4_ssse3,
-               &vp9_highbd_10_masked_variance4x4_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_12_masked_variance64x64_ssse3,
-               &vp9_highbd_12_masked_variance64x64_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance64x32_ssse3,
-               &vp9_highbd_12_masked_variance64x32_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance32x64_ssse3,
-               &vp9_highbd_12_masked_variance32x64_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance32x32_ssse3,
-               &vp9_highbd_12_masked_variance32x32_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance32x16_ssse3,
-               &vp9_highbd_12_masked_variance32x16_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance16x32_ssse3,
-               &vp9_highbd_12_masked_variance16x32_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance16x16_ssse3,
-               &vp9_highbd_12_masked_variance16x16_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance16x8_ssse3,
-               &vp9_highbd_12_masked_variance16x8_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance8x16_ssse3,
-               &vp9_highbd_12_masked_variance8x16_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance8x8_ssse3,
-               &vp9_highbd_12_masked_variance8x8_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance8x4_ssse3,
-               &vp9_highbd_12_masked_variance8x4_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance4x8_ssse3,
-               &vp9_highbd_12_masked_variance4x8_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_variance4x4_ssse3,
-               &vp9_highbd_12_masked_variance4x4_c, VPX_BITS_12)));
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_variance128x128_ssse3,
+               &vpx_highbd_masked_variance128x128_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance128x64_ssse3,
+               &vpx_highbd_masked_variance128x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance64x128_ssse3,
+               &vpx_highbd_masked_variance64x128_c, VPX_BITS_8),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_variance64x64_ssse3,
+               &vpx_highbd_masked_variance64x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance64x32_ssse3,
+               &vpx_highbd_masked_variance64x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance32x64_ssse3,
+               &vpx_highbd_masked_variance32x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance32x32_ssse3,
+               &vpx_highbd_masked_variance32x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance32x16_ssse3,
+               &vpx_highbd_masked_variance32x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance16x32_ssse3,
+               &vpx_highbd_masked_variance16x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance16x16_ssse3,
+               &vpx_highbd_masked_variance16x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance16x8_ssse3,
+               &vpx_highbd_masked_variance16x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance8x16_ssse3,
+               &vpx_highbd_masked_variance8x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance8x8_ssse3,
+               &vpx_highbd_masked_variance8x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance8x4_ssse3,
+               &vpx_highbd_masked_variance8x4_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance4x8_ssse3,
+               &vpx_highbd_masked_variance4x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance4x4_ssse3,
+               &vpx_highbd_masked_variance4x4_c, VPX_BITS_8),
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_10_masked_variance128x128_ssse3,
+               &vpx_highbd_10_masked_variance128x128_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance128x64_ssse3,
+               &vpx_highbd_10_masked_variance128x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance64x128_ssse3,
+               &vpx_highbd_10_masked_variance64x128_c, VPX_BITS_10),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_10_masked_variance64x64_ssse3,
+               &vpx_highbd_10_masked_variance64x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance64x32_ssse3,
+               &vpx_highbd_10_masked_variance64x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance32x64_ssse3,
+               &vpx_highbd_10_masked_variance32x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance32x32_ssse3,
+               &vpx_highbd_10_masked_variance32x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance32x16_ssse3,
+               &vpx_highbd_10_masked_variance32x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance16x32_ssse3,
+               &vpx_highbd_10_masked_variance16x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance16x16_ssse3,
+               &vpx_highbd_10_masked_variance16x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance16x8_ssse3,
+               &vpx_highbd_10_masked_variance16x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance8x16_ssse3,
+               &vpx_highbd_10_masked_variance8x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance8x8_ssse3,
+               &vpx_highbd_10_masked_variance8x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance8x4_ssse3,
+               &vpx_highbd_10_masked_variance8x4_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance4x8_ssse3,
+               &vpx_highbd_10_masked_variance4x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance4x4_ssse3,
+               &vpx_highbd_10_masked_variance4x4_c, VPX_BITS_10),
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_12_masked_variance128x128_ssse3,
+               &vpx_highbd_12_masked_variance128x128_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance128x64_ssse3,
+               &vpx_highbd_12_masked_variance128x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance64x128_ssse3,
+               &vpx_highbd_12_masked_variance64x128_c, VPX_BITS_12),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_12_masked_variance64x64_ssse3,
+               &vpx_highbd_12_masked_variance64x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance64x32_ssse3,
+               &vpx_highbd_12_masked_variance64x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance32x64_ssse3,
+               &vpx_highbd_12_masked_variance32x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance32x32_ssse3,
+               &vpx_highbd_12_masked_variance32x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance32x16_ssse3,
+               &vpx_highbd_12_masked_variance32x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance16x32_ssse3,
+               &vpx_highbd_12_masked_variance16x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance16x16_ssse3,
+               &vpx_highbd_12_masked_variance16x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance16x8_ssse3,
+               &vpx_highbd_12_masked_variance16x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance8x16_ssse3,
+               &vpx_highbd_12_masked_variance8x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance8x8_ssse3,
+               &vpx_highbd_12_masked_variance8x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance8x4_ssse3,
+               &vpx_highbd_12_masked_variance8x4_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance4x8_ssse3,
+               &vpx_highbd_12_masked_variance4x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance4x4_ssse3,
+               &vpx_highbd_12_masked_variance4x4_c, VPX_BITS_12)));
 
 INSTANTIATE_TEST_CASE_P(
   SSSE3_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
   ::testing::Values(
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance64x64_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance64x64_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance64x32_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance64x32_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance32x64_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance32x64_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance32x32_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance32x32_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance32x16_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance32x16_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance16x32_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance16x32_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance16x16_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance16x16_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance16x8_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance16x8_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance8x16_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance8x16_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance8x8_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance8x8_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance8x4_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance8x4_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance4x8_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance4x8_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_masked_sub_pixel_variance4x4_ssse3,
-               &vp9_highbd_masked_sub_pixel_variance4x4_c, VPX_BITS_8),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance64x64_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance64x64_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance64x32_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance64x32_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance32x64_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance32x64_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance32x32_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance32x32_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance32x16_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance32x16_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance16x32_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance16x32_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance16x16_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance16x16_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance16x8_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance16x8_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance8x16_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance8x16_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance8x8_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance8x8_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance8x4_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance8x4_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance4x8_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance4x8_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_10_masked_sub_pixel_variance4x4_ssse3,
-               &vp9_highbd_10_masked_sub_pixel_variance4x4_c, VPX_BITS_10),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance64x64_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance64x64_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance64x32_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance64x32_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance32x64_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance32x64_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance32x32_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance32x32_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance32x16_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance32x16_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance16x32_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance16x32_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance16x16_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance16x16_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance16x8_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance16x8_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance8x16_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance8x16_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance8x8_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance8x8_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance8x4_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance8x4_c, VPX_BITS_12) ,
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance4x8_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance4x8_c, VPX_BITS_12),
-    make_tuple(&vp9_highbd_12_masked_sub_pixel_variance4x4_ssse3,
-               &vp9_highbd_12_masked_sub_pixel_variance4x4_c, VPX_BITS_12)));
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance128x128_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance128x128_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance128x64_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance128x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance64x128_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance64x128_c, VPX_BITS_8),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance64x64_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance64x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance64x32_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance64x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance32x64_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance32x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance32x32_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance32x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance32x16_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance32x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance16x32_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance16x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance16x16_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance16x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance16x8_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance16x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance8x16_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance8x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance8x8_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance8x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance8x4_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance8x4_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance4x8_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance4x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance4x4_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance4x4_c, VPX_BITS_8),
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance128x128_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance128x128_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance128x64_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance128x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance64x128_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance64x128_c, VPX_BITS_10),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance64x64_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance64x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance64x32_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance64x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance32x64_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance32x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance32x32_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance32x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance32x16_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance32x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance16x32_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance16x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance16x16_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance16x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance16x8_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance16x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance8x16_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance8x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance8x8_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance8x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance8x4_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance8x4_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance4x8_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance4x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance4x4_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance4x4_c, VPX_BITS_10),
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance128x128_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance128x128_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance128x64_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance128x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance64x128_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance64x128_c, VPX_BITS_12),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance64x64_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance64x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance64x32_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance64x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance32x64_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance32x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance32x32_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance32x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance32x16_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance32x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance16x32_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance16x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance16x16_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance16x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance16x8_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance16x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance8x16_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance8x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance8x8_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance8x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance8x4_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance8x4_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance4x8_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance4x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance4x4_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance4x4_c, VPX_BITS_12)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #endif  // HAVE_SSSE3
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 3f0f74c..1985e18 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -59,13 +59,13 @@
     reference_data8_ = reinterpret_cast<uint8_t*>(
         vpx_memalign(kDataAlignment, kDataBufferSize));
     second_pred8_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, 64*64));
+        vpx_memalign(kDataAlignment, 128*128));
     source_data16_ = reinterpret_cast<uint16_t*>(
         vpx_memalign(kDataAlignment, kDataBlockSize*sizeof(uint16_t)));
     reference_data16_ = reinterpret_cast<uint16_t*>(
         vpx_memalign(kDataAlignment, kDataBufferSize*sizeof(uint16_t)));
     second_pred16_ = reinterpret_cast<uint16_t*>(
-        vpx_memalign(kDataAlignment, 64*64*sizeof(uint16_t)));
+        vpx_memalign(kDataAlignment, 128*128*sizeof(uint16_t)));
   }
 
   static void TearDownTestCase() {
@@ -88,9 +88,9 @@
   }
 
  protected:
-  // Handle blocks up to 4 blocks 64x64 with stride up to 128
+  // Handle up to 4 128x128 blocks, with stride up to 256
   static const int kDataAlignment = 16;
-  static const int kDataBlockSize = 64 * 128;
+  static const int kDataBlockSize = 128 * 256;
   static const int kDataBufferSize = 4 * kDataBlockSize;
 
   virtual void SetUp() {
@@ -485,6 +485,11 @@
 //------------------------------------------------------------------------------
 // C functions
 const SadMxNParam c_tests[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_sad128x128_c, -1),
+  make_tuple(128, 64, &vpx_sad128x64_c, -1),
+  make_tuple(64, 128, &vpx_sad64x128_c, -1),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_sad64x64_c, -1),
   make_tuple(64, 32, &vpx_sad64x32_c, -1),
   make_tuple(32, 64, &vpx_sad32x64_c, -1),
@@ -499,6 +504,11 @@
   make_tuple(4, 8, &vpx_sad4x8_c, -1),
   make_tuple(4, 4, &vpx_sad4x4_c, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_c, 8),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_c, 8),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_c, 8),
   make_tuple(64, 32, &vpx_highbd_sad64x32_c, 8),
   make_tuple(32, 64, &vpx_highbd_sad32x64_c, 8),
@@ -512,6 +522,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4_c, 8),
   make_tuple(4, 8, &vpx_highbd_sad4x8_c, 8),
   make_tuple(4, 4, &vpx_highbd_sad4x4_c, 8),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_c, 10),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_c, 10),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_c, 10),
   make_tuple(64, 32, &vpx_highbd_sad64x32_c, 10),
   make_tuple(32, 64, &vpx_highbd_sad32x64_c, 10),
@@ -525,6 +540,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4_c, 10),
   make_tuple(4, 8, &vpx_highbd_sad4x8_c, 10),
   make_tuple(4, 4, &vpx_highbd_sad4x4_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_c, 12),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_c, 12),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_c, 12),
   make_tuple(64, 32, &vpx_highbd_sad64x32_c, 12),
   make_tuple(32, 64, &vpx_highbd_sad32x64_c, 12),
@@ -543,6 +563,11 @@
 INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
 
 const SadMxNAvgParam avg_c_tests[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_sad128x128_avg_c, -1),
+  make_tuple(128, 64, &vpx_sad128x64_avg_c, -1),
+  make_tuple(64, 128, &vpx_sad64x128_avg_c, -1),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_sad64x64_avg_c, -1),
   make_tuple(64, 32, &vpx_sad64x32_avg_c, -1),
   make_tuple(32, 64, &vpx_sad32x64_avg_c, -1),
@@ -557,6 +582,11 @@
   make_tuple(4, 8, &vpx_sad4x8_avg_c, -1),
   make_tuple(4, 4, &vpx_sad4x4_avg_c, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_avg_c, 8),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_avg_c, 8),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_avg_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_avg_c, 8),
   make_tuple(64, 32, &vpx_highbd_sad64x32_avg_c, 8),
   make_tuple(32, 64, &vpx_highbd_sad32x64_avg_c, 8),
@@ -570,6 +600,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4_avg_c, 8),
   make_tuple(4, 8, &vpx_highbd_sad4x8_avg_c, 8),
   make_tuple(4, 4, &vpx_highbd_sad4x4_avg_c, 8),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_avg_c, 10),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_avg_c, 10),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_avg_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_avg_c, 10),
   make_tuple(64, 32, &vpx_highbd_sad64x32_avg_c, 10),
   make_tuple(32, 64, &vpx_highbd_sad32x64_avg_c, 10),
@@ -583,6 +618,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4_avg_c, 10),
   make_tuple(4, 8, &vpx_highbd_sad4x8_avg_c, 10),
   make_tuple(4, 4, &vpx_highbd_sad4x4_avg_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_avg_c, 12),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_avg_c, 12),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_avg_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_avg_c, 12),
   make_tuple(64, 32, &vpx_highbd_sad64x32_avg_c, 12),
   make_tuple(32, 64, &vpx_highbd_sad32x64_avg_c, 12),
@@ -601,6 +641,11 @@
 INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
 
 const SadMxNx4Param x4d_c_tests[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_sad128x128x4d_c, -1),
+  make_tuple(128, 64, &vpx_sad128x64x4d_c, -1),
+  make_tuple(64, 128, &vpx_sad64x128x4d_c, -1),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_sad64x64x4d_c, -1),
   make_tuple(64, 32, &vpx_sad64x32x4d_c, -1),
   make_tuple(32, 64, &vpx_sad32x64x4d_c, -1),
@@ -615,6 +660,11 @@
   make_tuple(4, 8, &vpx_sad4x8x4d_c, -1),
   make_tuple(4, 4, &vpx_sad4x4x4d_c, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128x4d_c, 8),
+  make_tuple(128, 64, &vpx_highbd_sad128x64x4d_c, 8),
+  make_tuple(64, 128, &vpx_highbd_sad64x128x4d_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64x4d_c, 8),
   make_tuple(64, 32, &vpx_highbd_sad64x32x4d_c, 8),
   make_tuple(32, 64, &vpx_highbd_sad32x64x4d_c, 8),
@@ -628,6 +678,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4x4d_c, 8),
   make_tuple(4, 8, &vpx_highbd_sad4x8x4d_c, 8),
   make_tuple(4, 4, &vpx_highbd_sad4x4x4d_c, 8),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128x4d_c, 10),
+  make_tuple(128, 64, &vpx_highbd_sad128x64x4d_c, 10),
+  make_tuple(64, 128, &vpx_highbd_sad64x128x4d_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64x4d_c, 10),
   make_tuple(64, 32, &vpx_highbd_sad64x32x4d_c, 10),
   make_tuple(32, 64, &vpx_highbd_sad32x64x4d_c, 10),
@@ -641,6 +696,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4x4d_c, 10),
   make_tuple(4, 8, &vpx_highbd_sad4x8x4d_c, 10),
   make_tuple(4, 4, &vpx_highbd_sad4x4x4d_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128x4d_c, 12),
+  make_tuple(128, 64, &vpx_highbd_sad128x64x4d_c, 12),
+  make_tuple(64, 128, &vpx_highbd_sad64x128x4d_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64x4d_c, 12),
   make_tuple(64, 32, &vpx_highbd_sad64x32x4d_c, 12),
   make_tuple(32, 64, &vpx_highbd_sad32x64x4d_c, 12),
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 6f50f78..97c5516 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -759,7 +759,13 @@
 
 INSTANTIATE_TEST_CASE_P(
     C, VpxVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_variance64x64_c, 0),
+    ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(7, 7, &vpx_variance128x128_c, 0),
+                      make_tuple(7, 6, &vpx_variance128x64_c, 0),
+                      make_tuple(6, 7, &vpx_variance64x128_c, 0),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(6, 6, &vpx_variance64x64_c, 0),
                       make_tuple(6, 5, &vpx_variance64x32_c, 0),
                       make_tuple(5, 6, &vpx_variance32x64_c, 0),
                       make_tuple(5, 5, &vpx_variance32x32_c, 0),
@@ -775,7 +781,13 @@
 
 INSTANTIATE_TEST_CASE_P(
     C, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
+    ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(7, 7, &vpx_sub_pixel_variance128x128_c, 0),
+                      make_tuple(7, 6, &vpx_sub_pixel_variance128x64_c, 0),
+                      make_tuple(6, 7, &vpx_sub_pixel_variance64x128_c, 0),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
                       make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0),
                       make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0),
                       make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0),
@@ -791,7 +803,13 @@
 
 INSTANTIATE_TEST_CASE_P(
     C, VpxSubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
+    ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(7, 7, &vpx_sub_pixel_avg_variance128x128_c, 0),
+                      make_tuple(7, 6, &vpx_sub_pixel_avg_variance128x64_c, 0),
+                      make_tuple(6, 7, &vpx_sub_pixel_avg_variance64x128_c, 0),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
                       make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0),
                       make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0),
                       make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0),
@@ -841,7 +859,13 @@
 
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_highbd_12_variance64x64_c, 12),
+    ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(7, 7, &vpx_highbd_12_variance128x128_c, 12),
+                      make_tuple(7, 6, &vpx_highbd_12_variance128x64_c, 12),
+                      make_tuple(6, 7, &vpx_highbd_12_variance64x128_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(6, 6, &vpx_highbd_12_variance64x64_c, 12),
                       make_tuple(6, 5, &vpx_highbd_12_variance64x32_c, 12),
                       make_tuple(5, 6, &vpx_highbd_12_variance32x64_c, 12),
                       make_tuple(5, 5, &vpx_highbd_12_variance32x32_c, 12),
@@ -854,6 +878,11 @@
                       make_tuple(3, 2, &vpx_highbd_12_variance8x4_c, 12),
                       make_tuple(2, 3, &vpx_highbd_12_variance4x8_c, 12),
                       make_tuple(2, 2, &vpx_highbd_12_variance4x4_c, 12),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(7, 7, &vpx_highbd_10_variance128x128_c, 10),
+                      make_tuple(7, 6, &vpx_highbd_10_variance128x64_c, 10),
+                      make_tuple(6, 7, &vpx_highbd_10_variance64x128_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
                       make_tuple(6, 6, &vpx_highbd_10_variance64x64_c, 10),
                       make_tuple(6, 5, &vpx_highbd_10_variance64x32_c, 10),
                       make_tuple(5, 6, &vpx_highbd_10_variance32x64_c, 10),
@@ -867,6 +896,11 @@
                       make_tuple(3, 2, &vpx_highbd_10_variance8x4_c, 10),
                       make_tuple(2, 3, &vpx_highbd_10_variance4x8_c, 10),
                       make_tuple(2, 2, &vpx_highbd_10_variance4x4_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+                      make_tuple(7, 7, &vpx_highbd_8_variance128x128_c, 8),
+                      make_tuple(7, 6, &vpx_highbd_8_variance128x64_c, 8),
+                      make_tuple(6, 7, &vpx_highbd_8_variance64x128_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
                       make_tuple(6, 6, &vpx_highbd_8_variance64x64_c, 8),
                       make_tuple(6, 5, &vpx_highbd_8_variance64x32_c, 8),
                       make_tuple(5, 6, &vpx_highbd_8_variance32x64_c, 8),
@@ -884,6 +918,11 @@
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDSubpelVarianceTest,
     ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(7, 7, &vpx_highbd_8_sub_pixel_variance128x128_c, 8),
+        make_tuple(7, 6, &vpx_highbd_8_sub_pixel_variance128x64_c, 8),
+        make_tuple(6, 7, &vpx_highbd_8_sub_pixel_variance64x128_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
         make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
         make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8),
         make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8),
@@ -897,6 +936,11 @@
         make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8),
         make_tuple(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8),
         make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(7, 7, &vpx_highbd_10_sub_pixel_variance128x128_c, 10),
+        make_tuple(7, 6, &vpx_highbd_10_sub_pixel_variance128x64_c, 10),
+        make_tuple(6, 7, &vpx_highbd_10_sub_pixel_variance64x128_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
         make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, 10),
         make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, 10),
         make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, 10),
@@ -910,6 +954,11 @@
         make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10),
         make_tuple(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10),
         make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(7, 7, &vpx_highbd_12_sub_pixel_variance128x128_c, 12),
+        make_tuple(7, 6, &vpx_highbd_12_sub_pixel_variance128x64_c, 12),
+        make_tuple(6, 7, &vpx_highbd_12_sub_pixel_variance64x128_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
         make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, 12),
         make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, 12),
         make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, 12),
@@ -927,6 +976,11 @@
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDSubpelAvgVarianceTest,
     ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(7, 7, &vpx_highbd_8_sub_pixel_avg_variance128x128_c, 8),
+        make_tuple(7, 6, &vpx_highbd_8_sub_pixel_avg_variance128x64_c, 8),
+        make_tuple(6, 7, &vpx_highbd_8_sub_pixel_avg_variance64x128_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
         make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8),
         make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8),
         make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8),
@@ -940,6 +994,11 @@
         make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, 8),
         make_tuple(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, 8),
         make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, 8),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(7, 7, &vpx_highbd_10_sub_pixel_avg_variance128x128_c, 10),
+        make_tuple(7, 6, &vpx_highbd_10_sub_pixel_avg_variance128x64_c, 10),
+        make_tuple(6, 7, &vpx_highbd_10_sub_pixel_avg_variance64x128_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
         make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_c, 10),
         make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_c, 10),
         make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_c, 10),
@@ -953,6 +1012,11 @@
         make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10),
         make_tuple(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10),
         make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(7, 7, &vpx_highbd_12_sub_pixel_avg_variance128x128_c, 12),
+        make_tuple(7, 6, &vpx_highbd_12_sub_pixel_avg_variance128x64_c, 12),
+        make_tuple(6, 7, &vpx_highbd_12_sub_pixel_avg_variance64x128_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
         make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_c, 12),
         make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_c, 12),
         make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_c, 12),
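Note (illustrative, not part of the patch): in variance_test.cc the first two tuple members appear to be log2(width) and log2(height), so the new entries above select the 128-pixel block sizes introduced by this change. A small C sketch of that mapping, assuming the log2 convention:

#include <stdio.h>

int main(void) {
  /* First two tuple members: log2(width), log2(height). */
  const int log2_dims[][2] = { { 7, 7 }, { 7, 6 }, { 6, 7 } };
  for (int i = 0; i < 3; ++i) {
    printf("make_tuple(%d, %d, ...) -> %dx%d\n", log2_dims[i][0],
           log2_dims[i][1], 1 << log2_dims[i][0], 1 << log2_dims[i][1]);
  }
  return 0;  /* prints 128x128, 128x64, 64x128 */
}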
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index 204cede..c500206 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -108,6 +108,22 @@
     sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
 }
 
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+// 128x128
+sadMxN(128, 128)
+sadMxNxK(128, 128, 3)
+sadMxNxK(128, 128, 8)
+sadMxNx4D(128, 128)
+
+// 128x64
+sadMxN(128, 64)
+sadMxNx4D(128, 64)
+
+// 64x128
+sadMxN(64, 128)
+sadMxNx4D(64, 128)
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 // 64x64
 sadMxN(64, 64)
 sadMxNxK(64, 64, 3)
@@ -247,6 +263,22 @@
   } \
 }
 
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+// 128x128
+highbd_sadMxN(128, 128)
+highbd_sadMxNxK(128, 128, 3)
+highbd_sadMxNxK(128, 128, 8)
+highbd_sadMxNx4D(128, 128)
+
+// 128x64
+highbd_sadMxN(128, 64)
+highbd_sadMxNx4D(128, 64)
+
+// 64x128
+highbd_sadMxN(64, 128)
+highbd_sadMxNx4D(64, 128)
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 // 64x64
 highbd_sadMxN(64, 64)
 highbd_sadMxNxK(64, 64, 3)
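Note (illustrative only): sadMxN(m, n) expands to a thin wrapper over the generic per-pixel SAD loop already defined in sad.c, so the new 128-pixel entries reuse the same reference code path. A minimal sketch of what sadMxN(128, 128) generates, assuming that helper's shape; a 32-bit accumulator still suffices, since the 8-bit worst case is 128 * 128 * 255 = 4,177,920.

#include <stdint.h>
#include <stdlib.h>

/* Roughly what sadMxN(128, 128) expands to in vpx_dsp/sad.c. */
unsigned int vpx_sad128x128_c(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride) {
  unsigned int sad = 0;
  int y, x;
  for (y = 0; y < 128; ++y) {
    for (x = 0; x < 128; ++x) sad += abs(src[x] - ref[x]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}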
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index 14d7f99..169769a 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -17,17 +17,6 @@
 #include "vpx_dsp/variance.h"
 #include "vpx_dsp/vpx_filter.h"
 
-const uint8_t vpx_bilinear_filters[BIL_SUBPEL_SHIFTS][2] = {
-  { 128,   0  },
-  { 112,  16  },
-  {  96,  32  },
-  {  80,  48  },
-  {  64,  64  },
-  {  48,  80  },
-  {  32,  96  },
-  {  16, 112  },
-};
-
 uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int  a_stride,
                             const uint8_t *b, int  b_stride) {
   int distortion = 0;
@@ -176,9 +165,9 @@
   uint8_t temp2[H * W]; \
 \
   var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
-                                    vpx_bilinear_filters[xoffset]); \
+                                    bilinear_filters_2t[xoffset]); \
   var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                     vpx_bilinear_filters[yoffset]); \
+                                     bilinear_filters_2t[yoffset]); \
 \
   return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
 }
@@ -196,9 +185,9 @@
   DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
 \
   var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
-                                    vpx_bilinear_filters[xoffset]); \
+                                    bilinear_filters_2t[xoffset]); \
   var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                     vpx_bilinear_filters[yoffset]); \
+                                     bilinear_filters_2t[yoffset]); \
 \
   vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
 \
@@ -235,6 +224,11 @@
     SUBPIX_VAR(W, H) \
     SUBPIX_AVG_VAR(W, H)
 
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+VARIANCES(128, 128)
+VARIANCES(128, 64)
+VARIANCES(64, 128)
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
 VARIANCES(64, 64)
 VARIANCES(64, 32)
 VARIANCES(32, 64)
@@ -501,9 +495,9 @@
   uint16_t temp2[H * W]; \
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, vpx_bilinear_filters[xoffset]); \
+                                           W, bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
                                           dst_stride, sse); \
@@ -518,9 +512,9 @@
   uint16_t temp2[H * W]; \
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, vpx_bilinear_filters[xoffset]); \
+                                           W, bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                              W, dst, dst_stride, sse); \
@@ -535,9 +529,9 @@
   uint16_t temp2[H * W]; \
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, vpx_bilinear_filters[xoffset]); \
+                                           W, bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                              W, dst, dst_stride, sse); \
@@ -555,9 +549,9 @@
   DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, vpx_bilinear_filters[xoffset]); \
+                                           W, bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                            CONVERT_TO_BYTEPTR(temp2), W); \
@@ -577,9 +571,9 @@
   DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, vpx_bilinear_filters[xoffset]); \
+                                           W, bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                            CONVERT_TO_BYTEPTR(temp2), W); \
@@ -599,9 +593,9 @@
   DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, vpx_bilinear_filters[xoffset]); \
+                                           W, bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                            CONVERT_TO_BYTEPTR(temp2), W); \
@@ -616,6 +610,11 @@
     HIGHBD_SUBPIX_VAR(W, H) \
     HIGHBD_SUBPIX_AVG_VAR(W, H)
 
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+HIGHBD_VARIANCES(128, 128)
+HIGHBD_VARIANCES(128, 64)
+HIGHBD_VARIANCES(64, 128)
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
 HIGHBD_VARIANCES(64, 64)
 HIGHBD_VARIANCES(64, 32)
 HIGHBD_VARIANCES(32, 64)
@@ -677,8 +676,9 @@
     b += b_stride;
     m += m_stride;
   }
-  *sum = (sum64 >= 0) ? ((sum64 + 31) >> 6) : -((-sum64 + 31) >> 6);
-  *sse = (sse64 + 2047) >> 12;
+  sum64 = (sum64 >= 0) ? sum64 : -sum64;
+  *sum = ROUND_POWER_OF_TWO(sum64, 6);
+  *sse = ROUND_POWER_OF_TWO(sse64, 12);
 }
 
 #define MASK_VAR(W, H) \
@@ -702,9 +702,9 @@
   uint8_t temp2[H * W]; \
 \
   var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
-                                    vpx_bilinear_filters[xoffset]); \
+                                    bilinear_filters_2t[xoffset]); \
   var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                     vpx_bilinear_filters[yoffset]); \
+                                     bilinear_filters_2t[yoffset]); \
 \
   return vpx_masked_variance##W##x##H##_c(temp2, W, dst, dst_stride, \
                                           msk, msk_stride, sse); \
@@ -765,27 +765,28 @@
                               const uint8_t *b8, int  b_stride,
                               const uint8_t *m, int  m_stride,
                               int  w, int  h,
-                              uint64_t *sse64, int *sum) {
+                              uint64_t *sse, int64_t *sum) {
   int i, j;
   uint16_t *a = CONVERT_TO_SHORTPTR(a8);
   uint16_t *b = CONVERT_TO_SHORTPTR(b8);
 
-  int64_t sum64 = 0;
-  *sse64 = 0;
+  *sum = 0;
+  *sse = 0;
 
   for (i = 0; i < h; i++) {
     for (j = 0; j < w; j++) {
       const int diff = (a[j] - b[j]) * (m[j]);
-      sum64 += diff;
-      *sse64 += (int64_t)diff * diff;
+      *sum += (int64_t)diff;
+      *sse += (int64_t)diff * diff;
     }
 
     a += a_stride;
     b += b_stride;
     m += m_stride;
   }
-  *sum = (sum64 >= 0) ? ((sum64 + 31) >> 6) : -((-sum64 + 31) >> 6);
-  *sse64 = (*sse64 + 2047) >> 12;
+  *sum = (*sum >= 0) ? *sum : -*sum;
+  *sum = ROUND_POWER_OF_TWO(*sum, 6);
+  *sse = ROUND_POWER_OF_TWO(*sse, 12);
 }
 
 void highbd_masked_variance(const uint8_t *a8, int  a_stride,
@@ -793,9 +794,11 @@
                             const uint8_t *m, int  m_stride,
                             int  w, int  h,
                             unsigned int *sse, int *sum) {
+  int64_t sum64;
   uint64_t sse64;
   highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride,
-                           w, h, &sse64, sum);
+                           w, h, &sse64, &sum64);
+  *sum = (int)sum64;
   *sse = (unsigned int)sse64;
 }
 
@@ -804,10 +807,11 @@
                                const uint8_t *m, int  m_stride,
                                int  w, int  h,
                                unsigned int *sse, int *sum) {
+  int64_t sum64;
   uint64_t sse64;
   highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride,
-                           w, h, &sse64, sum);
-  *sum = ROUND_POWER_OF_TWO(*sum, 2);
+                           w, h, &sse64, &sum64);
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
 }
 
@@ -816,10 +820,11 @@
                                const uint8_t *m, int  m_stride,
                                int  w, int  h,
                                unsigned int *sse, int *sum) {
+  int64_t sum64;
   uint64_t sse64;
   highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride,
-                           w, h, &sse64, sum);
-  *sum = ROUND_POWER_OF_TWO(*sum, 4);
+                           w, h, &sse64, &sum64);
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
 }
 
@@ -875,9 +880,9 @@
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
                                            H + 1, W, \
-                                           vpx_bilinear_filters[xoffset]); \
+                                           bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                  W, dst, dst_stride, \
@@ -895,9 +900,9 @@
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
                                            H + 1, W, \
-                                           vpx_bilinear_filters[xoffset]); \
+                                           bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_10_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                     W, dst, dst_stride, \
@@ -915,9 +920,9 @@
 \
   highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
                                            H + 1, W, \
-                                           vpx_bilinear_filters[xoffset]); \
+                                           bilinear_filters_2t[xoffset]); \
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            vpx_bilinear_filters[yoffset]); \
+                                            bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_12_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                     W, dst, dst_stride, \
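Note on the rounding change above (illustrative, not part of the patch): the mask weights are 6-bit (0..64), so the raw masked sum carries a factor of 64 and the raw SSE a factor of 64^2 = 4096. The open-coded `(x + 31) >> 6` / `(x + 2047) >> 12` normalization is replaced by the library's ROUND_POWER_OF_TWO macro, which rounds ties up rather than down, and the high-bit-depth variants shift by a further 2/4 (sum) and 4/8 (SSE) bits for 10- and 12-bit input. A compact sketch, assuming the standard macro definition:

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* Fold the 6-bit mask scaling back out of the accumulated moments. */
void normalize_masked_moments(int64_t raw_sum, uint64_t raw_sse,
                              int *sum, unsigned int *sse) {
  raw_sum = raw_sum >= 0 ? raw_sum : -raw_sum;          /* magnitude only */
  *sum = (int)ROUND_POWER_OF_TWO(raw_sum, 6);           /* divide by 64   */
  *sse = (unsigned int)ROUND_POWER_OF_TWO(raw_sse, 12); /* divide by 4096 */
}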
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index fdfd20c..2ce0b99 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -50,6 +50,19 @@
   $avx2_x86_64 = 'avx2';
 }
 
+if (vpx_config("CONFIG_EXT_PARTITION") eq "yes") {
+  @block_widths = (4, 8, 16, 32, 64, 128)
+} else {
+  @block_widths = (4, 8, 16, 32, 64)
+}
+
+@block_sizes = ();
+foreach $w (@block_widths) {
+  foreach $h (@block_widths) {
+    push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w);
+  }
+}
+
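Note (illustrative only): the loop above keeps every width/height pair within a 2:1 aspect ratio, so enabling CONFIG_EXT_PARTITION adds exactly 64x128, 128x64 and 128x128 to the list consumed by the SAD/variance prototype generators below. A hypothetical C mirror of the enumeration:

#include <stdio.h>

int main(void) {
  const int widths[] = { 4, 8, 16, 32, 64, 128 };  /* CONFIG_EXT_PARTITION on */
  const int n = (int)(sizeof(widths) / sizeof(widths[0]));
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < n; ++j) {
      const int w = widths[i], h = widths[j];
      if (w <= 2 * h && h <= 2 * w) printf("%dx%d\n", w, h);  /* 16 sizes */
    }
  }
  return 0;
}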
 #
 # Intra prediction
 #
@@ -960,69 +973,43 @@
 #
 add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
 specialize qw/vpx_subtract_block neon msa/, "$sse2_x86inc";
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  specialize qw/vpx_highbd_subtract_block/;
+}
 
 if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
-#
-# Sum of Squares
-#
+  #
+  # Sum of Squares
+  #
   add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
   specialize qw/vpx_sum_squares_2d_i16 sse2/;
 }
 
-#
-# Single block SAD
-#
-add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x64 avx2 neon msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x32 avx2 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x64 avx2 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x32 avx2 neon msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x16 avx2 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x32 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x16 mmx media neon msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x8 mmx neon msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x16 mmx neon msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x8 mmx neon msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x4 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x8 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x4 mmx neon msa/, "$sse2_x86inc";
-
-#
-# Avg
-#
 if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
+  #
+  # Avg
+  #
   add_proto qw/unsigned int vpx_avg_8x8/, "const uint8_t *, int p";
   specialize qw/vpx_avg_8x8 sse2 neon msa/;
-
   add_proto qw/unsigned int vpx_avg_4x4/, "const uint8_t *, int p";
   specialize qw/vpx_avg_4x4 sse2 neon msa/;
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";
+    specialize qw/vpx_highbd_avg_8x8/;
+    add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
+    specialize qw/vpx_highbd_avg_4x4/;
+  }
 
+  #
+  # Minmax
+  #
   add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
   specialize qw/vpx_minmax_8x8 sse2/;
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+    specialize qw/vpx_highbd_minmax_8x8/;
+  }
 
   add_proto qw/void vpx_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
   specialize qw/vpx_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
@@ -1043,575 +1030,217 @@
   specialize qw/vpx_vector_var neon sse2/;
 }  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 
-add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+#
+# Single block SAD / Single block Avg SAD
+#
+foreach (@block_sizes) {
+  ($w, $h) = @$_;
+  add_proto qw/unsigned int/, "vpx_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  add_proto qw/unsigned int/, "vpx_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+}
+
+specialize qw/vpx_sad64x64    avx2       neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad64x32    avx2            msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x64    avx2            msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x32    avx2       neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x16    avx2            msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x32                    msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x16 mmx     media neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x8  mmx           neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x16  mmx           neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x8   mmx           neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x4                      msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x8                      msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x4   mmx           neon msa/, "$sse2_x86inc";
+
 specialize qw/vpx_sad64x64_avg avx2 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vpx_sad64x32_avg avx2 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vpx_sad32x64_avg avx2 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vpx_sad32x32_avg avx2 msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vpx_sad32x16_avg avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x32_avg      msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x16_avg      msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x8_avg       msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x16_avg       msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x8_avg        msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x4_avg        msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x8_avg        msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x4_avg        msa/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x32_avg msa/, "$sse2_x86inc";
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "vpx_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+    add_proto qw/unsigned int/, "vpx_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+    if ($w != 128 && $h != 128 && $w != 4) {
+      specialize "vpx_highbd_sad${w}x${h}", "$sse2_x86inc";
+      specialize "vpx_highbd_sad${w}x${h}_avg", "$sse2_x86inc";
+    }
+  }
+}
 
-add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x16_avg msa/, "$sse2_x86inc";
+#
+# Masked SAD
+#
+if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "vpx_masked_sad${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
+    specialize "vpx_masked_sad${w}x${h}", qw/ssse3/;
+  }
 
-add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x8_avg msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x16_avg msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x8_avg msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x4_avg msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x8_avg msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x4_avg msa/, "$sse2_x86inc";
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    foreach (@block_sizes) {
+      ($w, $h) = @$_;
+      add_proto qw/unsigned int/, "vpx_highbd_masked_sad${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
+      specialize "vpx_highbd_masked_sad${w}x${h}", qw/ssse3/;
+    }
+  }
+}
 
 #
 # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
 #
 # Blocks of 3
-add_proto qw/void vpx_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x3 msa/;
-
-add_proto qw/void vpx_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x3 msa/;
-
-add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+foreach $s (@block_widths) {
+  add_proto qw/void/, "vpx_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+}
+specialize qw/vpx_sad64x64x3            msa/;
+specialize qw/vpx_sad32x32x3            msa/;
 specialize qw/vpx_sad16x16x3 sse3 ssse3 msa/;
+specialize qw/vpx_sad8x8x3   sse3       msa/;
+specialize qw/vpx_sad4x4x3   sse3       msa/;
 
-add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad16x8x3 sse3 ssse3 msa/;
-
-add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x16x3 sse3 msa/;
 
-add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x3 sse3 msa/;
-
-add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x3 sse3 msa/;
-
 # Blocks of 8
-add_proto qw/void vpx_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x8 msa/;
-
-add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x8 msa/;
-
-add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+foreach $s (@block_widths) {
+  add_proto qw/void/, "vpx_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+}
+specialize qw/vpx_sad64x64x8        msa/;
+specialize qw/vpx_sad32x32x8        msa/;
 specialize qw/vpx_sad16x16x8 sse4_1 msa/;
+specialize qw/vpx_sad8x8x8   sse4_1 msa/;
+specialize qw/vpx_sad4x4x8   sse4_1 msa/;
 
-add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad16x8x8 sse4_1 msa/;
-
-add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x16x8 sse4_1 msa/;
-
-add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x8 sse4_1 msa/;
-
-add_proto qw/void vpx_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x4x8 msa/;
-
-add_proto qw/void vpx_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad4x8x8 msa/;
 
-add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x8 sse4_1 msa/;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  foreach $s (@block_widths) {
+    # Blocks of 3
+    add_proto qw/void/, "vpx_highbd_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+    # Blocks of 8
+    add_proto qw/void/, "vpx_highbd_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  }
+  # Blocks of 3
+  add_proto qw/void/, "vpx_highbd_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  add_proto qw/void/, "vpx_highbd_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  # Blocks of 8
+  add_proto qw/void/, "vpx_highbd_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  add_proto qw/void/, "vpx_highbd_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  add_proto qw/void/, "vpx_highbd_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  add_proto qw/void/, "vpx_highbd_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+}
 
 #
 # Multi-block SAD, comparing a reference to N independent blocks
 #
-add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+foreach (@block_sizes) {
+  ($w, $h) = @$_;
+  add_proto qw/void/, "vpx_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+}
 specialize qw/vpx_sad64x64x4d avx2 neon msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x32x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x64x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad64x32x4d           msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x64x4d           msa/, "$sse2_x86inc";
 specialize qw/vpx_sad32x32x4d avx2 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x16x4d           msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x32x4d           msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x16x4d      neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x8x4d            msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x16x4d            msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x8x4d             msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x4x4d             msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x8x4d             msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x4x4d             msa/, "$sse2_x86inc";
 
-add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x16x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x32x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x4d neon msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x4x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x8x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x4d msa/, "$sse2_x86inc";
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  #
+  # Multi-block SAD, comparing a reference to N independent blocks
+  #
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/void/, "vpx_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+    if ($w != 128 && $h != 128) {
+      specialize "vpx_highbd_sad${w}x${h}x4d", "$sse2_x86inc";
+    }
+  }
+}
 
 #
 # Structured Similarity (SSIM)
 #
 if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
-    add_proto qw/void vpx_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/vpx_ssim_parms_8x8/, "$sse2_x86_64";
+  add_proto qw/void vpx_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+  specialize qw/vpx_ssim_parms_8x8/, "$sse2_x86_64";
 
-    add_proto qw/void vpx_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/vpx_ssim_parms_16x16/, "$sse2_x86_64";
-}
+  add_proto qw/void vpx_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+  specialize qw/vpx_ssim_parms_16x16/, "$sse2_x86_64";
 
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  #
-  # Block subtraction
-  #
-  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
-  specialize qw/vpx_highbd_subtract_block/;
-
-  #
-  # Single block SAD
-  #
-  add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad64x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad64x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x4/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad4x8/;
-
-  add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad4x4/;
-
-  #
-  # Avg
-  #
-  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";
-  specialize qw/vpx_highbd_avg_8x8/;
-  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
-  specialize qw/vpx_highbd_avg_4x4/;
-  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-  specialize qw/vpx_highbd_minmax_8x8/;
-
-  add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad64x64_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad64x32_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x64_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x32_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x16_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x32_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x16_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x8_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x16_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x8_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x4_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad4x8_avg/;
-
-  add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad4x4_avg/;
-
-  #
-  # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
-  #
-  # Blocks of 3
-  add_proto qw/void vpx_highbd_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x64x3/;
-
-  add_proto qw/void vpx_highbd_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x32x3/;
-
-  add_proto qw/void vpx_highbd_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x16x3/;
-
-  add_proto qw/void vpx_highbd_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x8x3/;
-
-  add_proto qw/void vpx_highbd_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x16x3/;
-
-  add_proto qw/void vpx_highbd_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x8x3/;
-
-  add_proto qw/void vpx_highbd_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x4x3/;
-
-  # Blocks of 8
-  add_proto qw/void vpx_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x64x8/;
-
-  add_proto qw/void vpx_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x32x8/;
-
-  add_proto qw/void vpx_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x16x8/;
-
-  add_proto qw/void vpx_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x8x8/;
-
-  add_proto qw/void vpx_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x16x8/;
-
-  add_proto qw/void vpx_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x8x8/;
-
-  add_proto qw/void vpx_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x4x8/;
-
-  add_proto qw/void vpx_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x8x8/;
-
-  add_proto qw/void vpx_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x4x8/;
-
-  #
-  # Multi-block SAD, comparing a reference to N independent blocks
-  #
-  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x64x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x32x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x64x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x32x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x16x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x32x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x16x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x8x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x16x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x8x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x4x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x8x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x4x4d/, "$sse2_x86inc";
-
-  #
-  # Structured Similarity (SSIM)
-  #
-  if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void vpx_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/vpx_highbd_ssim_parms_8x8/;
   }
-}  # CONFIG_VP9_HIGHBITDEPTH
+}
 }  # CONFIG_ENCODERS
 
 if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
 
 #
-# Variance
-#
-add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance64x64 sse2 avx2 neon msa/;
-
-add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance64x32 sse2 avx2 neon msa/;
-
-add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x64 sse2 neon msa/;
-
-add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x32 sse2 avx2 neon msa/;
-
-add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x16 sse2 avx2 msa/;
-
-add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x32 sse2 msa/;
-
-add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon msa/;
-
-add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x8 mmx sse2 neon msa/;
-
-add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x16 mmx sse2 neon msa/;
-
-add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x8 mmx sse2 media neon msa/;
-
-add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x4 sse2 msa/;
-
-add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance4x8 sse2 msa/;
-
-add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance4x4 mmx sse2 msa/;
-
-#
 # Specialty Variance
 #
 add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
-
 add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_get8x8var mmx sse2 neon msa/;
+
+specialize qw/vpx_get16x16var     avx2 sse2 neon msa/;
+specialize qw/vpx_get8x8var   mmx      sse2 neon msa/;
 
 add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon msa/;
-
 add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x8 sse2 msa/;
-
 add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x16 sse2 msa/;
-
 add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x8 sse2 msa/;
 
-add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
-  specialize qw/vpx_get_mb_ss mmx sse2 msa/;
+specialize qw/vpx_mse16x16 mmx avx2 sse2 media neon msa/;
+specialize qw/vpx_mse16x8           sse2            msa/;
+specialize qw/vpx_mse8x16           sse2            msa/;
+specialize qw/vpx_mse8x8            sse2            msa/;
 
-add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride";
-  specialize qw/vpx_get4x4sse_cs neon msa/;
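+# High-bitdepth get*var and MSE prototypes are generated per bit depth in a
+# loop rather than spelled out individually; only the 16x16 and 8x8 MSE
+# kernels currently receive SSE2 specializations.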
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  foreach $bd (8, 10, 12) {
+    add_proto qw/void/, "vpx_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+    add_proto qw/void/, "vpx_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
 
-add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+    add_proto qw/unsigned int/, "vpx_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+    add_proto qw/unsigned int/, "vpx_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+    add_proto qw/unsigned int/, "vpx_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+    add_proto qw/unsigned int/, "vpx_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
 
-if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
-  add_proto qw/unsigned int vpx_masked_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance32x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance16x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance64x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance32x64 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance32x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance64x64 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance16x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance16x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance8x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance8x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance8x4 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance4x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_variance4x4 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance64x64 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance32x64 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance64x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance32x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance16x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance32x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance16x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance8x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance16x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance8x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance8x4 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance4x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-  specialize qw/vpx_masked_sub_pixel_variance4x4 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad64x64 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad32x64 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad64x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad32x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad16x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad32x32 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad16x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad16x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad8x16 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad8x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad8x4 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad4x8 ssse3/;
-
-  add_proto qw/unsigned int vpx_masked_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vpx_masked_sad4x4 ssse3/;
-
-  if (vpx_config("CONFIG_EXT_PARTITION") eq "yes") {
-    add_proto qw/unsigned int vpx_masked_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_masked_variance128x128/;
-
-    add_proto qw/unsigned int vpx_masked_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_masked_variance128x64/;
-
-    add_proto qw/unsigned int vpx_masked_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_masked_variance64x128/;
-
-    add_proto qw/unsigned int vpx_masked_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_masked_sub_pixel_variance128x128/;
-
-    add_proto qw/unsigned int vpx_masked_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_masked_sub_pixel_variance128x64/;
-
-    add_proto qw/unsigned int vpx_masked_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_masked_sub_pixel_variance64x128/;
-
-    add_proto qw/unsigned int vpx_masked_sad128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_masked_sad128x128/;
-
-    add_proto qw/unsigned int vpx_masked_sad128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_masked_sad128x64/;
-
-    add_proto qw/unsigned int vpx_masked_sad64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_masked_sad64x128/;
+    specialize "vpx_highbd_${bd}_mse16x16", qw/sse2/;
+    specialize "vpx_highbd_${bd}_mse8x8", qw/sse2/;
   }
 }
 
+#
+# ...
+#
 if (vpx_config("CONFIG_AFFINE_MOTION") eq "yes") {
   add_proto qw/void vpx_upsampled_pred/, "uint8_t *comp_pred, int width, int height, const uint8_t *ref, int ref_stride";
     specialize qw/vpx_upsampled_pred sse2/;
@@ -1620,796 +1249,129 @@
 }
 
 #
-# Subpixel Variance
+# ...
 #
-add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
+add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
 
-add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_get_mb_ss mmx sse2 msa/;
+specialize qw/vpx_get4x4sse_cs neon msa/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+#
+# Variance / Subpixel Variance / Subpixel Avg Variance
+#
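+# Prototypes for every (w, h) pair in @block_sizes (presumably including the
+# new 128x128, 128x64 and 64x128 sizes when CONFIG_EXT_PARTITION is enabled)
+# are generated here; per-size SIMD specializations follow below.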
+foreach (@block_sizes) {
+  ($w, $h) = @$_;
+  add_proto qw/unsigned int/, "vpx_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/uint32_t/, "vpx_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t/, "vpx_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+}
 
-add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
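+# In the tables below each column corresponds to one optimization (mmx, sse2,
+# avx2, media, neon, msa, x86inc variants); a blank column means no optimized
+# version exists for that block size.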
+specialize qw/vpx_variance64x64     sse2 avx2       neon msa/;
+specialize qw/vpx_variance64x32     sse2 avx2       neon msa/;
+specialize qw/vpx_variance32x64     sse2            neon msa/;
+specialize qw/vpx_variance32x32     sse2 avx2       neon msa/;
+specialize qw/vpx_variance32x16     sse2 avx2            msa/;
+specialize qw/vpx_variance16x32     sse2                 msa/;
+specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon msa/;
+specialize qw/vpx_variance16x8  mmx sse2            neon msa/;
+specialize qw/vpx_variance8x16  mmx sse2            neon msa/;
+specialize qw/vpx_variance8x8   mmx sse2      media neon msa/;
+specialize qw/vpx_variance8x4       sse2                 msa/;
+specialize qw/vpx_variance4x8       sse2                 msa/;
+specialize qw/vpx_variance4x4   mmx sse2                 msa/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance64x64     avx2       neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance64x32                     msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance32x64                     msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance32x32     avx2       neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance32x16                     msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance16x32                     msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance16x16 mmx      media neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance16x8  mmx                 msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance8x16  mmx                 msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance8x8   mmx      media neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance8x4                       msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance4x8                       msa/, "$sse_x86inc",                  "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance4x4   mmx                 msa/, "$sse_x86inc",                  "$ssse3_x86inc";
 
-add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance64x32      msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance32x64      msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance32x16      msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance16x32      msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance16x16      msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance16x8       msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance8x16       msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance8x8        msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance8x4        msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance4x8        msa/, "$sse_x86inc",                 "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance4x4        msa/, "$sse_x86inc",                 "$ssse3_x86inc";
 
-add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x16 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  foreach $bd (8, 10, 12) {
+    foreach (@block_sizes) {
+      ($w, $h) = @$_;
+      add_proto qw/unsigned int/, "vpx_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      add_proto qw/uint32_t/, "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      add_proto qw/uint32_t/, "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
+        specialize "vpx_highbd_${bd}_variance${w}x${h}", "sse2";
+      }
+      if ($w != 128 && $h != 128 && $w != 4) {
+        specialize "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", $sse2_x86inc;
+        specialize "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", $sse2_x86inc;
+      }
+    }
+  }
+}  # CONFIG_VP9_HIGHBITDEPTH
 
-add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x8 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+#
+# Masked Variance / Masked Subpixel Variance
+#
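+# Masked variance and masked subpixel variance prototypes are generated for
+# every block size, and all of them receive SSSE3 specializations via the
+# loop below.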
+if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "vpx_masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
+    add_proto qw/unsigned int/, "vpx_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
+    specialize "vpx_masked_variance${w}x${h}", qw/ssse3/;
+    specialize "vpx_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+  }
 
-add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x16 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x8 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    foreach $bd ("_", "_10_", "_12_") {
+      foreach (@block_sizes) {
+        ($w, $h) = @$_;
+        add_proto qw/unsigned int/, "vpx_highbd${bd}masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
+        add_proto qw/unsigned int/, "vpx_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
+        specialize "vpx_highbd${bd}masked_variance${w}x${h}", qw/ssse3/;
+        specialize "vpx_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+      }
+    }
+  }
+}
 
 #
 # Specialty Subpixel
 #
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_h mmx sse2 media/;
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
+specialize qw/vpx_variance_halfpixvar16x16_h mmx sse2 media/;
 
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_v mmx sse2 media/;
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
+specialize qw/vpx_variance_halfpixvar16x16_v mmx sse2 media/;
 
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_hv mmx sse2 media/;
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
+specialize qw/vpx_variance_halfpixvar16x16_hv mmx sse2 media/;
 
+#
+# Comp Avg
+#
+add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance64x64 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance64x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance32x64 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance32x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance32x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance16x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance16x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance16x8 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance8x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance8x8 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-  add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance64x64 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance64x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance32x64 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance32x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance32x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance16x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance16x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance16x8 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance8x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance8x8 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-  add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance64x64 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance64x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance32x64 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance32x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance32x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance16x32 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance16x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance16x8 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance8x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance8x8 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-  add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
-  add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
-  add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
-  add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse16x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse8x8 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_mse16x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_mse8x8 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_mse16x16 sse2/;
-
-  add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_mse8x8 sse2/;
-
-  if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
-    add_proto qw/unsigned int vpx_highbd_masked_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance32x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance16x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance64x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance32x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance32x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance64x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance16x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance16x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance8x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance8x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance8x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance4x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_variance4x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance32x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance16x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance64x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance32x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance32x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance64x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance16x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance16x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance8x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance8x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance8x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance4x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_variance4x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance32x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance16x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance64x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance32x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance32x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance64x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance16x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance16x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance8x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance8x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance8x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance4x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_variance4x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance64x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance64x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance32x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance32x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance32x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance16x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance16x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance16x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance8x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance8x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance8x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance4x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_masked_sub_pixel_variance4x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance64x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance64x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance32x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance32x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance32x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance16x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance16x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance16x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance8x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance8x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance8x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance4x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_10_masked_sub_pixel_variance4x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance64x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance64x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance32x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance32x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance32x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance16x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance16x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance16x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance8x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance8x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance8x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance4x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-    specialize qw/vpx_highbd_12_masked_sub_pixel_variance4x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad64x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad32x64 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad64x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad32x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad16x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad32x32 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad16x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad16x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad8x16 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad8x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad8x4 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad4x8 ssse3/;
-
-    add_proto qw/unsigned int vpx_highbd_masked_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-    specialize qw/vpx_highbd_masked_sad4x4 ssse3/;
-
-    if (vpx_config("CONFIG_EXT_PARTITION") eq "yes") {
-      add_proto qw/unsigned int vpx_highbd_masked_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_masked_variance128x128/;
-
-      add_proto qw/unsigned int vpx_highbd_masked_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_masked_variance128x64/;
-
-      add_proto qw/unsigned int vpx_highbd_masked_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_masked_variance64x128/;
-
-      add_proto qw/unsigned int vpx_highbd_10_masked_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_variance128x128/;
-
-      add_proto qw/unsigned int vpx_highbd_10_masked_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_variance128x64/;
-
-      add_proto qw/unsigned int vpx_highbd_10_masked_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_variance64x128/;
-
-      add_proto qw/unsigned int vpx_highbd_12_masked_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_variance128x128/;
-
-      add_proto qw/unsigned int vpx_highbd_12_masked_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_variance128x64/;
-
-      add_proto qw/unsigned int vpx_highbd_12_masked_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_variance64x128/;
-
-      add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_masked_sub_pixel_variance128x128/;
-
-      add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_masked_sub_pixel_variance128x64/;
-
-      add_proto qw/unsigned int vpx_highbd_masked_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_masked_sub_pixel_variance64x128/;
-
-      add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_sub_pixel_variance128x128/;
-
-      add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_sub_pixel_variance128x64/;
-
-      add_proto qw/unsigned int vpx_highbd_10_masked_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_10_masked_sub_pixel_variance64x128/;
-
-      add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_12_masked_sub_pixel_variance128x128/;
-
-      add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_12_masked_sub_pixel_variance128x64/;
-
-      add_proto qw/unsigned int vpx_highbd_12_masked_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-      specialize qw/vpx_highbd_12_masked_sub_pixel_variance64x128/;
-
-      add_proto qw/unsigned int vpx_highbd_masked_sad128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *mask, int mask_stride";
-      specialize qw/vpx_highbd_masked_sad128x128/;
-
-      add_proto qw/unsigned int vpx_highbd_masked_sad128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-      specialize qw/vpx_highbd_masked_sad128x64/;
-
-      add_proto qw/unsigned int vpx_highbd_masked_sad64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-      specialize qw/vpx_highbd_masked_sad64x128/;
-    }
-  }
-
   add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
-
-  #
-  # Subpixel Variance
-  #
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+}  # CONFIG_VP9_HIGHBITDEPTH
 
-}  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
 
 1;
diff --git a/vpx_dsp/vpx_filter.h b/vpx_dsp/vpx_filter.h
index e049f74..cfe8161 100644
--- a/vpx_dsp/vpx_filter.h
+++ b/vpx_dsp/vpx_filter.h
@@ -29,7 +29,18 @@
 
 #define BIL_SUBPEL_BITS    3
 #define BIL_SUBPEL_SHIFTS  (1 << BIL_SUBPEL_BITS)
-extern const uint8_t vpx_bilinear_filters[BIL_SUBPEL_SHIFTS][2];
+
+// 2-tap bilinear filter coefficients (each pair sums to 128)
+static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
+  { 128,   0  },
+  { 112,  16  },
+  {  96,  32  },
+  {  80,  48  },
+  {  64,  64  },
+  {  48,  80  },
+  {  32,  96  },
+  {  16, 112  },
+};
 
 #ifdef __cplusplus
 }  // extern "C"
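For reference, the pairs in bilinear_filters_2t always sum to 128, so a filtered sample is produced with a rounded 7-bit shift. A minimal scalar sketch of how one 2-tap pair would be applied (bilin_2t_sample is a hypothetical helper, not part of this change):

// Sketch only: apply one 2-tap bilinear pair to neighbouring samples a and b.
// Assumes the taps sum to 128, so the result is rounded and shifted by 7.
static uint8_t bilin_2t_sample(uint8_t a, uint8_t b, int subpel) {
  const int f0 = bilinear_filters_2t[subpel][0];
  const int f1 = bilinear_filters_2t[subpel][1];
  return (uint8_t)((f0 * a + f1 * b + 64) >> 7);
}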
diff --git a/vpx_dsp/x86/masked_sad_intrin_ssse3.c b/vpx_dsp/x86/masked_sad_intrin_ssse3.c
index 384f89b..8b9ff10 100644
--- a/vpx_dsp/x86/masked_sad_intrin_ssse3.c
+++ b/vpx_dsp/x86/masked_sad_intrin_ssse3.c
@@ -64,6 +64,11 @@
                           m, n); \
 }
 
+#if CONFIG_EXT_PARTITION
+MASKSADMXN_SSSE3(128, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(64, 128)
+#endif  // CONFIG_EXT_PARTITION
 MASKSADMXN_SSSE3(64, 64)
 MASKSADMXN_SSSE3(64, 32)
 MASKSADMXN_SSSE3(32, 64)
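The three new instantiations reuse the existing MASKSADMXN_SSSE3 wrapper macro; its body is outside this hunk, but judging from the tail visible above it should expand to roughly the following for the 128x128 case (argument names here are assumptions made for illustration):

// Approximate expansion of MASKSADMXN_SSSE3(128, 128); sketch only.
unsigned int vpx_masked_sad128x128_ssse3(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride,
                                         const uint8_t *msk, int msk_stride) {
  return masked_sad_ssse3(src, src_stride, ref, ref_stride,
                          msk, msk_stride, 128, 128);
}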
@@ -100,7 +105,7 @@
 MASKSAD4XN_SSSE3(4)
 
 // For width a multiple of 16
-// Assumes values in m are <=64 and w = 16, 32, or 64
+// Assumes values in m are <= 64
 static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                             const uint8_t *b_ptr, int b_stride,
                                             const uint8_t *m_ptr, int m_stride,
@@ -255,6 +260,11 @@
                                  msk_stride, m, n); \
 }
 
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN_SSSE3(128, 128)
+HIGHBD_MASKSADMXN_SSSE3(128, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 128)
+#endif  // CONFIG_EXT_PARTITION
 HIGHBD_MASKSADMXN_SSSE3(64, 64)
 HIGHBD_MASKSADMXN_SSSE3(64, 32)
 HIGHBD_MASKSADMXN_SSSE3(32, 64)
diff --git a/vpx_dsp/x86/masked_variance_intrin_ssse3.c b/vpx_dsp/x86/masked_variance_intrin_ssse3.c
index 96af421..ca4f6fc 100644
--- a/vpx_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/vpx_dsp/x86/masked_variance_intrin_ssse3.c
@@ -18,17 +18,63 @@
 #include "vpx_ports/mem.h"
 #include "vpx_dsp/vpx_filter.h"
 
-// Assumes mask values are <= 64
 
-// Log 2 of powers of 2 as an expression
-#define LOG2_P2(n)  ((n) ==   1 ? 0 :       \
-                     (n) ==   2 ? 1 :       \
-                     (n) ==   4 ? 2 :       \
-                     (n) ==   8 ? 3 :       \
-                     (n) ==  16 ? 4 :       \
-                     (n) ==  32 ? 5 :       \
-                     (n) ==  64 ? 6 :       \
-                     (n) == 128 ? 7 :  -1)
+// Sub-pel offset corresponding to a half-pixel shift
+#define HALF_PIXEL_OFFSET (BIL_SUBPEL_SHIFTS / 2)
+
+/*****************************************************************************
+ * Horizontal additions
+ *****************************************************************************/
+
+static INLINE int32_t hsum_epi32_si32(__m128i v_d) {
+  v_d = _mm_hadd_epi32(v_d, v_d);
+  v_d = _mm_hadd_epi32(v_d, v_d);
+  return _mm_cvtsi128_si32(v_d);
+}
+
+static INLINE int64_t hsum_epi64_si64(__m128i v_q) {
+  v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
+#if ARCH_X86_64
+  return _mm_cvtsi128_si64(v_q);
+#else
+  {
+    int64_t tmp;
+    _mm_storel_epi64((__m128i*)&tmp, v_q);
+    return tmp;
+  }
+#endif
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int64_t hsum_epi32_si64(__m128i v_d) {
+  const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
+  const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
+  const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
+  return hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
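These horizontal-add helpers replace the open-coded lane reductions that used to sit at the end of every kernel. A quick illustration of the intended results, assuming it sits in the same translation unit as the helpers (values chosen arbitrarily):

// Illustration only: each helper reduces the packed lanes to a scalar total.
static void hsum_illustration(void) {
  assert(hsum_epi32_si32(_mm_setr_epi32(1, -2, 3, 4)) == 6);
  assert(hsum_epi64_si64(_mm_set_epi64x(40, 2)) == 42);
}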
+static INLINE int calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q,
+                                       unsigned int* sse,
+                                       const int w, const int h) {
+  int64_t sum64;
+  uint64_t sse64;
+
+  // Horizontal sum
+  sum64 = hsum_epi32_si32(v_sum_d);
+  sse64 = hsum_epi64_si64(v_sse_q);
+
+  sum64 = (sum64 >= 0) ? sum64 : -sum64;
+
+  // Round
+  sum64 = ROUND_POWER_OF_TWO(sum64, 6);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 12);
+
+  // Store the SSE
+  *sse = (unsigned int)sse64;
+  // Compute the variance
+  return *sse - ((sum64 * sum64) / (w * h));
+}
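calc_masked_variance gathers the rounding and the sum-of-squares correction that each kernel used to repeat. The accumulators hold mask-weighted values (weights in [0, 64]), so the sum is divided by 64 and the SSE by 64*64 with round-to-nearest, and then variance = SSE - sum^2 / (w * h). After the horizontal reduction, the same arithmetic in plain C would look like this (sketch only, masked_variance_ref is not part of the change):

// Sketch: scalar equivalent of calc_masked_variance() after the reduction.
static unsigned int masked_variance_ref(int64_t sum, uint64_t sse,
                                        unsigned int *out_sse, int w, int h) {
  if (sum < 0) sum = -sum;            // only |sum| matters for sum * sum
  sum = ROUND_POWER_OF_TWO(sum, 6);   // remove the x64 mask weighting
  sse = ROUND_POWER_OF_TWO(sse, 12);  // remove the x64*64 weighting
  *out_sse = (unsigned int)sse;
  return *out_sse - (unsigned int)((uint64_t)(sum * sum) / (w * h));
}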
 
 /*****************************************************************************
  * n*16 Wide versions
@@ -98,30 +144,7 @@
     m += m_stride;
   }
 
-  // Horizontal sum
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_srli_si128(v_sse_q, 8));
-
-  // Round
-  v_sum_d = _mm_sub_epi32(v_sum_d, _mm_cmplt_epi32(v_sum_d, v_zero));
-  v_sum_d = _mm_add_epi32(v_sum_d, _mm_set_epi32(0, 0, 0, 31));
-  v_sum_d = _mm_srai_epi32(v_sum_d, 6);
-
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_set_epi32(0, 0, 0, 2047));
-  v_sse_q = _mm_srli_epi64(v_sse_q, 12);
-
-  // Store the SSE
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
-  // Compute the variance
-  v_sum_d = _mm_abs_epi32(v_sum_d);
-  v_sum_d = _mm_mul_epu32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_srl_epi64(v_sum_d,
-                          _mm_set_epi32(0, 0, 0, LOG2_P2(w) + LOG2_P2(h)));
-  v_sse_q = _mm_sub_epi64(v_sse_q, v_sum_d);
-
-  return _mm_cvtsi128_si32(v_sse_q);
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
 }
 
 #define MASKED_VARWXH(W, H)                                               \
@@ -144,6 +167,11 @@
 MASKED_VARWXH(32, 64)
 MASKED_VARWXH(64, 32)
 MASKED_VARWXH(64, 64)
+#if CONFIG_EXT_PARTITION
+MASKED_VARWXH(64, 128)
+MASKED_VARWXH(128, 64)
+MASKED_VARWXH(128, 128)
+#endif  // CONFIG_EXT_PARTITION
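The 64x128, 128x64 and 128x128 wrappers come from the same MASKED_VARWXH macro as the existing sizes. Its body is not visible in this hunk; by analogy with the high-bit-depth helper further down, the 128x128 instantiation is expected to expand roughly as below (the internal helper name is an assumption):

// Approximate expansion of MASKED_VARWXH(128, 128); sketch only.
unsigned int vpx_masked_variance128x128_ssse3(
    const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
    const uint8_t *m, int m_stride, unsigned int *sse) {
  return masked_variancewxh_ssse3(a, a_stride, b, b_stride,
                                  m, m_stride, 128, 128, sse);
}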
 
 /*****************************************************************************
  * 8 Wide versions
@@ -198,29 +226,7 @@
     m += m_stride;
   }
 
-  // Horizontal sum
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_srli_si128(v_sse_q, 8));
-
-  // Round
-  v_sum_d = _mm_sub_epi32(v_sum_d, _mm_cmplt_epi32(v_sum_d, v_zero));
-  v_sum_d = _mm_add_epi32(v_sum_d, _mm_set_epi32(0, 0, 0, 31));
-  v_sum_d = _mm_srai_epi32(v_sum_d, 6);
-
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_set_epi32(0, 0, 0, 2047));
-  v_sse_q = _mm_srli_epi64(v_sse_q, 12);
-
-  // Store the SSE
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
-  // Compute the variance
-  v_sum_d = _mm_abs_epi32(v_sum_d);
-  v_sum_d = _mm_mul_epu32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_srl_epi64(v_sum_d, _mm_set_epi32(0, 0, 0, LOG2_P2(h) + 3));
-  v_sse_q = _mm_sub_epi64(v_sse_q, v_sum_d);
-
-  return _mm_cvtsi128_si32(v_sse_q);
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
 }
 
 #define MASKED_VAR8XH(H)                                                  \
@@ -302,29 +308,7 @@
     m += m_stride * 2;
   }
 
-  // Horizontal sum
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_srli_si128(v_sse_q, 8));
-
-  // Round
-  v_sum_d = _mm_sub_epi32(v_sum_d, _mm_cmplt_epi32(v_sum_d, v_zero));
-  v_sum_d = _mm_add_epi32(v_sum_d, _mm_set_epi32(0, 0, 0, 31));
-  v_sum_d = _mm_srai_epi32(v_sum_d, 6);
-
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_set_epi32(0, 0, 0, 2047));
-  v_sse_q = _mm_srli_epi64(v_sse_q, 12);
-
-  // Store the SSE
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
-  // Compute the variance
-  v_sum_d = _mm_abs_epi32(v_sum_d);
-  v_sum_d = _mm_mul_epu32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_srl_epi64(v_sum_d, _mm_set_epi32(0, 0, 0, LOG2_P2(h) + 2));
-  v_sse_q = _mm_sub_epi64(v_sse_q, v_sum_d);
-
-  return _mm_cvtsi128_si32(v_sse_q);
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
 }
 
 #define MASKED_VAR4XH(H)                                                  \
@@ -350,13 +334,13 @@
     const uint16_t *b, int  b_stride,
     const uint8_t *m, int  m_stride,
     int w, int  h,
-    __m128i* v_sum_d, __m128i* v_sse_q) {
+    int64_t *sum, uint64_t *sse) {
   int ii, jj;
 
   const __m128i v_zero = _mm_setzero_si128();
 
-  *v_sum_d = _mm_setzero_si128();
-  *v_sse_q = _mm_setzero_si128();
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
 
   assert((w % 8) == 0);
 
@@ -373,7 +357,7 @@
       // Difference: [-4095, 4095]
       const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
 
-      // Error - [-4095, 4095] * [0, 64] => fits in 19 bits (incld sign bit)
+      // Error - [-4095, 4095] * [0, 64] => sum of 2 of these fits in 19 bits
       const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
 
       // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
@@ -397,8 +381,8 @@
       v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
 
       // Accumulate
-      *v_sum_d = _mm_add_epi32(*v_sum_d, v_e_d);
-      *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_q);
+      v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
+      v_sse_q = _mm_add_epi64(v_sse_q, v_se_q);
     }
 
     // Move on to next row
@@ -408,17 +392,13 @@
   }
 
   // Horizontal sum
-  *v_sum_d = _mm_hadd_epi32(*v_sum_d, *v_sum_d);
-  *v_sum_d = _mm_hadd_epi32(*v_sum_d, *v_sum_d);
-  *v_sse_q = _mm_add_epi64(*v_sse_q, _mm_srli_si128(*v_sse_q, 8));
+  *sum = hsum_epi32_si64(v_sum_d);
+  *sse = hsum_epi64_si64(v_sse_q);
 
   // Round
-  *v_sum_d = _mm_sub_epi32(*v_sum_d, _mm_cmplt_epi32(*v_sum_d, v_zero));
-  *v_sum_d = _mm_add_epi32(*v_sum_d, _mm_set_epi32(0, 0, 0, 31));
-  *v_sum_d = _mm_srai_epi32(*v_sum_d, 6);
-
-  *v_sse_q = _mm_add_epi64(*v_sse_q, _mm_set_epi32(0, 0, 0, 2047));
-  *v_sse_q = _mm_srli_epi64(*v_sse_q, 12);
+  *sum = (*sum >= 0) ? *sum : -*sum;
+  *sum = ROUND_POWER_OF_TWO(*sum, 6);
+  *sse = ROUND_POWER_OF_TWO(*sse, 12);
 }
 
 // Main calculation for 4 wide blocks
@@ -427,13 +407,13 @@
     const uint16_t *b, int  b_stride,
     const uint8_t *m, int  m_stride,
     int  h,
-    __m128i* v_sum_d, __m128i* v_sse_q) {
+    int64_t *sum, uint64_t *sse) {
   int ii;
 
   const __m128i v_zero = _mm_setzero_si128();
 
-  *v_sum_d = _mm_setzero_si128();
-  *v_sse_q = _mm_setzero_si128();
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
 
   assert((h % 2) == 0);
 
@@ -481,8 +461,8 @@
     v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
 
     // Accumulate
-    *v_sum_d = _mm_add_epi32(*v_sum_d, v_e_d);
-    *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_q);
+    v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
+    v_sse_q = _mm_add_epi64(v_sse_q, v_se_q);
 
     // Move on to next row
     a += a_stride * 2;
@@ -491,17 +471,13 @@
   }
 
   // Horizontal sum
-  *v_sum_d = _mm_hadd_epi32(*v_sum_d, *v_sum_d);
-  *v_sum_d = _mm_hadd_epi32(*v_sum_d, *v_sum_d);
-  *v_sse_q = _mm_add_epi64(*v_sse_q, _mm_srli_si128(*v_sse_q, 8));
+  *sum = hsum_epi32_si32(v_sum_d);
+  *sse = hsum_epi64_si64(v_sse_q);
 
   // Round
-  *v_sum_d = _mm_sub_epi32(*v_sum_d, _mm_cmplt_epi32(*v_sum_d, v_zero));
-  *v_sum_d = _mm_add_epi32(*v_sum_d, _mm_set_epi32(0, 0, 0, 31));
-  *v_sum_d = _mm_srai_epi32(*v_sum_d, 6);
-
-  *v_sse_q = _mm_add_epi64(*v_sse_q, _mm_set_epi32(0, 0, 0, 2047));
-  *v_sse_q = _mm_srli_epi64(*v_sse_q, 12);
+  *sum = (*sum >= 0) ? *sum : -*sum;
+  *sum = ROUND_POWER_OF_TWO(*sum, 6);
+  *sse = ROUND_POWER_OF_TWO(*sse, 12);
 }
 
 static INLINE unsigned int highbd_masked_variancewxh_ssse3(
@@ -510,26 +486,20 @@
     const uint8_t *m, int  m_stride,
     int w, int  h,
     unsigned int *sse) {
-  __m128i v_sum_d, v_sse_q;
+  uint64_t sse64;
+  int64_t sum64;
 
   if (w == 4)
     highbd_masked_variance64_4wide_ssse3(a, a_stride, b,  b_stride, m, m_stride,
-            h, &v_sum_d, &v_sse_q);
+            h, &sum64, &sse64);
   else
     highbd_masked_variance64_ssse3(a, a_stride, b,  b_stride, m, m_stride, w, h,
-            &v_sum_d, &v_sse_q);
+            &sum64, &sse64);
 
   // Store the SSE
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
-  // Compute the variance
-  v_sum_d = _mm_abs_epi32(v_sum_d);
-  v_sum_d = _mm_mul_epu32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_srl_epi64(v_sum_d,
-                          _mm_set_epi32(0, 0, 0, LOG2_P2(w) + LOG2_P2(h)));
-  v_sse_q = _mm_sub_epi64(v_sse_q, v_sum_d);
-
-  return _mm_cvtsi128_si32(v_sse_q);
+  *sse = (unsigned int)sse64;
+  // Compute and return variance
+  return *sse - ((sum64 * sum64) / (w * h));
 }
 
 static INLINE unsigned int highbd_10_masked_variancewxh_ssse3(
@@ -538,32 +508,24 @@
     const uint8_t *m, int  m_stride,
     int w, int  h,
     unsigned int *sse) {
-  __m128i v_sum_d, v_sse_q;
+  uint64_t sse64;
+  int64_t sum64;
 
   if (w == 4)
     highbd_masked_variance64_4wide_ssse3(a, a_stride, b,  b_stride, m, m_stride,
-            h, &v_sum_d, &v_sse_q);
+            h, &sum64, &sse64);
   else
     highbd_masked_variance64_ssse3(a, a_stride, b,  b_stride, m, m_stride, w, h,
-            &v_sum_d, &v_sse_q);
+            &sum64, &sse64);
 
-  // Round sum and sse
-  v_sum_d = _mm_srai_epi32(_mm_add_epi32(v_sum_d,
-          _mm_set_epi32(0, 0, 0, 1 << 1)), 2);
-  v_sse_q = _mm_srli_epi64(_mm_add_epi64(v_sse_q,
-          _mm_set_epi32(0, 0, 0, 1 << 3)), 4);
+  // Normalise
+  sum64 = ROUND_POWER_OF_TWO(sum64, 2);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 4);
 
   // Store the SSE
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
-  // Compute the variance
-  v_sum_d = _mm_abs_epi32(v_sum_d);
-  v_sum_d = _mm_mul_epu32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_srl_epi64(v_sum_d,
-                          _mm_set_epi32(0, 0, 0, LOG2_P2(w) + LOG2_P2(h)));
-  v_sse_q = _mm_sub_epi64(v_sse_q, v_sum_d);
-
-  return _mm_cvtsi128_si32(v_sse_q);
+  *sse = (unsigned int)sse64;
+  // Compute and return variance
+  return *sse - ((sum64 * sum64) / (w * h));
 }
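The 10-bit path additionally rescales the accumulators to the 8-bit range before the variance is formed: 10-bit differences are four times larger, so the sum is divided by 4 and the SSE by 16. A tiny worked check of that normalisation (sketch only):

// Worked check (sketch): a constant 10-bit difference of 4 over 64 pixels
// gives sum = 256 and sse = 1024 once the mask weighting is removed;
// normalising yields 64 and 64, matching a constant 8-bit difference of 1.
static void highbd_10_normalisation_check(void) {
  assert(ROUND_POWER_OF_TWO(256, 2) == 64);   // sum / 4
  assert(ROUND_POWER_OF_TWO(1024, 4) == 64);  // sse / 16
}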
 
 static INLINE unsigned int highbd_12_masked_variancewxh_ssse3(
@@ -572,32 +534,23 @@
     const uint8_t *m, int  m_stride,
     int w, int  h,
     unsigned int *sse) {
-  __m128i v_sum_d, v_sse_q;
+  uint64_t sse64;
+  int64_t sum64;
 
   if (w == 4)
     highbd_masked_variance64_4wide_ssse3(a, a_stride, b,  b_stride, m, m_stride,
-            h, &v_sum_d, &v_sse_q);
+            h, &sum64, &sse64);
   else
     highbd_masked_variance64_ssse3(a, a_stride, b,  b_stride, m, m_stride, w, h,
-            &v_sum_d, &v_sse_q);
+            &sum64, &sse64);
 
-  // Round sum and sse
-  v_sum_d = _mm_srai_epi32(_mm_add_epi32(v_sum_d,
-          _mm_set_epi32(0, 0, 0, 1 << 3)), 4);
-  v_sse_q = _mm_srli_epi64(_mm_add_epi64(v_sse_q,
-          _mm_set_epi32(0, 0, 0, 1 << 7)), 8);
+  sum64 = ROUND_POWER_OF_TWO(sum64, 4);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 8);
 
   // Store the SSE
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
-  // Compute the variance
-  v_sum_d = _mm_abs_epi32(v_sum_d);
-  v_sum_d = _mm_mul_epu32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_srl_epi64(v_sum_d,
-                          _mm_set_epi32(0, 0, 0, LOG2_P2(w) + LOG2_P2(h)));
-  v_sse_q = _mm_sub_epi64(v_sse_q, v_sum_d);
-
-  return _mm_cvtsi128_si32(v_sse_q);
+  *sse = (unsigned int)sse64;
+  // Compute and return variance
+  return *sse - ((sum64 * sum64) / (w * h));
 }
 
 #define HIGHBD_MASKED_VARWXH(W, H)                                             \
@@ -653,6 +606,11 @@
 HIGHBD_MASKED_VARWXH(32, 64)
 HIGHBD_MASKED_VARWXH(64, 32)
 HIGHBD_MASKED_VARWXH(64, 64)
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASKED_VARWXH(64, 128)
+HIGHBD_MASKED_VARWXH(128, 64)
+HIGHBD_MASKED_VARWXH(128, 128)
+#endif  // CONFIG_EXT_PARTITION
 
 #endif
 
@@ -663,8 +621,8 @@
 typedef __m128i (*filter_fn_t)(__m128i v_a_b, __m128i v_b_b,
                                     __m128i v_filter_b);
 
-static INLINE __m128i apply_filter8(const __m128i v_a_b, const __m128i v_b_b,
-                                    const __m128i v_filter_b) {
+static INLINE __m128i apply_filter_avg(const __m128i v_a_b, const __m128i v_b_b,
+                                       const __m128i v_filter_b) {
   (void) v_filter_b;
   return _mm_avg_epu8(v_a_b, v_b_b);
 }
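apply_filter_avg is the half-pixel fast path: at HALF_PIXEL_OFFSET both bilinear taps are 64, so (64 * a + 64 * b + 64) >> 7 reduces to (a + b + 1) >> 1, which is exactly what _mm_avg_epu8 computes per byte. A scalar restatement of that equivalence (sketch only):

// Sketch: the half-pixel bilinear filter is just a rounded byte average.
static uint8_t half_pel_ref(uint8_t a, uint8_t b) {
  return (uint8_t)((64 * a + 64 * b + 64) >> 7);  // == (a + b + 1) >> 1
}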
@@ -735,31 +693,6 @@
   *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_hi_q);
 }
 
-static INLINE int calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q,
-                                       unsigned int* sse,
-                                       const int w, const int h) {
-  int sum;
-
-  // Horizontal sum
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_srli_si128(v_sse_q, 8));
-
-  // Round
-  sum = _mm_cvtsi128_si32(v_sum_d);
-  sum = (sum >= 0) ? ((sum + 31) >> 6) : -((-sum + 31) >> 6);
-
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_set_epi32(0, 0, 0, 2047));
-  v_sse_q = _mm_srli_epi64(v_sse_q, 12);
-
-  // Store the SSE
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
-  // Compute the variance
-  return  *sse - (((int64_t)sum * sum) >> (LOG2_P2(h) + LOG2_P2(w)));
-}
-
-
 // Functions for width (W) >= 16
 unsigned int vpx_masked_subpel_varWxH_xzero(
         const uint8_t *src, int src_stride, int yoffset,
@@ -770,9 +703,9 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   const __m128i v_filter_b = _mm_set1_epi16((
-        vpx_bilinear_filters[yoffset][1] << 8) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(yoffset < 8);
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   for (j = 0; j < w; j += 16) {
     // Load the first row ready
     v_src0_b = _mm_loadu_si128((const __m128i*)(src + j));
@@ -814,9 +747,9 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   const __m128i v_filter_b = _mm_set1_epi16((
-        vpx_bilinear_filters[xoffset][1] << 8) +
-        vpx_bilinear_filters[xoffset][0]);
-  assert(xoffset < 8);
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
   for (i = 0; i < h; i++) {
     for (j = 0; j < w; j += 16) {
       // Load this row and one below & apply the filter to them
@@ -846,13 +779,13 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   const __m128i v_filterx_b = _mm_set1_epi16((
-        vpx_bilinear_filters[xoffset][1] << 8) +
-        vpx_bilinear_filters[xoffset][0]);
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
   const __m128i v_filtery_b = _mm_set1_epi16((
-        vpx_bilinear_filters[yoffset][1] << 8) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(yoffset < 8);
-  assert(xoffset < 8);
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
   for (j = 0; j < w; j += 16) {
     // Load the first row ready
     v_src0_b = _mm_loadu_si128((const __m128i*)(src + j));
@@ -908,9 +841,9 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filter_b = _mm_set1_epi16((
-        vpx_bilinear_filters[yoffset][1] << 8) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(yoffset < 8);
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   // Load the first row of src data ready
   v_src0_b = _mm_loadl_epi64((const __m128i*)src);
   for (i = 0; i < h; i += 4) {
@@ -938,7 +871,7 @@
     v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
     v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
     // Apply the y filter
-    if (yoffset == 8) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
       v_src1_b = _mm_unpacklo_epi64(v_src3_b, v_src1_b);
       v_src2_b = _mm_or_si128(_mm_slli_si128(v_src1_b, 4),
             _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0)));
@@ -974,13 +907,13 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filter_b = _mm_set1_epi16((
-        vpx_bilinear_filters[yoffset][1] << 8) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(yoffset < 8);
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   // Load the first row of src data ready
   v_src0_b = _mm_loadl_epi64((const __m128i*)src);
   for (i = 0; i < h; i += 2) {
-    if (yoffset == 8) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
       // Load the rest of the source data for these rows
       v_src1_b = _mm_or_si128(
             _mm_slli_si128(v_src0_b, 8),
@@ -1030,9 +963,9 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filter_b = _mm_set1_epi16((
-        vpx_bilinear_filters[xoffset][1] << 8) +
-        vpx_bilinear_filters[xoffset][0]);
-  assert(xoffset < 8);
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
   for (i = 0; i < h; i += 4) {
     // Load the src data
     v_src0_b = _mm_loadl_epi64((const __m128i*)src);
@@ -1064,7 +997,7 @@
     v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
     v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
     // Apply the x filter
-    if (xoffset == 8) {
+    if (xoffset == HALF_PIXEL_OFFSET) {
       v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b);
       v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b);
       v_res_b = _mm_avg_epu8(v_src0_b, v_src0_shift_b);
@@ -1093,9 +1026,9 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filter_b = _mm_set1_epi16((
-        vpx_bilinear_filters[xoffset][1] << 8) +
-        vpx_bilinear_filters[xoffset][0]);
-  assert(xoffset < 8);
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
   for (i = 0; i < h; i += 2) {
     // Load the src data
     v_src0_b = _mm_loadu_si128((const __m128i*)(src));
@@ -1103,7 +1036,7 @@
     v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride));
     v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
     // Apply the x filter
-    if (xoffset == 8) {
+    if (xoffset == HALF_PIXEL_OFFSET) {
       v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
       v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
       v_res_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
@@ -1145,13 +1078,13 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filterx_b = _mm_set1_epi16((
-        vpx_bilinear_filters[xoffset][1] << 8) +
-        vpx_bilinear_filters[xoffset][0]);
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
   __m128i v_filtery_b = _mm_set1_epi16((
-        vpx_bilinear_filters[yoffset][1] << 8) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(xoffset < 8);
-  assert(yoffset < 8);
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   for (i = 0; i < h; i += 4) {
     // Load the src data
     v_src0_b = _mm_loadl_epi64((const __m128i*)src);
@@ -1167,7 +1100,7 @@
     v_src3_shift_b = _mm_srli_si128(v_src3_b, 1);
     v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b);
     // Apply the x filter
-    if (xoffset == 8) {
+    if (xoffset == HALF_PIXEL_OFFSET) {
       v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b);
       v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b);
       v_xres_b[i == 0 ? 0 : 1] = _mm_avg_epu8(v_src0_b, v_src0_shift_b);
@@ -1183,7 +1116,7 @@
   v_src0_b = _mm_loadl_epi64((const __m128i*)src);
   v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
   // Apply the x filter
-  if (xoffset == 8) {
+  if (xoffset == HALF_PIXEL_OFFSET) {
     v_extra_row_b = _mm_and_si128(
             _mm_avg_epu8(v_src0_b, v_src0_shift_b),
             _mm_setr_epi32(-1, 0, 0, 0));
@@ -1203,7 +1136,7 @@
                               v_extra_row_b);
     }
     // Apply the y filter
-    if (yoffset == 8) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
       v_res_b = _mm_avg_epu8(v_xres_b[i == 0 ? 0 : 1], v_temp_b);
     } else {
       v_res_b = apply_filter(v_xres_b[i == 0 ? 0 : 1], v_temp_b, v_filtery_b);
@@ -1245,21 +1178,20 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filterx_b = _mm_set1_epi16((
-        vpx_bilinear_filters[xoffset][1] << 8) +
-        vpx_bilinear_filters[xoffset][0]);
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
   __m128i v_filtery_b = _mm_set1_epi16((
-        vpx_bilinear_filters[yoffset][1] << 8) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(xoffset < 8);
-  assert(yoffset < 8);
-
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   // Load the first block of src data
   v_src0_b = _mm_loadu_si128((const __m128i*)(src));
   v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
   v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride));
   v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
   // Apply the x filter
-  if (xoffset == 8) {
+  if (xoffset == HALF_PIXEL_OFFSET) {
     v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
     v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
     v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
@@ -1275,7 +1207,7 @@
     v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 3));
     v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
     // Apply the x filter
-    if (xoffset == 8) {
+    if (xoffset == HALF_PIXEL_OFFSET) {
       v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
       v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
       v_xres1_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
@@ -1287,7 +1219,7 @@
     // Apply the y filter to the previous block
     v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres0_b, 8),
                             _mm_slli_si128(v_xres1_b, 8));
-    if (yoffset == 8) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
       v_res_b = _mm_avg_epu8(v_xres0_b, v_temp_b);
     } else {
       v_res_b = apply_filter(v_xres0_b, v_temp_b, v_filtery_b);
@@ -1309,7 +1241,7 @@
     v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 5));
     v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
     // Apply the x filter
-    if (xoffset == 8) {
+    if (xoffset == HALF_PIXEL_OFFSET) {
       v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
       v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
       v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
@@ -1321,7 +1253,7 @@
     // Apply the y filter to the previous block
     v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres1_b, 8),
                             _mm_slli_si128(v_xres0_b, 8));
-    if (yoffset == 8) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
       v_res_b = _mm_avg_epu8(v_xres1_b, v_temp_b);
     } else {
       v_res_b = apply_filter(v_xres1_b, v_temp_b, v_filtery_b);
@@ -1359,41 +1291,45 @@
       return vpx_masked_variance##W##x##H##_ssse3(src, src_stride,             \
                                                   dst, dst_stride,             \
                                                   msk, msk_stride, sse);       \
-    else if (yoffset == 8)                                                     \
-      return vpx_masked_subpel_varWxH_xzero(src, src_stride, 8,                \
+    else if (yoffset == HALF_PIXEL_OFFSET)                                     \
+      return vpx_masked_subpel_varWxH_xzero(src, src_stride,                   \
+                                            HALF_PIXEL_OFFSET,                 \
                                             dst, dst_stride, msk, msk_stride,  \
-                                            sse, W, H, apply_filter8);         \
+                                            sse, W, H, apply_filter_avg);      \
     else                                                                       \
-      return vpx_masked_subpel_varWxH_xzero(src, src_stride, yoffset,          \
+      return vpx_masked_subpel_varWxH_xzero(src, src_stride,                   \
+                                            yoffset,                           \
                                             dst, dst_stride, msk, msk_stride,  \
                                             sse, W, H, apply_filter);          \
   } else if (yoffset == 0) {                                                   \
-    if (xoffset == 8)                                                          \
-      return vpx_masked_subpel_varWxH_yzero(src, src_stride, 8,                \
+    if (xoffset == HALF_PIXEL_OFFSET)                                          \
+      return vpx_masked_subpel_varWxH_yzero(src, src_stride,                   \
+                                            HALF_PIXEL_OFFSET,                 \
                                             dst, dst_stride, msk, msk_stride,  \
-                                            sse, W, H, apply_filter8);         \
+                                            sse, W, H, apply_filter_avg);      \
     else                                                                       \
-      return vpx_masked_subpel_varWxH_yzero(src, src_stride, xoffset,          \
+      return vpx_masked_subpel_varWxH_yzero(src, src_stride,                   \
+                                            xoffset,                           \
                                             dst, dst_stride, msk, msk_stride,  \
                                             sse, W, H, apply_filter);          \
-  } else if (xoffset == 8) {                                                   \
-    if (yoffset == 8)                                                          \
+  } else if (xoffset == HALF_PIXEL_OFFSET) {                                   \
+    if (yoffset == HALF_PIXEL_OFFSET)                                          \
       return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride,       \
-              8, 8, dst, dst_stride, msk, msk_stride, sse, W, H,               \
-              apply_filter8, apply_filter8);                                   \
+              HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, dst_stride, msk,      \
+              msk_stride, sse, W, H, apply_filter_avg, apply_filter_avg);      \
     else                                                                       \
       return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride,       \
-              8, yoffset, dst, dst_stride, msk, msk_stride, sse, W, H,         \
-              apply_filter8, apply_filter);                                    \
+              HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk,                \
+              msk_stride, sse, W, H, apply_filter_avg, apply_filter);          \
   } else {                                                                     \
-    if (yoffset == 8)                                                          \
+    if (yoffset == HALF_PIXEL_OFFSET)                                          \
       return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride,       \
-              xoffset, 8, dst, dst_stride, msk, msk_stride, sse, W, H,         \
-              apply_filter, apply_filter8);                                    \
+              xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk,                \
+              msk_stride, sse, W, H, apply_filter, apply_filter_avg);          \
     else                                                                       \
       return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride,       \
-              xoffset, yoffset, dst, dst_stride, msk, msk_stride, sse, W, H,   \
-              apply_filter, apply_filter);                                     \
+              xoffset, yoffset, dst, dst_stride, msk,                          \
+              msk_stride, sse, W, H, apply_filter, apply_filter);              \
   }                                                                            \
 }
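As a usage illustration (not part of the patch), for a 64x64 instantiation the rewritten dispatch above resolves a call with xoffset == 0 and yoffset == HALF_PIXEL_OFFSET to the averaging specialisation, i.e. the macro body expands to:

/* W == H == 64, (xoffset, yoffset) == (0, HALF_PIXEL_OFFSET): */
return vpx_masked_subpel_varWxH_xzero(src, src_stride,
                                      HALF_PIXEL_OFFSET,
                                      dst, dst_stride, msk, msk_stride,
                                      sse, 64, 64, apply_filter_avg);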
 
@@ -1437,6 +1373,11 @@
 MASK_SUBPIX_VAR_LARGE(32, 64)
 MASK_SUBPIX_VAR_LARGE(64, 32)
 MASK_SUBPIX_VAR_LARGE(64, 64)
+#if CONFIG_EXT_PARTITION
+MASK_SUBPIX_VAR_LARGE(64, 128)
+MASK_SUBPIX_VAR_LARGE(128, 64)
+MASK_SUBPIX_VAR_LARGE(128, 128)
+#endif  // CONFIG_EXT_PARTITION
 
 #if CONFIG_VP9_HIGHBITDEPTH
 typedef int (*highbd_calc_masked_var_t)(__m128i v_sum_d, __m128i v_sse_q,
@@ -1449,9 +1390,9 @@
 typedef __m128i (*highbd_filter_fn_t)(__m128i v_a_w, __m128i v_b_w,
                                     __m128i v_filter_w);
 
-static INLINE __m128i highbd_apply_filter8(const __m128i v_a_w,
-                                           const __m128i v_b_w,
-                                           const __m128i v_filter_w) {
+static INLINE __m128i highbd_apply_filter_avg(const __m128i v_a_w,
+                                              const __m128i v_b_w,
+                                              const __m128i v_filter_w) {
   (void) v_filter_w;
   return _mm_avg_epu16(v_a_w, v_b_w);
 }
@@ -1523,55 +1464,53 @@
                                                  __m128i v_sse_q,
                                                  unsigned int* sse,
                                                  const int w, const int h) {
-  int sum;
+  int64_t sum64;
+  uint64_t sse64;
 
   // Horizontal sum
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_srli_si128(v_sse_q, 8));
+  sum64 = hsum_epi32_si32(v_sum_d);
+  sse64 = hsum_epi64_si64(v_sse_q);
+
+  sum64 = (sum64 >= 0) ? sum64 : -sum64;
 
   // Round
-  sum = _mm_cvtsi128_si32(v_sum_d);
-  sum = (sum >= 0) ? ((sum + 31) >> 6) : -((-sum + 31) >> 6);
-  sum = ROUND_POWER_OF_TWO(sum, 2);
+  sum64 = ROUND_POWER_OF_TWO(sum64, 6);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 12);
 
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_set_epi32(0, 0, 0, 2047));
-  v_sse_q = _mm_srli_epi64(v_sse_q, 12);
+  // Normalise the 10-bit results to 8-bit precision
+  sum64 = ROUND_POWER_OF_TWO(sum64, 2);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 4);
 
   // Store the SSE
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_set_epi32(0, 0, 0, 0x8));
-  v_sse_q = _mm_srli_epi64(v_sse_q, 4);
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
+  *sse = (unsigned int)sse64;
   // Compute the variance
-  return  *sse - (((int64_t)sum * sum) >> (LOG2_P2(h) + LOG2_P2(w)));
+  return  *sse - ((sum64 * sum64) / (w * h));
 }
 static INLINE int highbd_12_calc_masked_variance(__m128i v_sum_d,
                                                  __m128i v_sse_q,
                                                  unsigned int* sse,
                                                  const int w, const int h) {
-  int sum;
+  int64_t sum64;
+  uint64_t sse64;
 
   // Horizontal sum
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sum_d = _mm_hadd_epi32(v_sum_d, v_sum_d);
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_srli_si128(v_sse_q, 8));
+  sum64 = hsum_epi32_si64(v_sum_d);
+  sse64 = hsum_epi64_si64(v_sse_q);
+
+  sum64 = (sum64 >= 0) ? sum64 : -sum64;
 
   // Round
-  sum = _mm_cvtsi128_si32(v_sum_d);
-  sum = (sum >= 0) ? ((sum + 31) >> 6) : -((-sum + 31) >> 6);
-  sum = ROUND_POWER_OF_TWO(sum, 4);
+  sum64 = ROUND_POWER_OF_TWO(sum64, 6);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 12);
 
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_set_epi32(0, 0, 0, 2047));
-  v_sse_q = _mm_srli_epi64(v_sse_q, 12);
+  // Normalise the 12-bit results to 8-bit precision
+  sum64 = ROUND_POWER_OF_TWO(sum64, 4);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 8);
 
   // Store the SSE
-  v_sse_q = _mm_add_epi64(v_sse_q, _mm_set_epi32(0, 0, 0, 0x80));
-  v_sse_q = _mm_srli_epi64(v_sse_q, 8);
-  *sse = _mm_cvtsi128_si32(v_sse_q);
-
+  *sse = (unsigned int)sse64;
   // Compute the variance
-  return  *sse - (((int64_t)sum * sum) >> (LOG2_P2(h) + LOG2_P2(w)));
+  return  *sse - ((sum64 * sum64) / (w * h));
 }
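For reference (assumption: the standard ROUND_POWER_OF_TWO macro from vpx_dsp's common header), the scalar reductions above first strip the filter/mask scaling (sum rounded down by 6 bits, SSE by 12) and then normalise the high-bit-depth results towards 8-bit precision, 2/4 extra shift bits for 10-bit and 4/8 for 12-bit, mirroring the regular highbd variance code:

/* Shown for reference only: */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))
/* e.g. ROUND_POWER_OF_TWO(2047, 12) == 0 and ROUND_POWER_OF_TWO(2048, 12) == 1,
 * replacing the removed "add 2047 / 0x8 / 0x80, then shift" sequences. */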
 
 
@@ -1586,9 +1525,9 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   const __m128i v_filter_w = _mm_set1_epi32((
-        vpx_bilinear_filters[yoffset][1] << 16) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(yoffset < 8);
+        bilinear_filters_2t[yoffset][1] << 16) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   for (j = 0; j < w; j += 8) {
     // Load the first row ready
     v_src0_w = _mm_loadu_si128((const __m128i*)(src + j));
@@ -1631,9 +1570,9 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   const __m128i v_filter_w = _mm_set1_epi32((
-        vpx_bilinear_filters[xoffset][1] << 16) +
-        vpx_bilinear_filters[xoffset][0]);
-  assert(xoffset < 8);
+        bilinear_filters_2t[xoffset][1] << 16) +
+        bilinear_filters_2t[xoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
   for (i = 0; i < h; i++) {
     for (j = 0; j < w; j += 8) {
       // Load this row & apply the filter to them
@@ -1664,13 +1603,13 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   const __m128i v_filterx_w = _mm_set1_epi32((
-        vpx_bilinear_filters[xoffset][1] << 16) +
-        vpx_bilinear_filters[xoffset][0]);
+        bilinear_filters_2t[xoffset][1] << 16) +
+        bilinear_filters_2t[xoffset][0]);
   const __m128i v_filtery_w = _mm_set1_epi32((
-        vpx_bilinear_filters[yoffset][1] << 16) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(xoffset < 8);
-  assert(yoffset < 8);
+        bilinear_filters_2t[yoffset][1] << 16) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   for (j = 0; j < w; j += 8) {
     // Load the first row ready
     v_src0_w = _mm_loadu_si128((const __m128i*)(src + j));
@@ -1724,13 +1663,13 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filter_w = _mm_set1_epi32((
-        vpx_bilinear_filters[yoffset][1] << 16) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(yoffset < 8);
+        bilinear_filters_2t[yoffset][1] << 16) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   // Load the first row of src data ready
   v_src0_w = _mm_loadl_epi64((const __m128i*)src);
   for (i = 0; i < h; i += 2) {
-    if (yoffset == 8) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
       // Load the rest of the source data for these rows
       v_src1_w = _mm_or_si128(
             _mm_slli_si128(v_src0_w, 8),
@@ -1776,9 +1715,9 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filter_w = _mm_set1_epi32((
-        vpx_bilinear_filters[xoffset][1] << 16) +
-        vpx_bilinear_filters[xoffset][0]);
-  assert(xoffset < 8);
+        bilinear_filters_2t[xoffset][1] << 16) +
+        bilinear_filters_2t[xoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
   for (i = 0; i < h; i += 2) {
     // Load the src data
     v_src0_w = _mm_loadu_si128((const __m128i*)(src));
@@ -1786,7 +1725,7 @@
     v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride));
     v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
     // Apply the x filter
-    if (xoffset == 8) {
+    if (xoffset == HALF_PIXEL_OFFSET) {
       v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
       v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
       v_res_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
@@ -1826,21 +1765,20 @@
   __m128i v_sum_d = _mm_setzero_si128();
   __m128i v_sse_q = _mm_setzero_si128();
   __m128i v_filterx_w = _mm_set1_epi32((
-        vpx_bilinear_filters[xoffset][1] << 16) +
-        vpx_bilinear_filters[xoffset][0]);
+        bilinear_filters_2t[xoffset][1] << 16) +
+        bilinear_filters_2t[xoffset][0]);
   __m128i v_filtery_w = _mm_set1_epi32((
-        vpx_bilinear_filters[yoffset][1] << 16) +
-        vpx_bilinear_filters[yoffset][0]);
-  assert(xoffset < 8);
-  assert(yoffset < 8);
-
+        bilinear_filters_2t[yoffset][1] << 16) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
   // Load the first block of src data
   v_src0_w = _mm_loadu_si128((const __m128i*)(src));
   v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
   v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride));
   v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
   // Apply the x filter
-  if (xoffset == 8) {
+  if (xoffset == HALF_PIXEL_OFFSET) {
     v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
     v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
     v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
@@ -1858,7 +1796,7 @@
     v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 3));
     v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
     // Apply the x filter
-    if (xoffset == 8) {
+    if (xoffset == HALF_PIXEL_OFFSET) {
       v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
       v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
       v_xres1_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
@@ -1872,7 +1810,7 @@
     // Apply the y filter to the previous block
     v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres0_w, 8),
                             _mm_slli_si128(v_xres1_w, 8));
-    if (yoffset == 8) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
       v_res_w = _mm_avg_epu16(v_xres0_w, v_temp_w);
     } else {
       v_res_w = highbd_apply_filter(v_xres0_w, v_temp_w, v_filtery_w);
@@ -1894,7 +1832,7 @@
     v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 5));
     v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
     // Apply the x filter
-    if (xoffset == 8) {
+    if (xoffset == HALF_PIXEL_OFFSET) {
       v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
       v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
       v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
@@ -1908,7 +1846,7 @@
     // Apply the y filter to the previous block
     v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres1_w, 8),
                             _mm_slli_si128(v_xres0_w, 8));
-    if (yoffset == 8) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
       v_res_w = _mm_avg_epu16(v_xres1_w, v_temp_w);
     } else {
       v_res_w = highbd_apply_filter(v_xres1_w, v_temp_w, v_filtery_w);
@@ -1948,55 +1886,61 @@
     if (yoffset == 0)                                                          \
       return full_variance_function(src8, src_stride, dst8, dst_stride,        \
                                     msk, msk_stride, sse);                     \
-    else if (yoffset == 8)                                                     \
-      return vpx_highbd_masked_subpel_varWxH_xzero(src, src_stride, 8,         \
+    else if (yoffset == HALF_PIXEL_OFFSET)                                     \
+      return vpx_highbd_masked_subpel_varWxH_xzero(src, src_stride,            \
+                                                   HALF_PIXEL_OFFSET,          \
                                                    dst, dst_stride,            \
                                                    msk, msk_stride,            \
                                                    sse, W, H,                  \
-                                                   highbd_apply_filter8,       \
+                                                   highbd_apply_filter_avg,    \
                                                    calc_var);                  \
     else                                                                       \
-      return vpx_highbd_masked_subpel_varWxH_xzero(src, src_stride, yoffset,   \
+      return vpx_highbd_masked_subpel_varWxH_xzero(src, src_stride,            \
+                                                   yoffset,                    \
                                                    dst, dst_stride,            \
                                                    msk, msk_stride,            \
                                                    sse, W, H,                  \
                                                    highbd_apply_filter,        \
                                                    calc_var);                  \
   } else if (yoffset == 0) {                                                   \
-    if (xoffset == 8)                                                          \
-      return vpx_highbd_masked_subpel_varWxH_yzero(src, src_stride, 8,         \
+    if (xoffset == HALF_PIXEL_OFFSET)                                          \
+      return vpx_highbd_masked_subpel_varWxH_yzero(src, src_stride,            \
+                                                   HALF_PIXEL_OFFSET,          \
                                                    dst, dst_stride,            \
                                                    msk, msk_stride,            \
                                                    sse, W, H,                  \
-                                                   highbd_apply_filter8,       \
+                                                   highbd_apply_filter_avg,    \
                                                    calc_var);                  \
     else                                                                       \
-      return vpx_highbd_masked_subpel_varWxH_yzero(src, src_stride, xoffset,   \
+      return vpx_highbd_masked_subpel_varWxH_yzero(src, src_stride,            \
+                                                   xoffset,                    \
                                                    dst, dst_stride,            \
                                                    msk, msk_stride,            \
                                                    sse, W, H,                  \
                                                    highbd_apply_filter,        \
                                                    calc_var);                  \
-  } else if (xoffset == 8) {                                                   \
-    if (yoffset == 8)                                                          \
+  } else if (xoffset == HALF_PIXEL_OFFSET) {                                   \
+    if (yoffset == HALF_PIXEL_OFFSET)                                          \
       return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero(                \
-              src, src_stride, 8, 8, dst, dst_stride, msk, msk_stride,         \
-              sse, W, H, highbd_apply_filter8, highbd_apply_filter8, calc_var);\
+              src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET,           \
+              dst, dst_stride, msk, msk_stride, sse, W, H,                     \
+              highbd_apply_filter_avg, highbd_apply_filter_avg, calc_var);     \
     else                                                                       \
       return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero(                \
-              src, src_stride, 8, yoffset, dst, dst_stride,                    \
-              msk, msk_stride, sse, W, H, highbd_apply_filter8,                \
+              src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride,    \
+              msk, msk_stride, sse, W, H, highbd_apply_filter_avg,             \
               highbd_apply_filter, calc_var);                                  \
   } else {                                                                     \
-    if (yoffset == 8)                                                          \
+    if (yoffset == HALF_PIXEL_OFFSET)                                          \
       return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero(                \
-              src, src_stride, xoffset, 8, dst, dst_stride, msk, msk_stride,   \
-              sse, W, H, highbd_apply_filter, highbd_apply_filter8, calc_var); \
+              src, src_stride, xoffset, HALF_PIXEL_OFFSET,                     \
+              dst, dst_stride, msk, msk_stride, sse, W, H,                     \
+              highbd_apply_filter, highbd_apply_filter_avg, calc_var);         \
     else                                                                       \
       return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero(                \
-              src, src_stride, xoffset, yoffset, dst, dst_stride,              \
-               msk, msk_stride, sse, W, H, highbd_apply_filter,                \
-               highbd_apply_filter, calc_var);                                 \
+              src, src_stride, xoffset, yoffset,                               \
+              dst, dst_stride, msk, msk_stride, sse, W, H,                     \
+              highbd_apply_filter, highbd_apply_filter, calc_var);             \
   }                                                                            \
 }
 
@@ -2093,4 +2037,12 @@
 HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 32)
 HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 64)
 HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 64)
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 128)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 128)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 64)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 64)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 128)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 128)
+#endif  // CONFIG_EXT_PARTITION
 #endif