ext-inter: Delete dead code

Patches https://aomedia-review.googlesource.com/c/11987/
and https://aomedia-review.googlesource.com/c/11988/
replaced the old masked motion search pipeline with
a new one that uses different SAD/SSE functions,
leaving behind a substantial amount of dead code.

This patch removes that dead code. Note that this
includes the vectorized SAD/SSE functions, which will
eventually need to be rewritten for the new pipeline,
as well as the masked_compound_variance_* functions,
which turned out not to be used by it.

To make it easier to add vectorized functions later, the
masked_sad/variance_test.cc files are kept, but are modified
to work with the new functions. These tests remain disabled
until the vectorized functions are actually available.
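
For reference, the removed single-reference masked SAD weights each
absolute pixel difference by the mask value and then undoes the 6-bit
mask scaling, whereas the compound functions kept by the new pipeline
blend the two predictors with the mask (AOM_BLEND_A64) and take a
plain SAD against the source. Below is a minimal C sketch of the two
forms; the function names are placeholders, and the compound body is a
reconstruction based on the deleted masked_compound_variance code, not
a copy of the retained implementation.

#include <stdint.h>
#include <stdlib.h>

/* Removed form: mask-weighted SAD against a single reference. */
static unsigned int masked_sad_sketch(const uint8_t *a, int a_stride,
                                      const uint8_t *b, int b_stride,
                                      const uint8_t *m, int m_stride,
                                      int width, int height) {
  unsigned int sad = 0;
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]);
    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  return (sad + 31) >> 6; /* undo the 6-bit mask scaling */
}

/* Compound form (sketch): blend the two predictors with the mask,
 * then take a plain SAD against the source. The blend written out
 * inline here corresponds to AOM_BLEND_A64 with a 6-bit alpha. */
static unsigned int masked_compound_sad_sketch(
    const uint8_t *src, int src_stride, const uint8_t *a, int a_stride,
    const uint8_t *b, int b_stride, const uint8_t *m, int m_stride,
    int width, int height) {
  unsigned int sad = 0;
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++) {
      const int pred = (m[x] * a[x] + (64 - m[x]) * b[x] + 32) >> 6;
      sad += abs(pred - src[x]);
    }
    src += src_stride;
    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  return sad;
}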

Change-Id: I61b686abd14bba5280bed94e1be62eb74ea23d89
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index c248f3e..08dea4e 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -317,11 +317,6 @@
         "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c"
         "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c")
 
-    set(AOM_DSP_ENCODER_INTRIN_SSSE3
-        ${AOM_DSP_ENCODER_INTRIN_SSSE3}
-        "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
-        "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c")
-
     set(AOM_DSP_ENCODER_ASM_SSSE3_X86_64
         ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64}
         "${AOM_ROOT}/aom_dsp/x86/avg_ssse3_x86_64.asm"
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index da173f5..1129ba3 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -342,10 +342,6 @@
 endif
 
 ifeq ($(CONFIG_AV1_ENCODER),yes)
-ifeq ($(CONFIG_EXT_INTER),yes)
-DSP_SRCS-$(HAVE_SSSE3)  += x86/masked_sad_intrin_ssse3.c
-DSP_SRCS-$(HAVE_SSSE3)  += x86/masked_variance_intrin_ssse3.c
-endif  #CONFIG_EXT_INTER
 ifeq ($(CONFIG_MOTION_VAR),yes)
 DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c
 DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 28f997a..129ad72 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -738,17 +738,12 @@
 if (aom_config("CONFIG_EXT_INTER") eq "yes") {
   foreach (@block_sizes) {
     ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-    specialize "aom_masked_sad${w}x${h}", qw/ssse3/;
     add_proto qw/unsigned int/, "aom_masked_compound_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask";
   }
 
   if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
     foreach (@block_sizes) {
       ($w, $h) = @$_;
-      add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-      specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3/;
-
       add_proto qw/unsigned int/, "aom_highbd_masked_compound_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
     }
   }
@@ -1048,12 +1043,6 @@
 #
   foreach (@block_sizes) {
     ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-    specialize "aom_masked_variance${w}x${h}", qw/ssse3/;
-    specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
-
-    add_proto qw/unsigned int/, "aom_masked_compound_variance${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *m, int m_stride, int invert_mask, unsigned int *sse";
     add_proto qw/unsigned int/, "aom_masked_compound_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
   }
 
@@ -1061,12 +1050,6 @@
     foreach $bd ("_", "_10_", "_12_") {
       foreach (@block_sizes) {
         ($w, $h) = @$_;
-        add_proto qw/unsigned int/, "aom_highbd${bd}masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
-        add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
-        specialize "aom_highbd${bd}masked_variance${w}x${h}", qw/ssse3/;
-        specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
-
-        add_proto qw/unsigned int/, "aom_highbd${bd}masked_compound_variance${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *m, int m_stride, int invert_mask, unsigned int *sse";
         add_proto qw/unsigned int/, "aom_highbd${bd}masked_compound_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
       }
     }
diff --git a/aom_dsp/sad.c b/aom_dsp/sad.c
index e7f31a1..e4be68c 100644
--- a/aom_dsp/sad.c
+++ b/aom_dsp/sad.c
@@ -312,30 +312,11 @@
 
 #if CONFIG_AV1 && CONFIG_EXT_INTER
             static INLINE
-    unsigned int masked_sad(const uint8_t *a, int a_stride, const uint8_t *b,
-                            int b_stride, const uint8_t *m, int m_stride,
-                            int width, int height) {
-  int y, x;
-  unsigned int sad = 0;
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]);
-
-    a += a_stride;
-    b += b_stride;
-    m += m_stride;
-  }
-  sad = (sad + 31) >> 6;
-
-  return sad;
-}
-
-static INLINE unsigned int masked_compound_sad(const uint8_t *src,
-                                               int src_stride, const uint8_t *a,
-                                               int a_stride, const uint8_t *b,
-                                               int b_stride, const uint8_t *m,
-                                               int m_stride, int width,
-                                               int height) {
+    unsigned int masked_compound_sad(const uint8_t *src, int src_stride,
+                                     const uint8_t *a, int a_stride,
+                                     const uint8_t *b, int b_stride,
+                                     const uint8_t *m, int m_stride, int width,
+                                     int height) {
   int y, x;
   unsigned int sad = 0;
 
@@ -356,12 +337,6 @@
 }
 
 #define MASKSADMxN(m, n)                                                      \
-  unsigned int aom_masked_sad##m##x##n##_c(                                   \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *msk, int msk_stride) {                                   \
-    return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m,   \
-                      n);                                                     \
-  }                                                                           \
   unsigned int aom_masked_compound_sad##m##x##n##_c(                          \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
       const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
@@ -397,31 +372,11 @@
 
 #if CONFIG_HIGHBITDEPTH
                     static INLINE
-    unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride,
-                                   const uint8_t *b8, int b_stride,
-                                   const uint8_t *m, int m_stride, int width,
-                                   int height) {
-  int y, x;
-  unsigned int sad = 0;
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]);
-
-    a += a_stride;
-    b += b_stride;
-    m += m_stride;
-  }
-  sad = (sad + 31) >> 6;
-
-  return sad;
-}
-
-static INLINE unsigned int highbd_masked_compound_sad(
-    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
-    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int width,
-    int height) {
+    unsigned int highbd_masked_compound_sad(const uint8_t *src8, int src_stride,
+                                            const uint8_t *a8, int a_stride,
+                                            const uint8_t *b8, int b_stride,
+                                            const uint8_t *m, int m_stride,
+                                            int width, int height) {
   int y, x;
   unsigned int sad = 0;
   const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
@@ -445,12 +400,6 @@
 }
 
 #define HIGHBD_MASKSADMXN(m, n)                                               \
-  unsigned int aom_highbd_masked_sad##m##x##n##_c(                            \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *msk, int msk_stride) {                                   \
-    return highbd_masked_sad(src, src_stride, ref, ref_stride, msk,           \
-                             msk_stride, m, n);                               \
-  }                                                                           \
   unsigned int aom_highbd_masked_compound_sad##m##x##n##_c(                   \
       const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
       int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,        \
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index 90d0622..85adcd1 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -714,163 +714,42 @@
   }
 }
 
-void masked_variance(const uint8_t *a, int a_stride, const uint8_t *b,
-                     int b_stride, const uint8_t *m, int m_stride, int w, int h,
-                     unsigned int *sse, int *sum) {
-  int i, j;
-
-  int64_t sum64 = 0;
-  uint64_t sse64 = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = (a[j] - b[j]) * (m[j]);
-      sum64 += diff;
-      sse64 += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-    m += m_stride;
-  }
-  sum64 = (sum64 >= 0) ? sum64 : -sum64;
-  *sum = (int)ROUND_POWER_OF_TWO(sum64, 6);
-  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 12);
-}
-
-void masked_compound_variance(const uint8_t *src, int src_stride,
-                              const uint8_t *a, int a_stride, const uint8_t *b,
-                              int b_stride, const uint8_t *m, int m_stride,
-                              int w, int h, unsigned int *sse, int *sum) {
-  int i, j;
-
-  int64_t sum64 = 0;
-  uint64_t sse64 = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const uint8_t pred = AOM_BLEND_A64(m[j], a[j], b[j]);
-      const int diff = pred - src[j];
-      sum64 += diff;
-      sse64 += diff * diff;
-    }
-
-    src += src_stride;
-    a += a_stride;
-    b += b_stride;
-    m += m_stride;
-  }
-  sum64 = (sum64 >= 0) ? sum64 : -sum64;
-  *sum = (int)ROUND_POWER_OF_TWO(sum64, 6);
-  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 12);
-}
-
-#define MASK_VAR(W, H)                                                        \
-  unsigned int aom_masked_variance##W##x##H##_c(                              \
-      const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,         \
-      const uint8_t *m, int m_stride, unsigned int *sse) {                    \
-    int sum;                                                                  \
-    masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, &sum);  \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));             \
-  }                                                                           \
-                                                                              \
-  unsigned int aom_masked_compound_variance##W##x##H##_c(                     \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const uint8_t *m, int m_stride,             \
-      int invert_mask, unsigned int *sse) {                                   \
-    int sum;                                                                  \
-    if (!invert_mask)                                                         \
-      masked_compound_variance(src, src_stride, ref, ref_stride, second_pred, \
-                               W, m, m_stride, W, H, sse, &sum);              \
-    else                                                                      \
-      masked_compound_variance(src, src_stride, second_pred, W, ref,          \
-                               ref_stride, m, m_stride, W, H, sse, &sum);     \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));             \
+#define MASK_SUBPIX_VAR(W, H)                                               \
+  unsigned int aom_masked_compound_sub_pixel_variance##W##x##H##_c(         \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,         \
+      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,       \
+      const uint8_t *msk, int msk_stride, int invert_mask,                  \
+      unsigned int *sse) {                                                  \
+    uint16_t fdata3[(H + 1) * W];                                           \
+    uint8_t temp2[H * W];                                                   \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                             \
+                                                                            \
+    var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
+                                      bilinear_filters_2t[xoffset]);        \
+    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,           \
+                                       bilinear_filters_2t[yoffset]);       \
+                                                                            \
+    aom_comp_mask_pred(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
+                       invert_mask);                                        \
+    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);       \
   }
 
-#define MASK_SUBPIX_VAR(W, H)                                                 \
-  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
-      const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
-      unsigned int *sse) {                                                    \
-    uint16_t fdata3[(H + 1) * W];                                             \
-    uint8_t temp2[H * W];                                                     \
-                                                                              \
-    var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W,   \
-                                      bilinear_filters_2t[xoffset]);          \
-    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,             \
-                                       bilinear_filters_2t[yoffset]);         \
-                                                                              \
-    return aom_masked_variance##W##x##H##_c(temp2, W, dst, dst_stride, msk,   \
-                                            msk_stride, sse);                 \
-  }                                                                           \
-                                                                              \
-  unsigned int aom_masked_compound_sub_pixel_variance##W##x##H##_c(           \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
-      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
-      const uint8_t *msk, int msk_stride, int invert_mask,                    \
-      unsigned int *sse) {                                                    \
-    uint16_t fdata3[(H + 1) * W];                                             \
-    uint8_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
-                                                                              \
-    var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W,   \
-                                      bilinear_filters_2t[xoffset]);          \
-    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,             \
-                                       bilinear_filters_2t[yoffset]);         \
-                                                                              \
-    aom_comp_mask_pred(temp3, second_pred, W, H, temp2, W, msk, msk_stride,   \
-                       invert_mask);                                          \
-    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);         \
-  }
-
-MASK_VAR(4, 4)
 MASK_SUBPIX_VAR(4, 4)
-
-MASK_VAR(4, 8)
 MASK_SUBPIX_VAR(4, 8)
-
-MASK_VAR(8, 4)
 MASK_SUBPIX_VAR(8, 4)
-
-MASK_VAR(8, 8)
 MASK_SUBPIX_VAR(8, 8)
-
-MASK_VAR(8, 16)
 MASK_SUBPIX_VAR(8, 16)
-
-MASK_VAR(16, 8)
 MASK_SUBPIX_VAR(16, 8)
-
-MASK_VAR(16, 16)
 MASK_SUBPIX_VAR(16, 16)
-
-MASK_VAR(16, 32)
 MASK_SUBPIX_VAR(16, 32)
-
-MASK_VAR(32, 16)
 MASK_SUBPIX_VAR(32, 16)
-
-MASK_VAR(32, 32)
 MASK_SUBPIX_VAR(32, 32)
-
-MASK_VAR(32, 64)
 MASK_SUBPIX_VAR(32, 64)
-
-MASK_VAR(64, 32)
 MASK_SUBPIX_VAR(64, 32)
-
-MASK_VAR(64, 64)
 MASK_SUBPIX_VAR(64, 64)
-
 #if CONFIG_EXT_PARTITION
-MASK_VAR(64, 128)
 MASK_SUBPIX_VAR(64, 128)
-
-MASK_VAR(128, 64)
 MASK_SUBPIX_VAR(128, 64)
-
-MASK_VAR(128, 128)
 MASK_SUBPIX_VAR(128, 128)
 #endif  // CONFIG_EXT_PARTITION
 
@@ -920,270 +799,7 @@
   }
 }
 
-void highbd_masked_variance64(const uint8_t *a8, int a_stride,
-                              const uint8_t *b8, int b_stride, const uint8_t *m,
-                              int m_stride, int w, int h, uint64_t *sse,
-                              int64_t *sum) {
-  int i, j;
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = (a[j] - b[j]) * (m[j]);
-      *sum += (int64_t)diff;
-      *sse += (int64_t)diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-    m += m_stride;
-  }
-  *sum = (*sum >= 0) ? *sum : -*sum;
-  *sum = ROUND_POWER_OF_TWO(*sum, 6);
-  *sse = ROUND_POWER_OF_TWO(*sse, 12);
-}
-
-void highbd_masked_variance(const uint8_t *a8, int a_stride, const uint8_t *b8,
-                            int b_stride, const uint8_t *m, int m_stride, int w,
-                            int h, unsigned int *sse, int *sum) {
-  int64_t sum64;
-  uint64_t sse64;
-  highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h,
-                           &sse64, &sum64);
-  *sum = (int)sum64;
-  *sse = (unsigned int)sse64;
-}
-
-void highbd_10_masked_variance(const uint8_t *a8, int a_stride,
-                               const uint8_t *b8, int b_stride,
-                               const uint8_t *m, int m_stride, int w, int h,
-                               unsigned int *sse, int *sum) {
-  int64_t sum64;
-  uint64_t sse64;
-  highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h,
-                           &sse64, &sum64);
-  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
-  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
-}
-
-void highbd_12_masked_variance(const uint8_t *a8, int a_stride,
-                               const uint8_t *b8, int b_stride,
-                               const uint8_t *m, int m_stride, int w, int h,
-                               unsigned int *sse, int *sum) {
-  int64_t sum64;
-  uint64_t sse64;
-  highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h,
-                           &sse64, &sum64);
-  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
-  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
-}
-
-void highbd_masked_compound_variance64(const uint8_t *src8, int src_stride,
-                                       const uint8_t *a8, int a_stride,
-                                       const uint8_t *b8, int b_stride,
-                                       const uint8_t *m, int m_stride, int w,
-                                       int h, uint64_t *sse, int64_t *sum) {
-  int i, j;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const uint16_t pred = AOM_BLEND_A64(m[j], a[j], b[j]);
-      const int diff = pred - src[j];
-      *sum += (int64_t)diff;
-      *sse += (int64_t)diff * diff;
-    }
-
-    src += src_stride;
-    a += a_stride;
-    b += b_stride;
-    m += m_stride;
-  }
-  *sum = (*sum >= 0) ? *sum : -*sum;
-  *sum = ROUND_POWER_OF_TWO(*sum, 6);
-  *sse = ROUND_POWER_OF_TWO(*sse, 12);
-}
-
-void highbd_masked_compound_variance(const uint8_t *src8, int src_stride,
-                                     const uint8_t *a8, int a_stride,
-                                     const uint8_t *b8, int b_stride,
-                                     const uint8_t *m, int m_stride, int w,
-                                     int h, unsigned int *sse, int *sum) {
-  int64_t sum64;
-  uint64_t sse64;
-  highbd_masked_compound_variance64(src8, src_stride, a8, a_stride, b8,
-                                    b_stride, m, m_stride, w, h, &sse64,
-                                    &sum64);
-  *sum = (int)sum64;
-  *sse = (unsigned int)sse64;
-}
-
-void highbd_10_masked_compound_variance(const uint8_t *src8, int src_stride,
-                                        const uint8_t *a8, int a_stride,
-                                        const uint8_t *b8, int b_stride,
-                                        const uint8_t *m, int m_stride, int w,
-                                        int h, unsigned int *sse, int *sum) {
-  int64_t sum64;
-  uint64_t sse64;
-  highbd_masked_compound_variance64(src8, src_stride, a8, a_stride, b8,
-                                    b_stride, m, m_stride, w, h, &sse64,
-                                    &sum64);
-  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
-  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
-}
-
-void highbd_12_masked_compound_variance(const uint8_t *src8, int src_stride,
-                                        const uint8_t *a8, int a_stride,
-                                        const uint8_t *b8, int b_stride,
-                                        const uint8_t *m, int m_stride, int w,
-                                        int h, unsigned int *sse, int *sum) {
-  int64_t sum64;
-  uint64_t sse64;
-  highbd_masked_compound_variance64(src8, src_stride, a8, a_stride, b8,
-                                    b_stride, m, m_stride, w, h, &sse64,
-                                    &sum64);
-  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
-  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
-}
-
-#define HIGHBD_MASK_VAR(W, H)                                                  \
-  unsigned int aom_highbd_masked_variance##W##x##H##_c(                        \
-      const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,          \
-      const uint8_t *m, int m_stride, unsigned int *sse) {                     \
-    int sum;                                                                   \
-    highbd_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse,   \
-                           &sum);                                              \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));              \
-  }                                                                            \
-                                                                               \
-  unsigned int aom_highbd_10_masked_variance##W##x##H##_c(                     \
-      const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,          \
-      const uint8_t *m, int m_stride, unsigned int *sse) {                     \
-    int sum;                                                                   \
-    int64_t var;                                                               \
-    highbd_10_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H,     \
-                              sse, &sum);                                      \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
-    return (var >= 0) ? (uint32_t)var : 0;                                     \
-  }                                                                            \
-                                                                               \
-  unsigned int aom_highbd_12_masked_variance##W##x##H##_c(                     \
-      const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,          \
-      const uint8_t *m, int m_stride, unsigned int *sse) {                     \
-    int sum;                                                                   \
-    int64_t var;                                                               \
-    highbd_12_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H,     \
-                              sse, &sum);                                      \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
-    return (var >= 0) ? (uint32_t)var : 0;                                     \
-  }                                                                            \
-                                                                               \
-  unsigned int aom_highbd_masked_compound_variance##W##x##H##_c(               \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
-      const uint8_t *second_pred, const uint8_t *m, int m_stride,              \
-      int invert_mask, unsigned int *sse) {                                    \
-    int sum;                                                                   \
-    if (!invert_mask)                                                          \
-      highbd_masked_compound_variance(src, src_stride, ref, ref_stride,        \
-                                      second_pred, W, m, m_stride, W, H, sse,  \
-                                      &sum);                                   \
-    else                                                                       \
-      highbd_masked_compound_variance(src, src_stride, second_pred, W, ref,    \
-                                      ref_stride, m, m_stride, W, H, sse,      \
-                                      &sum);                                   \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));              \
-  }                                                                            \
-                                                                               \
-  unsigned int aom_highbd_10_masked_compound_variance##W##x##H##_c(            \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
-      const uint8_t *second_pred, const uint8_t *m, int m_stride,              \
-      int invert_mask, unsigned int *sse) {                                    \
-    int sum;                                                                   \
-    if (!invert_mask)                                                          \
-      highbd_10_masked_compound_variance(src, src_stride, ref, ref_stride,     \
-                                         second_pred, W, m, m_stride, W, H,    \
-                                         sse, &sum);                           \
-    else                                                                       \
-      highbd_10_masked_compound_variance(src, src_stride, second_pred, W, ref, \
-                                         ref_stride, m, m_stride, W, H, sse,   \
-                                         &sum);                                \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));              \
-  }                                                                            \
-                                                                               \
-  unsigned int aom_highbd_12_masked_compound_variance##W##x##H##_c(            \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
-      const uint8_t *second_pred, const uint8_t *m, int m_stride,              \
-      int invert_mask, unsigned int *sse) {                                    \
-    int sum;                                                                   \
-    if (!invert_mask)                                                          \
-      highbd_12_masked_compound_variance(src, src_stride, ref, ref_stride,     \
-                                         second_pred, W, m, m_stride, W, H,    \
-                                         sse, &sum);                           \
-    else                                                                       \
-      highbd_12_masked_compound_variance(src, src_stride, second_pred, W, ref, \
-                                         ref_stride, m, m_stride, W, H, sse,   \
-                                         &sum);                                \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));              \
-  }
-
 #define HIGHBD_MASK_SUBPIX_VAR(W, H)                                           \
-  unsigned int aom_highbd_masked_sub_pixel_variance##W##x##H##_c(              \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,  \
-      unsigned int *sse) {                                                     \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    return aom_highbd_masked_variance##W##x##H##_c(                            \
-        CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse);  \
-  }                                                                            \
-                                                                               \
-  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c(           \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,  \
-      unsigned int *sse) {                                                     \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    return aom_highbd_10_masked_variance##W##x##H##_c(                         \
-        CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse);  \
-  }                                                                            \
-                                                                               \
-  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c(           \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,  \
-      unsigned int *sse) {                                                     \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    return aom_highbd_12_masked_variance##W##x##H##_c(                         \
-        CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse);  \
-  }                                                                            \
-                                                                               \
   unsigned int aom_highbd_masked_compound_sub_pixel_variance##W##x##H##_c(     \
       const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
       const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
@@ -1250,53 +866,22 @@
                                                ref, ref_stride, sse);          \
   }
 
-HIGHBD_MASK_VAR(4, 4)
 HIGHBD_MASK_SUBPIX_VAR(4, 4)
-
-HIGHBD_MASK_VAR(4, 8)
 HIGHBD_MASK_SUBPIX_VAR(4, 8)
-
-HIGHBD_MASK_VAR(8, 4)
 HIGHBD_MASK_SUBPIX_VAR(8, 4)
-
-HIGHBD_MASK_VAR(8, 8)
 HIGHBD_MASK_SUBPIX_VAR(8, 8)
-
-HIGHBD_MASK_VAR(8, 16)
 HIGHBD_MASK_SUBPIX_VAR(8, 16)
-
-HIGHBD_MASK_VAR(16, 8)
 HIGHBD_MASK_SUBPIX_VAR(16, 8)
-
-HIGHBD_MASK_VAR(16, 16)
 HIGHBD_MASK_SUBPIX_VAR(16, 16)
-
-HIGHBD_MASK_VAR(16, 32)
 HIGHBD_MASK_SUBPIX_VAR(16, 32)
-
-HIGHBD_MASK_VAR(32, 16)
 HIGHBD_MASK_SUBPIX_VAR(32, 16)
-
-HIGHBD_MASK_VAR(32, 32)
 HIGHBD_MASK_SUBPIX_VAR(32, 32)
-
-HIGHBD_MASK_VAR(32, 64)
 HIGHBD_MASK_SUBPIX_VAR(32, 64)
-
-HIGHBD_MASK_VAR(64, 32)
 HIGHBD_MASK_SUBPIX_VAR(64, 32)
-
-HIGHBD_MASK_VAR(64, 64)
 HIGHBD_MASK_SUBPIX_VAR(64, 64)
-
 #if CONFIG_EXT_PARTITION
-HIGHBD_MASK_VAR(64, 128)
 HIGHBD_MASK_SUBPIX_VAR(64, 128)
-
-HIGHBD_MASK_VAR(128, 64)
 HIGHBD_MASK_SUBPIX_VAR(128, 64)
-
-HIGHBD_MASK_VAR(128, 128)
 HIGHBD_MASK_SUBPIX_VAR(128, 128)
 #endif  // CONFIG_EXT_PARTITION
 #endif  // CONFIG_HIGHBITDEPTH
diff --git a/aom_dsp/variance.h b/aom_dsp/variance.h
index adcf8b4..1b546ab 100644
--- a/aom_dsp/variance.h
+++ b/aom_dsp/variance.h
@@ -55,26 +55,10 @@
     int b_stride, unsigned int *sse, const uint8_t *second_pred);
 
 #if CONFIG_AV1 && CONFIG_EXT_INTER
-typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride,
-                                            const uint8_t *ref, int ref_stride,
-                                            const uint8_t *msk_ptr,
-                                            int msk_stride);
-typedef unsigned int (*aom_masked_variance_fn_t)(
-    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
-    const uint8_t *msk, int msk_stride, unsigned int *sse);
-typedef unsigned int (*aom_masked_subpixvariance_fn_t)(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *ref, int ref_stride, const uint8_t *msk, int msk_stride,
-    unsigned int *sse);
-
 typedef unsigned int (*aom_masked_compound_sad_fn_t)(
     const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
     const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
     int invert_mask);
-typedef unsigned int (*aom_masked_compound_variance_fn_t)(
-    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
-    const uint8_t *second_pred, const uint8_t *m, int m_stride, int invert_mask,
-    unsigned int *sse);
 typedef unsigned int (*aom_masked_compound_subpixvariance_fn_t)(
     const uint8_t *src, int src_stride, int xoffset, int yoffset,
     const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
@@ -106,12 +90,7 @@
   aom_sad_multi_fn_t sdx8f;
   aom_sad_multi_d_fn_t sdx4df;
 #if CONFIG_EXT_INTER
-  aom_masked_sad_fn_t msdf;
-  aom_masked_variance_fn_t mvf;
-  aom_masked_subpixvariance_fn_t msvf;
-
   aom_masked_compound_sad_fn_t mcsdf;
-  aom_masked_compound_variance_fn_t mcvf;
   aom_masked_compound_subpixvariance_fn_t mcsvf;
 #endif  // CONFIG_EXT_INTER
 #if CONFIG_MOTION_VAR
diff --git a/aom_dsp/x86/masked_sad_intrin_ssse3.c b/aom_dsp/x86/masked_sad_intrin_ssse3.c
deleted file mode 100644
index 5166e9e..0000000
--- a/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ /dev/null
@@ -1,334 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-#include <emmintrin.h>
-#include <tmmintrin.h>
-
-#include "aom_ports/mem.h"
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-
-static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
-  __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
-  __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
-  return _mm_unpacklo_epi64(temp1, temp2);
-}
-
-static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
-  __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t *)ptr);
-  __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride));
-  __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2);
-  temp1 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 2));
-  temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 3));
-  temp1 = _mm_unpacklo_epi32(temp1, temp2);
-  return _mm_unpacklo_epi64(temp3, temp1);
-}
-
-static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
-                                            const uint8_t *b_ptr, int b_stride,
-                                            const uint8_t *m_ptr, int m_stride,
-                                            int width, int height);
-
-static INLINE unsigned int masked_sad8xh_ssse3(
-    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
-    const uint8_t *m_ptr, int m_stride, int height);
-
-static INLINE unsigned int masked_sad4xh_ssse3(
-    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
-    const uint8_t *m_ptr, int m_stride, int height);
-
-#define MASKSADMXN_SSSE3(m, n)                                                 \
-  unsigned int aom_masked_sad##m##x##n##_ssse3(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
-      const uint8_t *msk, int msk_stride) {                                    \
-    return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \
-                            m, n);                                             \
-  }
-
-#if CONFIG_EXT_PARTITION
-MASKSADMXN_SSSE3(128, 128)
-MASKSADMXN_SSSE3(128, 64)
-MASKSADMXN_SSSE3(64, 128)
-#endif  // CONFIG_EXT_PARTITION
-MASKSADMXN_SSSE3(64, 64)
-MASKSADMXN_SSSE3(64, 32)
-MASKSADMXN_SSSE3(32, 64)
-MASKSADMXN_SSSE3(32, 32)
-MASKSADMXN_SSSE3(32, 16)
-MASKSADMXN_SSSE3(16, 32)
-MASKSADMXN_SSSE3(16, 16)
-MASKSADMXN_SSSE3(16, 8)
-
-#define MASKSAD8XN_SSSE3(n)                                                   \
-  unsigned int aom_masked_sad8x##n##_ssse3(                                   \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *msk, int msk_stride) {                                   \
-    return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk,         \
-                               msk_stride, n);                                \
-  }
-
-MASKSAD8XN_SSSE3(16)
-MASKSAD8XN_SSSE3(8)
-MASKSAD8XN_SSSE3(4)
-
-#define MASKSAD4XN_SSSE3(n)                                                   \
-  unsigned int aom_masked_sad4x##n##_ssse3(                                   \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *msk, int msk_stride) {                                   \
-    return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,         \
-                               msk_stride, n);                                \
-  }
-
-MASKSAD4XN_SSSE3(8)
-MASKSAD4XN_SSSE3(4)
-
-// For width a multiple of 16
-// Assumes values in m are <=64
-static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
-                                            const uint8_t *b_ptr, int b_stride,
-                                            const uint8_t *m_ptr, int m_stride,
-                                            int width, int height) {
-  int y, x;
-  __m128i a, b, m, temp1, temp2;
-  __m128i res = _mm_setzero_si128();
-  __m128i one = _mm_set1_epi16(1);
-  // For each row
-  for (y = 0; y < height; y++) {
-    // Covering the full width
-    for (x = 0; x < width; x += 16) {
-      // Load a, b, m in xmm registers
-      a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
-      b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
-      m = _mm_loadu_si128((const __m128i *)(m_ptr + x));
-
-      // Calculate the difference between a & b
-      temp1 = _mm_subs_epu8(a, b);
-      temp2 = _mm_subs_epu8(b, a);
-      temp1 = _mm_or_si128(temp1, temp2);
-
-      // Multiply by m and add together
-      temp2 = _mm_maddubs_epi16(temp1, m);
-      // Pad out row result to 32 bit integers & add to running total
-      res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one));
-    }
-    // Move onto the next row
-    a_ptr += a_stride;
-    b_ptr += b_stride;
-    m_ptr += m_stride;
-  }
-  res = _mm_hadd_epi32(res, _mm_setzero_si128());
-  res = _mm_hadd_epi32(res, _mm_setzero_si128());
-  // sad = (sad + 31) >> 6;
-  return (_mm_cvtsi128_si32(res) + 31) >> 6;
-}
-
-static INLINE unsigned int masked_sad8xh_ssse3(
-    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
-    const uint8_t *m_ptr, int m_stride, int height) {
-  int y;
-  __m128i a, b, m, temp1, temp2, row_res;
-  __m128i res = _mm_setzero_si128();
-  __m128i one = _mm_set1_epi16(1);
-  // Add the masked SAD for 2 rows at a time
-  for (y = 0; y < height; y += 2) {
-    // Load a, b, m in xmm registers
-    a = width8_load_2rows(a_ptr, a_stride);
-    b = width8_load_2rows(b_ptr, b_stride);
-    m = width8_load_2rows(m_ptr, m_stride);
-
-    // Calculate the difference between a & b
-    temp1 = _mm_subs_epu8(a, b);
-    temp2 = _mm_subs_epu8(b, a);
-    temp1 = _mm_or_si128(temp1, temp2);
-
-    // Multiply by m and add together
-    row_res = _mm_maddubs_epi16(temp1, m);
-
-    // Pad out row result to 32 bit integers & add to running total
-    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
-
-    // Move onto the next rows
-    a_ptr += a_stride * 2;
-    b_ptr += b_stride * 2;
-    m_ptr += m_stride * 2;
-  }
-  res = _mm_hadd_epi32(res, _mm_setzero_si128());
-  res = _mm_hadd_epi32(res, _mm_setzero_si128());
-  // sad = (sad + 31) >> 6;
-  return (_mm_cvtsi128_si32(res) + 31) >> 6;
-}
-
-static INLINE unsigned int masked_sad4xh_ssse3(
-    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
-    const uint8_t *m_ptr, int m_stride, int height) {
-  int y;
-  __m128i a, b, m, temp1, temp2, row_res;
-  __m128i res = _mm_setzero_si128();
-  __m128i one = _mm_set1_epi16(1);
-  // Add the masked SAD for 4 rows at a time
-  for (y = 0; y < height; y += 4) {
-    // Load a, b, m in xmm registers
-    a = width4_load_4rows(a_ptr, a_stride);
-    b = width4_load_4rows(b_ptr, b_stride);
-    m = width4_load_4rows(m_ptr, m_stride);
-
-    // Calculate the difference between a & b
-    temp1 = _mm_subs_epu8(a, b);
-    temp2 = _mm_subs_epu8(b, a);
-    temp1 = _mm_or_si128(temp1, temp2);
-
-    // Multiply by m and add together
-    row_res = _mm_maddubs_epi16(temp1, m);
-
-    // Pad out row result to 32 bit integers & add to running total
-    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
-
-    // Move onto the next rows
-    a_ptr += a_stride * 4;
-    b_ptr += b_stride * 4;
-    m_ptr += m_stride * 4;
-  }
-  // Pad out row result to 32 bit integers & add to running total
-  res = _mm_hadd_epi32(res, _mm_setzero_si128());
-  res = _mm_hadd_epi32(res, _mm_setzero_si128());
-  // sad = (sad + 31) >> 6;
-  return (_mm_cvtsi128_si32(res) + 31) >> 6;
-}
-
-#if CONFIG_HIGHBITDEPTH
-static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
-                                               int stride) {
-  __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
-  __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
-  return _mm_unpacklo_epi64(temp1, temp2);
-}
-
-static INLINE unsigned int highbd_masked_sad_ssse3(
-    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
-    const uint8_t *m_ptr, int m_stride, int width, int height);
-
-static INLINE unsigned int highbd_masked_sad4xh_ssse3(
-    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
-    const uint8_t *m_ptr, int m_stride, int height);
-
-#define HIGHBD_MASKSADMXN_SSSE3(m, n)                                         \
-  unsigned int aom_highbd_masked_sad##m##x##n##_ssse3(                        \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *msk, int msk_stride) {                                   \
-    return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk,     \
-                                   msk_stride, m, n);                         \
-  }
-
-#if CONFIG_EXT_PARTITION
-HIGHBD_MASKSADMXN_SSSE3(128, 128)
-HIGHBD_MASKSADMXN_SSSE3(128, 64)
-HIGHBD_MASKSADMXN_SSSE3(64, 128)
-#endif  // CONFIG_EXT_PARTITION
-HIGHBD_MASKSADMXN_SSSE3(64, 64)
-HIGHBD_MASKSADMXN_SSSE3(64, 32)
-HIGHBD_MASKSADMXN_SSSE3(32, 64)
-HIGHBD_MASKSADMXN_SSSE3(32, 32)
-HIGHBD_MASKSADMXN_SSSE3(32, 16)
-HIGHBD_MASKSADMXN_SSSE3(16, 32)
-HIGHBD_MASKSADMXN_SSSE3(16, 16)
-HIGHBD_MASKSADMXN_SSSE3(16, 8)
-HIGHBD_MASKSADMXN_SSSE3(8, 16)
-HIGHBD_MASKSADMXN_SSSE3(8, 8)
-HIGHBD_MASKSADMXN_SSSE3(8, 4)
-
-#define HIGHBD_MASKSAD4XN_SSSE3(n)                                            \
-  unsigned int aom_highbd_masked_sad4x##n##_ssse3(                            \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *msk, int msk_stride) {                                   \
-    return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,  \
-                                      msk_stride, n);                         \
-  }
-
-HIGHBD_MASKSAD4XN_SSSE3(8)
-HIGHBD_MASKSAD4XN_SSSE3(4)
-
-// For width a multiple of 8
-// Assumes values in m are <=64
-static INLINE unsigned int highbd_masked_sad_ssse3(
-    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
-    const uint8_t *m_ptr, int m_stride, int width, int height) {
-  int y, x;
-  __m128i a, b, m, temp1, temp2;
-  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
-  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
-  __m128i res = _mm_setzero_si128();
-  // For each row
-  for (y = 0; y < height; y++) {
-    // Covering the full width
-    for (x = 0; x < width; x += 8) {
-      // Load a, b, m in xmm registers
-      a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
-      b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
-      m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)),
-                            _mm_setzero_si128());
-
-      // Calculate the difference between a & b
-      temp1 = _mm_subs_epu16(a, b);
-      temp2 = _mm_subs_epu16(b, a);
-      temp1 = _mm_or_si128(temp1, temp2);
-
-      // Add result of multiplying by m and add pairs together to running total
-      res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
-    }
-    // Move onto the next row
-    a_ptr += a_stride;
-    b_ptr += b_stride;
-    m_ptr += m_stride;
-  }
-  res = _mm_hadd_epi32(res, _mm_setzero_si128());
-  res = _mm_hadd_epi32(res, _mm_setzero_si128());
-  // sad = (sad + 31) >> 6;
-  return (_mm_cvtsi128_si32(res) + 31) >> 6;
-}
-
-static INLINE unsigned int highbd_masked_sad4xh_ssse3(
-    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
-    const uint8_t *m_ptr, int m_stride, int height) {
-  int y;
-  __m128i a, b, m, temp1, temp2;
-  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
-  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
-  __m128i res = _mm_setzero_si128();
-  // Add the masked SAD for 2 rows at a time
-  for (y = 0; y < height; y += 2) {
-    // Load a, b, m in xmm registers
-    a = highbd_width4_load_2rows(a_ptr, a_stride);
-    b = highbd_width4_load_2rows(b_ptr, b_stride);
-    temp1 = _mm_loadl_epi64((const __m128i *)m_ptr);
-    temp2 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride));
-    m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2),
-                          _mm_setzero_si128());
-
-    // Calculate the difference between a & b
-    temp1 = _mm_subs_epu16(a, b);
-    temp2 = _mm_subs_epu16(b, a);
-    temp1 = _mm_or_si128(temp1, temp2);
-
-    // Multiply by m and add together
-    res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
-
-    // Move onto the next rows
-    a_ptr += a_stride * 2;
-    b_ptr += b_stride * 2;
-    m_ptr += m_stride * 2;
-  }
-  res = _mm_hadd_epi32(res, _mm_setzero_si128());
-  res = _mm_hadd_epi32(res, _mm_setzero_si128());
-  // sad = (sad + 31) >> 6;
-  return (_mm_cvtsi128_si32(res) + 31) >> 6;
-}
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/aom_dsp/x86/masked_variance_intrin_ssse3.c b/aom_dsp/x86/masked_variance_intrin_ssse3.c
deleted file mode 100644
index fe14597..0000000
--- a/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ /dev/null
@@ -1,1948 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdlib.h>
-#include <emmintrin.h>
-#include <tmmintrin.h>
-
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_filter.h"
-
-// Half pixel shift
-#define HALF_PIXEL_OFFSET (BIL_SUBPEL_SHIFTS / 2)
-
-/*****************************************************************************
- * Horizontal additions
- *****************************************************************************/
-
-static INLINE int32_t hsum_epi32_si32(__m128i v_d) {
-  v_d = _mm_hadd_epi32(v_d, v_d);
-  v_d = _mm_hadd_epi32(v_d, v_d);
-  return _mm_cvtsi128_si32(v_d);
-}
-
-static INLINE int64_t hsum_epi64_si64(__m128i v_q) {
-  v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
-#if ARCH_X86_64
-  return _mm_cvtsi128_si64(v_q);
-#else
-  {
-    int64_t tmp;
-    _mm_storel_epi64((__m128i *)&tmp, v_q);
-    return tmp;
-  }
-#endif
-}
-
-#if CONFIG_HIGHBITDEPTH
-static INLINE int64_t hsum_epi32_si64(__m128i v_d) {
-  const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
-  const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
-  const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
-  return hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-static INLINE uint32_t calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q,
-                                            uint32_t *sse, int w, int h) {
-  int64_t sum64;
-  uint64_t sse64;
-
-  // Horizontal sum
-  sum64 = hsum_epi32_si32(v_sum_d);
-  sse64 = hsum_epi64_si64(v_sse_q);
-
-  sum64 = (sum64 >= 0) ? sum64 : -sum64;
-
-  // Round
-  sum64 = ROUND_POWER_OF_TWO(sum64, 6);
-  sse64 = ROUND_POWER_OF_TWO(sse64, 12);
-
-  // Store the SSE
-  *sse = (uint32_t)sse64;
-  // Compute the variance
-  return *sse - (uint32_t)((sum64 * sum64) / (w * h));
-}
-
-/*****************************************************************************
- * n*16 Wide versions
- *****************************************************************************/
-
-static INLINE unsigned int masked_variancewxh_ssse3(
-    const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
-    const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) {
-  int ii, jj;
-
-  const __m128i v_zero = _mm_setzero_si128();
-
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-
-  assert((w % 16) == 0);
-
-  for (ii = 0; ii < h; ii++) {
-    for (jj = 0; jj < w; jj += 16) {
-      // Load inputs - 8 bits
-      const __m128i v_a_b = _mm_loadu_si128((const __m128i *)(a + jj));
-      const __m128i v_b_b = _mm_loadu_si128((const __m128i *)(b + jj));
-      const __m128i v_m_b = _mm_loadu_si128((const __m128i *)(m + jj));
-
-      // Unpack to 16 bits - still containing max 8 bits
-      const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero);
-      const __m128i v_b0_w = _mm_unpacklo_epi8(v_b_b, v_zero);
-      const __m128i v_m0_w = _mm_unpacklo_epi8(v_m_b, v_zero);
-      const __m128i v_a1_w = _mm_unpackhi_epi8(v_a_b, v_zero);
-      const __m128i v_b1_w = _mm_unpackhi_epi8(v_b_b, v_zero);
-      const __m128i v_m1_w = _mm_unpackhi_epi8(v_m_b, v_zero);
-
-      // Difference: [-255, 255]
-      const __m128i v_d0_w = _mm_sub_epi16(v_a0_w, v_b0_w);
-      const __m128i v_d1_w = _mm_sub_epi16(v_a1_w, v_b1_w);
-
-      // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits
-      const __m128i v_e0_w = _mm_mullo_epi16(v_d0_w, v_m0_w);
-      const __m128i v_e0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
-      const __m128i v_e1_w = _mm_mullo_epi16(v_d1_w, v_m1_w);
-      const __m128i v_e1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
-
-      // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
-      const __m128i v_se0_d = _mm_madd_epi16(v_e0_w, v_e0_w);
-      const __m128i v_se1_d = _mm_madd_epi16(v_e1_w, v_e1_w);
-
-      // Sum of v_se{0,1}_d - 31 bits + 31 bits = 32 bits
-      const __m128i v_se_d = _mm_add_epi32(v_se0_d, v_se1_d);
-
-      // Unpack Squared error to 64 bits
-      const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
-      const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
-
-      // Accumulate
-      v_sum_d = _mm_add_epi32(v_sum_d, v_e0_d);
-      v_sum_d = _mm_add_epi32(v_sum_d, v_e1_d);
-      v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q);
-      v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q);
-    }
-
-    // Move on to next row
-    a += a_stride;
-    b += b_stride;
-    m += m_stride;
-  }
-
-  return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
-}
-
-#define MASKED_VARWXH(W, H)                                                   \
-  unsigned int aom_masked_variance##W##x##H##_ssse3(                          \
-      const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,         \
-      const uint8_t *m, int m_stride, unsigned int *sse) {                    \
-    return masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, m_stride, W, \
-                                    H, sse);                                  \
-  }
-
-MASKED_VARWXH(16, 8)
-MASKED_VARWXH(16, 16)
-MASKED_VARWXH(16, 32)
-MASKED_VARWXH(32, 16)
-MASKED_VARWXH(32, 32)
-MASKED_VARWXH(32, 64)
-MASKED_VARWXH(64, 32)
-MASKED_VARWXH(64, 64)
-#if CONFIG_EXT_PARTITION
-MASKED_VARWXH(64, 128)
-MASKED_VARWXH(128, 64)
-MASKED_VARWXH(128, 128)
-#endif  // CONFIG_EXT_PARTITION
-
-/*****************************************************************************
- * 8 Wide versions
- *****************************************************************************/
-
-static INLINE unsigned int masked_variance8xh_ssse3(
-    const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
-    const uint8_t *m, int m_stride, int h, unsigned int *sse) {
-  int ii;
-
-  const __m128i v_zero = _mm_setzero_si128();
-
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-
-  for (ii = 0; ii < h; ii++) {
-    // Load inputs - 8 bits
-    const __m128i v_a_b = _mm_loadl_epi64((const __m128i *)a);
-    const __m128i v_b_b = _mm_loadl_epi64((const __m128i *)b);
-    const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)m);
-
-    // Unpack to 16 bits - still containing max 8 bits
-    const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero);
-    const __m128i v_b_w = _mm_unpacklo_epi8(v_b_b, v_zero);
-    const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
-
-    // Difference: [-255, 255]
-    const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-
-    // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits
-    const __m128i v_e_w = _mm_mullo_epi16(v_d_w, v_m_w);
-    const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
-
-    // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
-    const __m128i v_se_d = _mm_madd_epi16(v_e_w, v_e_w);
-
-    // Unpack Squared error to 64 bits
-    const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
-    const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
-
-    // Accumulate
-    v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
-    v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q);
-    v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q);
-
-    // Move on to next row
-    a += a_stride;
-    b += b_stride;
-    m += m_stride;
-  }
-
-  return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
-}
-
-#define MASKED_VAR8XH(H)                                                      \
-  unsigned int aom_masked_variance8x##H##_ssse3(                              \
-      const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,         \
-      const uint8_t *m, int m_stride, unsigned int *sse) {                    \
-    return masked_variance8xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \
-                                    sse);                                     \
-  }
-
-MASKED_VAR8XH(4)
-MASKED_VAR8XH(8)
-MASKED_VAR8XH(16)
-
-/*****************************************************************************
- * 4 Wide versions
- *****************************************************************************/
-
-static INLINE unsigned int masked_variance4xh_ssse3(
-    const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
-    const uint8_t *m, int m_stride, int h, unsigned int *sse) {
-  int ii;
-
-  const __m128i v_zero = _mm_setzero_si128();
-
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-
-  assert((h % 2) == 0);
-
-  for (ii = 0; ii < h / 2; ii++) {
-    // Load 2 input rows - 8 bits
-    const __m128i v_a0_b = _mm_cvtsi32_si128(*(const uint32_t *)a);
-    const __m128i v_b0_b = _mm_cvtsi32_si128(*(const uint32_t *)b);
-    const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m);
-    const __m128i v_a1_b = _mm_cvtsi32_si128(*(const uint32_t *)(a + a_stride));
-    const __m128i v_b1_b = _mm_cvtsi32_si128(*(const uint32_t *)(b + b_stride));
-    const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride));
-
-    // Interleave 2 rows into a single register
-    const __m128i v_a_b = _mm_unpacklo_epi32(v_a0_b, v_a1_b);
-    const __m128i v_b_b = _mm_unpacklo_epi32(v_b0_b, v_b1_b);
-    const __m128i v_m_b = _mm_unpacklo_epi32(v_m0_b, v_m1_b);
-
-    // Unpack to 16 bits - still containing max 8 bits
-    const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero);
-    const __m128i v_b_w = _mm_unpacklo_epi8(v_b_b, v_zero);
-    const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
-
-    // Difference: [-255, 255]
-    const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-
-    // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits
-    const __m128i v_e_w = _mm_mullo_epi16(v_d_w, v_m_w);
-    const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
-
-    // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
-    const __m128i v_se_d = _mm_madd_epi16(v_e_w, v_e_w);
-
-    // Unpack Squared error to 64 bits
-    const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
-    const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
-
-    // Accumulate
-    v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
-    v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q);
-    v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q);
-
-    // Move on to next 2 rows
-    a += a_stride * 2;
-    b += b_stride * 2;
-    m += m_stride * 2;
-  }
-
-  return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
-}
-
-#define MASKED_VAR4XH(H)                                                      \
-  unsigned int aom_masked_variance4x##H##_ssse3(                              \
-      const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,         \
-      const uint8_t *m, int m_stride, unsigned int *sse) {                    \
-    return masked_variance4xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \
-                                    sse);                                     \
-  }
-
-MASKED_VAR4XH(4)
-MASKED_VAR4XH(8)
-
-#if CONFIG_HIGHBITDEPTH
-
-// Main calculation for n*8 wide blocks
-static INLINE void highbd_masked_variance64_ssse3(
-    const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
-    const uint8_t *m, int m_stride, int w, int h, int64_t *sum, uint64_t *sse) {
-  int ii, jj;
-
-  const __m128i v_zero = _mm_setzero_si128();
-
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-
-  assert((w % 8) == 0);
-
-  for (ii = 0; ii < h; ii++) {
-    for (jj = 0; jj < w; jj += 8) {
-      // Load inputs - a/b: 16 bits, m: 8 bits
-      const __m128i v_a_w = _mm_loadu_si128((const __m128i *)(a + jj));
-      const __m128i v_b_w = _mm_loadu_si128((const __m128i *)(b + jj));
-      const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)(m + jj));
-
-      // Unpack m to 16 bits - still containing max 8 bits
-      const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
-
-      // Difference: [-4095, 4095]
-      const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-
-      // Error - [-4095, 4095] * [0, 64] => sum of 2 of these fits in 19 bits
-      const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
-
-      // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
-      const __m128i v_absd_w = _mm_abs_epi16(v_d_w);
-      const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero);
-      const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero);
-      const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d);
-      const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero);
-      const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero);
-      const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d);
-      // Square and sum the errors -> 36bits * 4 = 38bits
-      __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d;
-      v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d);
-      v_elo1_d = _mm_srli_si128(v_elo_d, 4);
-      v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d);
-      v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q);
-      v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d);
-      v_ehi3_d = _mm_srli_si128(v_ehi_d, 4);
-      v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d);
-      v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q);
-      v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
-
-      // Accumulate
-      v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
-      v_sse_q = _mm_add_epi64(v_sse_q, v_se_q);
-    }
-
-    // Move on to next row
-    a += a_stride;
-    b += b_stride;
-    m += m_stride;
-  }
-
-  // Horizontal sum
-  *sum = hsum_epi32_si64(v_sum_d);
-  *sse = hsum_epi64_si64(v_sse_q);
-
-  // Round
-  *sum = (*sum >= 0) ? *sum : -*sum;
-  *sum = ROUND_POWER_OF_TWO(*sum, 6);
-  *sse = ROUND_POWER_OF_TWO(*sse, 12);
-}
-
-// Main calculation for 4 wide blocks
-static INLINE void highbd_masked_variance64_4wide_ssse3(
-    const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
-    const uint8_t *m, int m_stride, int h, int64_t *sum, uint64_t *sse) {
-  int ii;
-
-  const __m128i v_zero = _mm_setzero_si128();
-
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-
-  assert((h % 2) == 0);
-
-  for (ii = 0; ii < h / 2; ii++) {
-    // Load 2 input rows - a/b: 16 bits, m: 8 bits
-    const __m128i v_a0_w = _mm_loadl_epi64((const __m128i *)a);
-    const __m128i v_b0_w = _mm_loadl_epi64((const __m128i *)b);
-    const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m);
-    const __m128i v_a1_w = _mm_loadl_epi64((const __m128i *)(a + a_stride));
-    const __m128i v_b1_w = _mm_loadl_epi64((const __m128i *)(b + b_stride));
-    const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride));
-
-    // Interleave 2 rows into a single register
-    const __m128i v_a_w = _mm_unpacklo_epi64(v_a0_w, v_a1_w);
-    const __m128i v_b_w = _mm_unpacklo_epi64(v_b0_w, v_b1_w);
-    const __m128i v_m_b = _mm_unpacklo_epi32(v_m0_b, v_m1_b);
-
-    // Unpack m to 16 bits - still containing max 8 bits
-    const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
-
-    // Difference: [-4095, 4095]
-    const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-
-    // Error - [-4095, 4095] * [0, 64] => fits in 19 bits (incl. sign bit)
-    const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
-
-    // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
-    const __m128i v_absd_w = _mm_abs_epi16(v_d_w);
-    const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero);
-    const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero);
-    const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d);
-    const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero);
-    const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero);
-    const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d);
-    // Square and sum the errors -> 36bits * 4 = 38bits
-    __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d;
-    v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d);
-    v_elo1_d = _mm_srli_si128(v_elo_d, 4);
-    v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d);
-    v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q);
-    v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d);
-    v_ehi3_d = _mm_srli_si128(v_ehi_d, 4);
-    v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d);
-    v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q);
-    v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
-
-    // Accumulate
-    v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
-    v_sse_q = _mm_add_epi64(v_sse_q, v_se_q);
-
-    // Move on to next 2 rows
-    a += a_stride * 2;
-    b += b_stride * 2;
-    m += m_stride * 2;
-  }
-
-  // Horizontal sum
-  *sum = hsum_epi32_si32(v_sum_d);
-  *sse = hsum_epi64_si64(v_sse_q);
-
-  // Round
-  *sum = (*sum >= 0) ? *sum : -*sum;
-  *sum = ROUND_POWER_OF_TWO(*sum, 6);
-  *sse = ROUND_POWER_OF_TWO(*sse, 12);
-}
-
-static INLINE unsigned int highbd_masked_variancewxh_ssse3(
-    const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
-    const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) {
-  uint64_t sse64;
-  int64_t sum64;
-
-  if (w == 4)
-    highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride,
-                                         h, &sum64, &sse64);
-  else
-    highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h,
-                                   &sum64, &sse64);
-
-  // Store the SSE
-  *sse = (uint32_t)sse64;
-  // Compute and return variance
-  return *sse - (uint32_t)((sum64 * sum64) / (w * h));
-}
-
-static INLINE unsigned int highbd_10_masked_variancewxh_ssse3(
-    const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
-    const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) {
-  uint64_t sse64;
-  int64_t sum64;
-
-  if (w == 4)
-    highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride,
-                                         h, &sum64, &sse64);
-  else
-    highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h,
-                                   &sum64, &sse64);
-
-  // Normalise
-  sum64 = ROUND_POWER_OF_TWO(sum64, 2);
-  sse64 = ROUND_POWER_OF_TWO(sse64, 4);
-
-  // Store the SSE
-  *sse = (uint32_t)sse64;
-  // Compute and return variance
-  return *sse - (uint32_t)((sum64 * sum64) / (w * h));
-}
-
-static INLINE unsigned int highbd_12_masked_variancewxh_ssse3(
-    const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
-    const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) {
-  uint64_t sse64;
-  int64_t sum64;
-
-  if (w == 4)
-    highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride,
-                                         h, &sum64, &sse64);
-  else
-    highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h,
-                                   &sum64, &sse64);
-
-  sum64 = ROUND_POWER_OF_TWO(sum64, 4);
-  sse64 = ROUND_POWER_OF_TWO(sse64, 8);
-
-  // Store the SSE
-  *sse = (uint32_t)sse64;
-  // Compute and return variance
-  return *sse - (uint32_t)((sum64 * sum64) / (w * h));
-}
-
-#define HIGHBD_MASKED_VARWXH(W, H)                                         \
-  unsigned int aom_highbd_masked_variance##W##x##H##_ssse3(                \
-      const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride,    \
-      const uint8_t *m, int m_stride, unsigned int *sse) {                 \
-    uint16_t *a = CONVERT_TO_SHORTPTR(a8);                                 \
-    uint16_t *b = CONVERT_TO_SHORTPTR(b8);                                 \
-    return highbd_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m,    \
-                                           m_stride, W, H, sse);           \
-  }                                                                        \
-                                                                           \
-  unsigned int aom_highbd_10_masked_variance##W##x##H##_ssse3(             \
-      const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride,    \
-      const uint8_t *m, int m_stride, unsigned int *sse) {                 \
-    uint16_t *a = CONVERT_TO_SHORTPTR(a8);                                 \
-    uint16_t *b = CONVERT_TO_SHORTPTR(b8);                                 \
-    return highbd_10_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \
-                                              m_stride, W, H, sse);        \
-  }                                                                        \
-                                                                           \
-  unsigned int aom_highbd_12_masked_variance##W##x##H##_ssse3(             \
-      const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride,    \
-      const uint8_t *m, int m_stride, unsigned int *sse) {                 \
-    uint16_t *a = CONVERT_TO_SHORTPTR(a8);                                 \
-    uint16_t *b = CONVERT_TO_SHORTPTR(b8);                                 \
-    return highbd_12_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \
-                                              m_stride, W, H, sse);        \
-  }
-
-HIGHBD_MASKED_VARWXH(4, 4)
-HIGHBD_MASKED_VARWXH(4, 8)
-HIGHBD_MASKED_VARWXH(8, 4)
-HIGHBD_MASKED_VARWXH(8, 8)
-HIGHBD_MASKED_VARWXH(8, 16)
-HIGHBD_MASKED_VARWXH(16, 8)
-HIGHBD_MASKED_VARWXH(16, 16)
-HIGHBD_MASKED_VARWXH(16, 32)
-HIGHBD_MASKED_VARWXH(32, 16)
-HIGHBD_MASKED_VARWXH(32, 32)
-HIGHBD_MASKED_VARWXH(32, 64)
-HIGHBD_MASKED_VARWXH(64, 32)
-HIGHBD_MASKED_VARWXH(64, 64)
-#if CONFIG_EXT_PARTITION
-HIGHBD_MASKED_VARWXH(64, 128)
-HIGHBD_MASKED_VARWXH(128, 64)
-HIGHBD_MASKED_VARWXH(128, 128)
-#endif  // CONFIG_EXT_PARTITION
-
-#endif  // CONFIG_HIGHBITDEPTH
-
-//////////////////////////////////////////////////////////////////////////////
-// Sub pixel versions
-//////////////////////////////////////////////////////////////////////////////
-
-typedef __m128i (*filter_fn_t)(__m128i v_a_b, __m128i v_b_b,
-                               __m128i v_filter_b);
-
-static INLINE __m128i apply_filter_avg(const __m128i v_a_b, const __m128i v_b_b,
-                                       const __m128i v_filter_b) {
-  (void)v_filter_b;
-  return _mm_avg_epu8(v_a_b, v_b_b);
-}
-
-static INLINE __m128i apply_filter(const __m128i v_a_b, const __m128i v_b_b,
-                                   const __m128i v_filter_b) {
-  const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1));
-  __m128i v_input_lo_b = _mm_unpacklo_epi8(v_a_b, v_b_b);
-  __m128i v_input_hi_b = _mm_unpackhi_epi8(v_a_b, v_b_b);
-  __m128i v_temp0_w = _mm_maddubs_epi16(v_input_lo_b, v_filter_b);
-  __m128i v_temp1_w = _mm_maddubs_epi16(v_input_hi_b, v_filter_b);
-  __m128i v_res_lo_w =
-      _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS);
-  __m128i v_res_hi_w =
-      _mm_srai_epi16(_mm_add_epi16(v_temp1_w, v_rounding_w), FILTER_BITS);
-  return _mm_packus_epi16(v_res_lo_w, v_res_hi_w);
-}
-
-// Apply the filter to the contents of the lower half of a and b
-static INLINE void apply_filter_lo(const __m128i v_a_lo_b,
-                                   const __m128i v_b_lo_b,
-                                   const __m128i v_filter_b, __m128i *v_res_w) {
-  const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1));
-  __m128i v_input_b = _mm_unpacklo_epi8(v_a_lo_b, v_b_lo_b);
-  __m128i v_temp0_w = _mm_maddubs_epi16(v_input_b, v_filter_b);
-  *v_res_w =
-      _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS);
-}
-
-static void sum_and_sse(const __m128i v_a_b, const __m128i v_b_b,
-                        const __m128i v_m_b, __m128i *v_sum_d,
-                        __m128i *v_sse_q) {
-  const __m128i v_zero = _mm_setzero_si128();
-  // Unpack to 16 bits - still containing max 8 bits
-  const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero);
-  const __m128i v_b0_w = _mm_unpacklo_epi8(v_b_b, v_zero);
-  const __m128i v_m0_w = _mm_unpacklo_epi8(v_m_b, v_zero);
-  const __m128i v_a1_w = _mm_unpackhi_epi8(v_a_b, v_zero);
-  const __m128i v_b1_w = _mm_unpackhi_epi8(v_b_b, v_zero);
-  const __m128i v_m1_w = _mm_unpackhi_epi8(v_m_b, v_zero);
-
-  // Difference: [-255, 255]
-  const __m128i v_d0_w = _mm_sub_epi16(v_a0_w, v_b0_w);
-  const __m128i v_d1_w = _mm_sub_epi16(v_a1_w, v_b1_w);
-
-  // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits
-  const __m128i v_e0_w = _mm_mullo_epi16(v_d0_w, v_m0_w);
-  const __m128i v_e0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
-  const __m128i v_e1_w = _mm_mullo_epi16(v_d1_w, v_m1_w);
-  const __m128i v_e1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
-
-  // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
-  const __m128i v_se0_d = _mm_madd_epi16(v_e0_w, v_e0_w);
-  const __m128i v_se1_d = _mm_madd_epi16(v_e1_w, v_e1_w);
-
-  // Sum of v_se{0,1}_d - 31 bits + 31 bits = 32 bits
-  const __m128i v_se_d = _mm_add_epi32(v_se0_d, v_se1_d);
-
-  // Unpack Squared error to 64 bits
-  const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
-  const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
-
-  // Accumulate
-  *v_sum_d = _mm_add_epi32(*v_sum_d, v_e0_d);
-  *v_sum_d = _mm_add_epi32(*v_sum_d, v_e1_d);
-  *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_lo_q);
-  *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_hi_q);
-}
-
-// Functions for width (W) >= 16
-unsigned int aom_masked_subpel_varWxH_xzero(const uint8_t *src, int src_stride,
-                                            int yoffset, const uint8_t *dst,
-                                            int dst_stride, const uint8_t *msk,
-                                            int msk_stride, unsigned int *sse,
-                                            int w, int h,
-                                            filter_fn_t filter_fn) {
-  int i, j;
-  __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-  const __m128i v_filter_b = _mm_set1_epi16(
-      (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]);
-  assert(yoffset < BIL_SUBPEL_SHIFTS);
-  for (j = 0; j < w; j += 16) {
-    // Load the first row ready
-    v_src0_b = _mm_loadu_si128((const __m128i *)(src + j));
-    // Process 2 rows at a time
-    for (i = 0; i < h; i += 2) {
-      // Load the next row and apply the filter
-      v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride));
-      v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b);
-      // Load the dst and msk for the variance calculation
-      v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j));
-      v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j));
-      sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
-
-      // Load the next row and apply the filter
-      v_src0_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2));
-      v_res_b = filter_fn(v_src1_b, v_src0_b, v_filter_b);
-      // Load the dst and msk for the variance calculation
-      v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride));
-      v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j + msk_stride));
-      sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
-      // Move onto the next block of rows
-      src += src_stride * 2;
-      dst += dst_stride * 2;
-      msk += msk_stride * 2;
-    }
-    // Reset to the top of the block
-    src -= src_stride * h;
-    dst -= dst_stride * h;
-    msk -= msk_stride * h;
-  }
-  return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
-}
-unsigned int aom_masked_subpel_varWxH_yzero(const uint8_t *src, int src_stride,
-                                            int xoffset, const uint8_t *dst,
-                                            int dst_stride, const uint8_t *msk,
-                                            int msk_stride, unsigned int *sse,
-                                            int w, int h,
-                                            filter_fn_t filter_fn) {
-  int i, j;
-  __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-  const __m128i v_filter_b = _mm_set1_epi16(
-      (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]);
-  assert(xoffset < BIL_SUBPEL_SHIFTS);
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j += 16) {
-      // Load this row and one below & apply the filter to them
-      v_src0_b = _mm_loadu_si128((const __m128i *)(src + j));
-      v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1));
-      v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b);
-
-      // Load the dst and msk for the variance calculation
-      v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j));
-      v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j));
-      sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
-    }
-    src += src_stride;
-    dst += dst_stride;
-    msk += msk_stride;
-  }
-  return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
-}
-unsigned int aom_masked_subpel_varWxH_xnonzero_ynonzero(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
-    unsigned int *sse, int w, int h, filter_fn_t xfilter_fn,
-    filter_fn_t yfilter_fn) {
-  int i, j;
-  __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b;
-  __m128i v_filtered0_b, v_filtered1_b, v_res_b, v_dst_b, v_msk_b;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-  const __m128i v_filterx_b = _mm_set1_epi16(
-      (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]);
-  const __m128i v_filtery_b = _mm_set1_epi16(
-      (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]);
-  assert(yoffset < BIL_SUBPEL_SHIFTS);
-  assert(xoffset < BIL_SUBPEL_SHIFTS);
-  for (j = 0; j < w; j += 16) {
-    // Load the first row ready
-    v_src0_b = _mm_loadu_si128((const __m128i *)(src + j));
-    v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1));
-    v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b);
-    // Process 2 rows at a time
-    for (i = 0; i < h; i += 2) {
-      // Load the next row & apply the filter
-      v_src2_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j));
-      v_src3_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1));
-      v_filtered1_b = xfilter_fn(v_src2_b, v_src3_b, v_filterx_b);
-      // Load the dst and msk for the variance calculation
-      v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j));
-      v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j));
-      // Complete the calculation for this row and add it to the running total
-      v_res_b = yfilter_fn(v_filtered0_b, v_filtered1_b, v_filtery_b);
-      sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
-
-      // Load the next row & apply the filter
-      v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j));
-      v_src1_b =
-          _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1));
-      v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b);
-      // Load the dst and msk for the variance calculation
-      v_dst_b = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j));
-      v_msk_b = _mm_loadu_si128((const __m128i *)(msk + msk_stride + j));
-      // Complete the calculation for this row and add it to the running total
-      v_res_b = yfilter_fn(v_filtered1_b, v_filtered0_b, v_filtery_b);
-      sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
-      // Move onto the next block of rows
-      src += src_stride * 2;
-      dst += dst_stride * 2;
-      msk += msk_stride * 2;
-    }
-    // Reset to the top of the block
-    src -= src_stride * h;
-    dst -= dst_stride * h;
-    msk -= msk_stride * h;
-  }
-  return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
-}
-
-// Rows are loaded in the order xmm[127:96] = row 1, xmm[95:64] = row 2,
-// xmm[63:32] = row 3, xmm[31:0] = row 4
-unsigned int aom_masked_subpel_var4xH_xzero(const uint8_t *src, int src_stride,
-                                            int yoffset, const uint8_t *dst,
-                                            int dst_stride, const uint8_t *msk,
-                                            int msk_stride, unsigned int *sse,
-                                            int h) {
-  int i;
-  __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered1_w, v_filtered2_w;
-  __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b;
-  __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-  __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) +
-                                      bilinear_filters_2t[yoffset][0]);
-  assert(yoffset < BIL_SUBPEL_SHIFTS);
-  // Load the first row of src data ready
-  v_src0_b = _mm_loadl_epi64((const __m128i *)src);
-  for (i = 0; i < h; i += 4) {
-    // Load the rest of the source data for these rows
-    v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
-    v_src1_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b);
-    v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
-    v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
-    v_src3_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b);
-    v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
-    // Load the dst data
-    v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0));
-    v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1));
-    v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b);
-    v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2));
-    v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3));
-    v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b);
-    v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b);
-    // Load the mask data
-    v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0));
-    v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1));
-    v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b);
-    v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2));
-    v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3));
-    v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
-    v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
-    // Apply the y filter
-    if (yoffset == HALF_PIXEL_OFFSET) {
-      v_src1_b = _mm_unpacklo_epi64(v_src3_b, v_src1_b);
-      v_src2_b =
-          _mm_or_si128(_mm_slli_si128(v_src1_b, 4),
-                       _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0)));
-      v_res_b = _mm_avg_epu8(v_src1_b, v_src2_b);
-    } else {
-      v_src2_b =
-          _mm_or_si128(_mm_slli_si128(v_src1_b, 4),
-                       _mm_and_si128(v_src2_b, _mm_setr_epi32(-1, 0, 0, 0)));
-      apply_filter_lo(v_src1_b, v_src2_b, v_filter_b, &v_filtered1_w);
-      v_src2_b =
-          _mm_or_si128(_mm_slli_si128(v_src3_b, 4),
-                       _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0)));
-      apply_filter_lo(v_src3_b, v_src2_b, v_filter_b, &v_filtered2_w);
-      v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered1_w);
-    }
-    // Compute the sum and SSE
-    sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q);
-    // Move onto the next set of rows
-    src += src_stride * 4;
-    dst += dst_stride * 4;
-    msk += msk_stride * 4;
-  }
-  return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
-}
-
-// Rows are loaded in the order xmm[127:64] = row 1, xmm[63:0] = row 2
-unsigned int aom_masked_subpel_var8xH_xzero(const uint8_t *src, int src_stride,
-                                            int yoffset, const uint8_t *dst,
-                                            int dst_stride, const uint8_t *msk,
-                                            int msk_stride, unsigned int *sse,
-                                            int h) {
-  int i;
-  __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_res_b;
-  __m128i v_dst_b = _mm_setzero_si128();
-  __m128i v_msk_b = _mm_setzero_si128();
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-  __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) +
-                                      bilinear_filters_2t[yoffset][0]);
-  assert(yoffset < BIL_SUBPEL_SHIFTS);
-  // Load the first row of src data ready
-  v_src0_b = _mm_loadl_epi64((const __m128i *)src);
-  for (i = 0; i < h; i += 2) {
-    if (yoffset == HALF_PIXEL_OFFSET) {
-      // Load the rest of the source data for these rows
-      v_src1_b = _mm_or_si128(
-          _mm_slli_si128(v_src0_b, 8),
-          _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)));
-      v_src0_b = _mm_or_si128(
-          _mm_slli_si128(v_src1_b, 8),
-          _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)));
-      // Apply the y filter
-      v_res_b = _mm_avg_epu8(v_src1_b, v_src0_b);
-    } else {
-      // Load the data and apply the y filter
-      v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
-      apply_filter_lo(v_src0_b, v_src1_b, v_filter_b, &v_filtered0_w);
-      v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
-      apply_filter_lo(v_src1_b, v_src0_b, v_filter_b, &v_filtered1_w);
-      v_res_b = _mm_packus_epi16(v_filtered1_w, v_filtered0_w);
-    }
-    // Load the dst data
-    v_dst_b = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)),
-        _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)));
-    // Load the mask data
-    v_msk_b = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)),
-        _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)));
-    // Compute the sum and SSE
-    sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
-    // Move onto the next set of rows
-    src += src_stride * 2;
-    dst += dst_stride * 2;
-    msk += msk_stride * 2;
-  }
-  return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
-}
-
-// Rows are loaded in the order xmm[127:96] = row 1, xmm[95:64] = row 2,
-// xmm[63:32] = row 3, xmm[31:0] = row 4
-unsigned int aom_masked_subpel_var4xH_yzero(const uint8_t *src, int src_stride,
-                                            int xoffset, const uint8_t *dst,
-                                            int dst_stride, const uint8_t *msk,
-                                            int msk_stride, unsigned int *sse,
-                                            int h) {
-  int i;
-  __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w;
-  __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b;
-  __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b;
-  __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-  __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) +
-                                      bilinear_filters_2t[xoffset][0]);
-  assert(xoffset < BIL_SUBPEL_SHIFTS);
-  for (i = 0; i < h; i += 4) {
-    // Load the src data
-    v_src0_b = _mm_loadl_epi64((const __m128i *)src);
-    v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
-    v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
-    v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b);
-    v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
-    v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
-    v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b);
-    v_src2_shift_b = _mm_srli_si128(v_src2_b, 1);
-    v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
-    v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b);
-    v_src3_shift_b = _mm_srli_si128(v_src3_b, 1);
-    v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b);
-    // Load the dst data
-    v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0));
-    v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1));
-    v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b);
-    v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2));
-    v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3));
-    v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b);
-    v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b);
-    // Load the mask data
-    v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0));
-    v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1));
-    v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b);
-    v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2));
-    v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3));
-    v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
-    v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
-    // Apply the x filter
-    if (xoffset == HALF_PIXEL_OFFSET) {
-      v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b);
-      v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b);
-      v_res_b = _mm_avg_epu8(v_src0_b, v_src0_shift_b);
-    } else {
-      apply_filter_lo(v_src0_b, v_src0_shift_b, v_filter_b, &v_filtered0_w);
-      apply_filter_lo(v_src2_b, v_src2_shift_b, v_filter_b, &v_filtered2_w);
-      v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered0_w);
-    }
-    // Compute the sum and SSE
-    sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q);
-    // Move onto the next set of rows
-    src += src_stride * 4;
-    dst += dst_stride * 4;
-    msk += msk_stride * 4;
-  }
-  return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
-}
-
-unsigned int aom_masked_subpel_var8xH_yzero(const uint8_t *src, int src_stride,
-                                            int xoffset, const uint8_t *dst,
-                                            int dst_stride, const uint8_t *msk,
-                                            int msk_stride, unsigned int *sse,
-                                            int h) {
-  int i;
-  __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w;
-  __m128i v_src0_shift_b, v_src1_shift_b, v_res_b, v_dst_b, v_msk_b;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-  __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) +
-                                      bilinear_filters_2t[xoffset][0]);
-  assert(xoffset < BIL_SUBPEL_SHIFTS);
-  for (i = 0; i < h; i += 2) {
-    // Load the src data
-    v_src0_b = _mm_loadu_si128((const __m128i *)(src));
-    v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
-    v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride));
-    v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
-    // Apply the x filter
-    if (xoffset == HALF_PIXEL_OFFSET) {
-      v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
-      v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
-      v_res_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
-    } else {
-      apply_filter_lo(v_src0_b, v_src0_shift_b, v_filter_b, &v_filtered0_w);
-      apply_filter_lo(v_src1_b, v_src1_shift_b, v_filter_b, &v_filtered1_w);
-      v_res_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
-    }
-    // Load the dst data
-    v_dst_b = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
-        _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
-    // Load the mask data
-    v_msk_b = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
-        _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
-    // Compute the sum and SSE
-    sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
-    // Move onto the next set of rows
-    src += src_stride * 2;
-    dst += dst_stride * 2;
-    msk += msk_stride * 2;
-  }
-  return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
-}
-
-// Rows are loaded in the order xmm[127:96] = row 1, xmm[95:64] = row 2,
-// xmm[63:32] = row 3, xmm[31:0] = row 4
-unsigned int aom_masked_subpel_var4xH_xnonzero_ynonzero(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
-    unsigned int *sse, int h) {
-  int i;
-  __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w;
-  __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b;
-  __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b, v_temp_b;
-  __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_extra_row_b, v_res_b;
-  __m128i v_xres_b[2];
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-  __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) +
-                                       bilinear_filters_2t[xoffset][0]);
-  __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) +
-                                       bilinear_filters_2t[yoffset][0]);
-  assert(xoffset < BIL_SUBPEL_SHIFTS);
-  assert(yoffset < BIL_SUBPEL_SHIFTS);
-  for (i = 0; i < h; i += 4) {
-    // Load the src data
-    v_src0_b = _mm_loadl_epi64((const __m128i *)src);
-    v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
-    v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
-    v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b);
-    v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
-    v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
-    v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b);
-    v_src2_shift_b = _mm_srli_si128(v_src2_b, 1);
-    v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
-    v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b);
-    v_src3_shift_b = _mm_srli_si128(v_src3_b, 1);
-    v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b);
-    // Apply the x filter
-    if (xoffset == HALF_PIXEL_OFFSET) {
-      v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b);
-      v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b);
-      v_xres_b[i == 0 ? 0 : 1] = _mm_avg_epu8(v_src0_b, v_src0_shift_b);
-    } else {
-      apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
-      apply_filter_lo(v_src2_b, v_src2_shift_b, v_filterx_b, &v_filtered2_w);
-      v_xres_b[i == 0 ? 0 : 1] = _mm_packus_epi16(v_filtered2_w, v_filtered0_w);
-    }
-    // Move onto the next set of rows
-    src += src_stride * 4;
-  }
-  // Load one more row to be used in the y filter
-  v_src0_b = _mm_loadl_epi64((const __m128i *)src);
-  v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
-  // Apply the x filter
-  if (xoffset == HALF_PIXEL_OFFSET) {
-    v_extra_row_b = _mm_and_si128(_mm_avg_epu8(v_src0_b, v_src0_shift_b),
-                                  _mm_setr_epi32(-1, 0, 0, 0));
-  } else {
-    apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
-    v_extra_row_b =
-        _mm_and_si128(_mm_packus_epi16(v_filtered0_w, _mm_setzero_si128()),
-                      _mm_setr_epi32(-1, 0, 0, 0));
-  }
-
-  for (i = 0; i < h; i += 4) {
-    if (h == 8 && i == 0) {
-      v_temp_b = _mm_or_si128(_mm_slli_si128(v_xres_b[0], 4),
-                              _mm_srli_si128(v_xres_b[1], 12));
-    } else {
-      v_temp_b = _mm_or_si128(_mm_slli_si128(v_xres_b[i == 0 ? 0 : 1], 4),
-                              v_extra_row_b);
-    }
-    // Apply the y filter
-    if (yoffset == HALF_PIXEL_OFFSET) {
-      v_res_b = _mm_avg_epu8(v_xres_b[i == 0 ? 0 : 1], v_temp_b);
-    } else {
-      v_res_b = apply_filter(v_xres_b[i == 0 ? 0 : 1], v_temp_b, v_filtery_b);
-    }
-
-    // Load the dst data
-    v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0));
-    v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1));
-    v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b);
-    v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2));
-    v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3));
-    v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b);
-    v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b);
-    // Load the mask data
-    v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0));
-    v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1));
-    v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b);
-    v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2));
-    v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3));
-    v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
-    v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
-    // Compute the sum and SSE
-    sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q);
-    // Move onto the next set of rows
-    dst += dst_stride * 4;
-    msk += msk_stride * 4;
-  }
-  return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
-}
-
-unsigned int aom_masked_subpel_var8xH_xnonzero_ynonzero(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
-    unsigned int *sse, int h) {
-  int i;
-  __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_dst_b, v_msk_b;
-  __m128i v_src0_shift_b, v_src1_shift_b;
-  __m128i v_xres0_b, v_xres1_b, v_res_b, v_temp_b;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-  __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) +
-                                       bilinear_filters_2t[xoffset][0]);
-  __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) +
-                                       bilinear_filters_2t[yoffset][0]);
-  assert(xoffset < BIL_SUBPEL_SHIFTS);
-  assert(yoffset < BIL_SUBPEL_SHIFTS);
-  // Load the first block of src data
-  v_src0_b = _mm_loadu_si128((const __m128i *)(src));
-  v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
-  v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride));
-  v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
-  // Apply the x filter
-  if (xoffset == HALF_PIXEL_OFFSET) {
-    v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
-    v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
-    v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
-  } else {
-    apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
-    apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w);
-    v_xres0_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
-  }
-  for (i = 0; i < h; i += 4) {
-    // Load the next block of src data
-    v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2));
-    v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
-    v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 3));
-    v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
-    // Apply the x filter
-    if (xoffset == HALF_PIXEL_OFFSET) {
-      v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
-      v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
-      v_xres1_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
-    } else {
-      apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
-      apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w);
-      v_xres1_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
-    }
-    // Apply the y filter to the previous block
-    v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres0_b, 8),
-                            _mm_slli_si128(v_xres1_b, 8));
-    if (yoffset == HALF_PIXEL_OFFSET) {
-      v_res_b = _mm_avg_epu8(v_xres0_b, v_temp_b);
-    } else {
-      v_res_b = apply_filter(v_xres0_b, v_temp_b, v_filtery_b);
-    }
-    // Load the dst data
-    v_dst_b = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
-        _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
-    // Load the mask data
-    v_msk_b = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
-        _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
-    // Compute the sum and SSE
-    sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
-
-    // Load the next block of src data
-    v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 4));
-    v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
-    v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 5));
-    v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
-    // Apply the x filter
-    if (xoffset == HALF_PIXEL_OFFSET) {
-      v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
-      v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
-      v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
-    } else {
-      apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
-      apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w);
-      v_xres0_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
-    }
-    // Apply the y filter to the previous block
-    v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres1_b, 8),
-                            _mm_slli_si128(v_xres0_b, 8));
-    if (yoffset == HALF_PIXEL_OFFSET) {
-      v_res_b = _mm_avg_epu8(v_xres1_b, v_temp_b);
-    } else {
-      v_res_b = apply_filter(v_xres1_b, v_temp_b, v_filtery_b);
-    }
-    // Load the dst data
-    v_dst_b = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)),
-        _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3)));
-    // Load the mask data
-    v_msk_b = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)),
-        _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3)));
-    // Compute the sum and SSE
-    sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
-    // Move onto the next set of rows
-    src += src_stride * 4;
-    dst += dst_stride * 4;
-    msk += msk_stride * 4;
-  }
-  return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
-}
-
-// For W >= 16
-#define MASK_SUBPIX_VAR_LARGE(W, H)                                            \
-  unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3(                 \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,  \
-      unsigned int *sse) {                                                     \
-    assert(W % 16 == 0);                                                       \
-    if (xoffset == 0) {                                                        \
-      if (yoffset == 0)                                                        \
-        return aom_masked_variance##W##x##H##_ssse3(                           \
-            src, src_stride, dst, dst_stride, msk, msk_stride, sse);           \
-      else if (yoffset == HALF_PIXEL_OFFSET)                                   \
-        return aom_masked_subpel_varWxH_xzero(                                 \
-            src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk,          \
-            msk_stride, sse, W, H, apply_filter_avg);                          \
-      else                                                                     \
-        return aom_masked_subpel_varWxH_xzero(src, src_stride, yoffset, dst,   \
-                                              dst_stride, msk, msk_stride,     \
-                                              sse, W, H, apply_filter);        \
-    } else if (yoffset == 0) {                                                 \
-      if (xoffset == HALF_PIXEL_OFFSET)                                        \
-        return aom_masked_subpel_varWxH_yzero(                                 \
-            src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk,          \
-            msk_stride, sse, W, H, apply_filter_avg);                          \
-      else                                                                     \
-        return aom_masked_subpel_varWxH_yzero(src, src_stride, xoffset, dst,   \
-                                              dst_stride, msk, msk_stride,     \
-                                              sse, W, H, apply_filter);        \
-    } else if (xoffset == HALF_PIXEL_OFFSET) {                                 \
-      if (yoffset == HALF_PIXEL_OFFSET)                                        \
-        return aom_masked_subpel_varWxH_xnonzero_ynonzero(                     \
-            src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst,        \
-            dst_stride, msk, msk_stride, sse, W, H, apply_filter_avg,          \
-            apply_filter_avg);                                                 \
-      else                                                                     \
-        return aom_masked_subpel_varWxH_xnonzero_ynonzero(                     \
-            src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \
-            msk_stride, sse, W, H, apply_filter_avg, apply_filter);            \
-    } else {                                                                   \
-      if (yoffset == HALF_PIXEL_OFFSET)                                        \
-        return aom_masked_subpel_varWxH_xnonzero_ynonzero(                     \
-            src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
-            msk_stride, sse, W, H, apply_filter, apply_filter_avg);            \
-      else                                                                     \
-        return aom_masked_subpel_varWxH_xnonzero_ynonzero(                     \
-            src, src_stride, xoffset, yoffset, dst, dst_stride, msk,           \
-            msk_stride, sse, W, H, apply_filter, apply_filter);                \
-    }                                                                          \
-  }
-
-// For W < 16
-#define MASK_SUBPIX_VAR_SMALL(W, H)                                            \
-  unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3(                 \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,  \
-      unsigned int *sse) {                                                     \
-    assert(W == 4 || W == 8);                                                  \
-    if (xoffset == 0 && yoffset == 0)                                          \
-      return aom_masked_variance##W##x##H##_ssse3(                             \
-          src, src_stride, dst, dst_stride, msk, msk_stride, sse);             \
-    else if (xoffset == 0)                                                     \
-      return aom_masked_subpel_var##W##xH_xzero(                               \
-          src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H); \
-    else if (yoffset == 0)                                                     \
-      return aom_masked_subpel_var##W##xH_yzero(                               \
-          src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H); \
-    else                                                                       \
-      return aom_masked_subpel_var##W##xH_xnonzero_ynonzero(                   \
-          src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \
-          sse, H);                                                             \
-  }
-
-MASK_SUBPIX_VAR_SMALL(4, 4)
-MASK_SUBPIX_VAR_SMALL(4, 8)
-MASK_SUBPIX_VAR_SMALL(8, 4)
-MASK_SUBPIX_VAR_SMALL(8, 8)
-MASK_SUBPIX_VAR_SMALL(8, 16)
-MASK_SUBPIX_VAR_LARGE(16, 8)
-MASK_SUBPIX_VAR_LARGE(16, 16)
-MASK_SUBPIX_VAR_LARGE(16, 32)
-MASK_SUBPIX_VAR_LARGE(32, 16)
-MASK_SUBPIX_VAR_LARGE(32, 32)
-MASK_SUBPIX_VAR_LARGE(32, 64)
-MASK_SUBPIX_VAR_LARGE(64, 32)
-MASK_SUBPIX_VAR_LARGE(64, 64)
-#if CONFIG_EXT_PARTITION
-MASK_SUBPIX_VAR_LARGE(64, 128)
-MASK_SUBPIX_VAR_LARGE(128, 64)
-MASK_SUBPIX_VAR_LARGE(128, 128)
-#endif  // CONFIG_EXT_PARTITION
-
-#if CONFIG_HIGHBITDEPTH
-typedef uint32_t (*highbd_calc_masked_var_t)(__m128i v_sum_d, __m128i v_sse_q,
-                                             uint32_t *sse, int w, int h);
-typedef unsigned int (*highbd_variance_fn_t)(const uint8_t *a8, int a_stride,
-                                             const uint8_t *b8, int b_stride,
-                                             const uint8_t *m, int m_stride,
-                                             unsigned int *sse);
-typedef __m128i (*highbd_filter_fn_t)(__m128i v_a_w, __m128i v_b_w,
-                                      __m128i v_filter_w);
-
-static INLINE __m128i highbd_apply_filter_avg(const __m128i v_a_w,
-                                              const __m128i v_b_w,
-                                              const __m128i v_filter_w) {
-  (void)v_filter_w;
-  return _mm_avg_epu16(v_a_w, v_b_w);
-}
-
-static INLINE __m128i highbd_apply_filter(const __m128i v_a_w,
-                                          const __m128i v_b_w,
-                                          const __m128i v_filter_w) {
-  const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1));
-  __m128i v_input_lo_w = _mm_unpacklo_epi16(v_a_w, v_b_w);
-  __m128i v_input_hi_w = _mm_unpackhi_epi16(v_a_w, v_b_w);
-  __m128i v_temp0_d = _mm_madd_epi16(v_input_lo_w, v_filter_w);
-  __m128i v_temp1_d = _mm_madd_epi16(v_input_hi_w, v_filter_w);
-  __m128i v_res_lo_d =
-      _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS);
-  __m128i v_res_hi_d =
-      _mm_srai_epi32(_mm_add_epi32(v_temp1_d, v_rounding_d), FILTER_BITS);
-  return _mm_packs_epi32(v_res_lo_d, v_res_hi_d);
-}
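[The kernel of highbd_apply_filter above is the interleave-then-madd trick: each 32-bit lane of _mm_unpacklo/hi_epi16(a, b) holds one (a_i, b_i) pair, and multiplying it with the taps packed as (f0, f1) in every lane yields a_i*f0 + b_i*f1 in a single _mm_madd_epi16. A small self-checking example follows; the tap values are illustrative (7-bit taps are assumed), not pulled from the library's tables.]

#include <assert.h>
#include <emmintrin.h>
#include <stdint.h>

int main(void) {
  const int16_t f0 = 96, f1 = 32; /* assumed 7-bit taps: f0 + f1 == 128 */
  const __m128i v_filter = _mm_set1_epi32((f1 << 16) | f0);
  const __m128i v_round = _mm_set1_epi32(1 << 6); /* 1 << (FILTER_BITS - 1) */
  const __m128i v_a = _mm_set1_epi16(100);
  const __m128i v_b = _mm_set1_epi16(200);
  /* Each 32-bit lane of v_pair is (a, b); madd with (f0, f1) gives a*f0 + b*f1. */
  const __m128i v_pair = _mm_unpacklo_epi16(v_a, v_b);
  const __m128i v_acc = _mm_madd_epi16(v_pair, v_filter);
  const __m128i v_res = _mm_srai_epi32(_mm_add_epi32(v_acc, v_round), 7);
  assert(_mm_cvtsi128_si32(v_res) == ((100 * 96 + 200 * 32 + 64) >> 7));
  return 0;
}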
-// Apply the filter to the contents of the lower half of a and b
-static INLINE void highbd_apply_filter_lo(const __m128i v_a_lo_w,
-                                          const __m128i v_b_lo_w,
-                                          const __m128i v_filter_w,
-                                          __m128i *v_res_d) {
-  const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1));
-  __m128i v_input_w = _mm_unpacklo_epi16(v_a_lo_w, v_b_lo_w);
-  __m128i v_temp0_d = _mm_madd_epi16(v_input_w, v_filter_w);
-  *v_res_d =
-      _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS);
-}
-
-static void highbd_sum_and_sse(const __m128i v_a_w, const __m128i v_b_w,
-                               const __m128i v_m_b, __m128i *v_sum_d,
-                               __m128i *v_sse_q) {
-  const __m128i v_zero = _mm_setzero_si128();
-  const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
-
-  // Difference: [-2^12, 2^12] => 13 bits (incld sign bit)
-  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-
-  // Error - [-4095, 4095] * [0, 64] & sum pairs => fits in 19 + 1 bits
-  const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
-
-  // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
-  const __m128i v_absd_w = _mm_abs_epi16(v_d_w);
-  const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero);
-  const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero);
-  const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d);
-  const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero);
-  const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero);
-  const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d);
-  // Square and sum the errors -> 36bits * 4 = 38bits
-  __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d;
-  v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d);
-  v_elo1_d = _mm_srli_si128(v_elo_d, 4);
-  v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d);
-  v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q);
-  v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d);
-  v_ehi3_d = _mm_srli_si128(v_ehi_d, 4);
-  v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d);
-  v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q);
-  v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
-
-  // Accumulate
-  *v_sum_d = _mm_add_epi32(*v_sum_d, v_e_d);
-  *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_q);
-}
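[What highbd_sum_and_sse accumulates, written out in scalar form: sum collects m*(a - b) and sse collects its square. With 12-bit pixels and a 0..64 mask each product needs roughly 18 bits and each square roughly 36 bits, which is why the squares are widened to 64-bit lanes before being added to v_sse_q. The helper name below is illustrative only.]

#include <stdint.h>

static void masked_sum_and_sse_sketch(const uint16_t *a, const uint16_t *b,
                                      const uint8_t *m, int n,
                                      int64_t *sum, uint64_t *sse) {
  for (int i = 0; i < n; ++i) {
    const int64_t e = (int64_t)m[i] * ((int)a[i] - (int)b[i]);
    *sum += e;                 /* mask-weighted difference */
    *sse += (uint64_t)(e * e); /* and its square, accumulated in 64 bits */
  }
}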
-
-static INLINE uint32_t highbd_10_calc_masked_variance(__m128i v_sum_d,
-                                                      __m128i v_sse_q,
-                                                      uint32_t *sse, int w,
-                                                      int h) {
-  int64_t sum64;
-  uint64_t sse64;
-
-  // Horizontal sum
-  sum64 = hsum_epi32_si32(v_sum_d);
-  sse64 = hsum_epi64_si64(v_sse_q);
-
-  sum64 = (sum64 >= 0) ? sum64 : -sum64;
-
-  // Round
-  sum64 = ROUND_POWER_OF_TWO(sum64, 6);
-  sse64 = ROUND_POWER_OF_TWO(sse64, 12);
-
-  // Normalise
-  sum64 = ROUND_POWER_OF_TWO(sum64, 2);
-  sse64 = ROUND_POWER_OF_TWO(sse64, 4);
-
-  // Store the SSE
-  *sse = (uint32_t)sse64;
-  // Compute the variance
-  return *sse - (uint32_t)((sum64 * sum64) / (w * h));
-}
-static INLINE uint32_t highbd_12_calc_masked_variance(__m128i v_sum_d,
-                                                      __m128i v_sse_q,
-                                                      uint32_t *sse, int w,
-                                                      int h) {
-  int64_t sum64;
-  uint64_t sse64;
-
-  // Horizontal sum
-  sum64 = hsum_epi32_si64(v_sum_d);
-  sse64 = hsum_epi64_si64(v_sse_q);
-
-  sum64 = (sum64 >= 0) ? sum64 : -sum64;
-
-  // Round
-  sum64 = ROUND_POWER_OF_TWO(sum64, 6);
-  sse64 = ROUND_POWER_OF_TWO(sse64, 12);
-
-  // Normalise
-  sum64 = ROUND_POWER_OF_TWO(sum64, 4);
-  sse64 = ROUND_POWER_OF_TWO(sse64, 8);
-
-  // Store the SSE
-  *sse = (uint32_t)sse64;
-  // Compute the variance
-  return *sse - (uint32_t)((sum64 * sum64) / (w * h));
-}
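[The finalisation above, in scalar form: the 6- and 12-bit shifts undo the 0..64 mask weighting on the sum and the sum of squares, the next pair of shifts normalises for bit depth (2/4 bits for 10-bit input, 4/8 bits for 12-bit), and the result is the usual variance identity sse - sum^2 / (w*h). Sketch of the 10-bit variant; the macro and function names are the sketch's own.]

#include <stdint.h>

#define SKETCH_ROUND_POWER_OF_TWO(x, n) (((x) + (1 << ((n)-1))) >> (n))

static uint32_t masked_variance_10_sketch(int64_t sum, uint64_t sse,
                                          uint32_t *sse_out, int w, int h) {
  uint64_t s = (uint64_t)(sum >= 0 ? sum : -sum);
  s = SKETCH_ROUND_POWER_OF_TWO(s, 6);      /* undo the 0..64 mask weight     */
  sse = SKETCH_ROUND_POWER_OF_TWO(sse, 12); /* undo the squared mask weight   */
  s = SKETCH_ROUND_POWER_OF_TWO(s, 2);      /* 10-bit samples -> 8-bit scale  */
  sse = SKETCH_ROUND_POWER_OF_TWO(sse, 4);
  *sse_out = (uint32_t)sse;
  return *sse_out - (uint32_t)((s * s) / (uint64_t)(w * h));
}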
-
-// High bit depth functions for width (W) >= 8
-unsigned int aom_highbd_masked_subpel_varWxH_xzero(
-    const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst,
-    int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse,
-    int w, int h, highbd_filter_fn_t filter_fn,
-    highbd_calc_masked_var_t calc_var) {
-  int i, j;
-  __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-  const __m128i v_filter_w =
-      _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) +
-                     bilinear_filters_2t[yoffset][0]);
-  assert(yoffset < BIL_SUBPEL_SHIFTS);
-  for (j = 0; j < w; j += 8) {
-    // Load the first row ready
-    v_src0_w = _mm_loadu_si128((const __m128i *)(src + j));
-    // Process 2 rows at a time
-    for (i = 0; i < h; i += 2) {
-      // Load the next row apply the filter
-      v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride));
-      v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w);
-      // Load the dst and msk for the variance calculation
-      v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j));
-      v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j));
-      highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
-
-      // Load the next row apply the filter
-      v_src0_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2));
-      v_res_w = filter_fn(v_src1_w, v_src0_w, v_filter_w);
-      // Load the dst and msk for the variance calculation
-      v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride));
-      v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j + msk_stride));
-      highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
-      // Move onto the next block of rows
-      src += src_stride * 2;
-      dst += dst_stride * 2;
-      msk += msk_stride * 2;
-    }
-    // Reset to the top of the block
-    src -= src_stride * h;
-    dst -= dst_stride * h;
-    msk -= msk_stride * h;
-  }
-  return calc_var(v_sum_d, v_sse_q, sse, w, h);
-}
-unsigned int aom_highbd_masked_subpel_varWxH_yzero(
-    const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst,
-    int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse,
-    int w, int h, highbd_filter_fn_t filter_fn,
-    highbd_calc_masked_var_t calc_var) {
-  int i, j;
-  __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-  const __m128i v_filter_w =
-      _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) +
-                     bilinear_filters_2t[xoffset][0]);
-  assert(xoffset < BIL_SUBPEL_SHIFTS);
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j += 8) {
-      // Load this row & apply the filter to them
-      v_src0_w = _mm_loadu_si128((const __m128i *)(src + j));
-      v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1));
-      v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w);
-
-      // Load the dst and msk for the variance calculation
-      v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j));
-      v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j));
-      highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
-    }
-    src += src_stride;
-    dst += dst_stride;
-    msk += msk_stride;
-  }
-  return calc_var(v_sum_d, v_sse_q, sse, w, h);
-}
-
-unsigned int aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero(
-    const uint16_t *src, int src_stride, int xoffset, int yoffset,
-    const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
-    unsigned int *sse, int w, int h, highbd_filter_fn_t xfilter_fn,
-    highbd_filter_fn_t yfilter_fn, highbd_calc_masked_var_t calc_var) {
-  int i, j;
-  __m128i v_src0_w, v_src1_w, v_src2_w, v_src3_w;
-  __m128i v_filtered0_w, v_filtered1_w, v_res_w, v_dst_w, v_msk_b;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-  const __m128i v_filterx_w =
-      _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) +
-                     bilinear_filters_2t[xoffset][0]);
-  const __m128i v_filtery_w =
-      _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) +
-                     bilinear_filters_2t[yoffset][0]);
-  assert(xoffset < BIL_SUBPEL_SHIFTS);
-  assert(yoffset < BIL_SUBPEL_SHIFTS);
-  for (j = 0; j < w; j += 8) {
-    // Load the first row ready
-    v_src0_w = _mm_loadu_si128((const __m128i *)(src + j));
-    v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1));
-    v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w);
-    // Process 2 rows at a time
-    for (i = 0; i < h; i += 2) {
-      // Load the next row & apply the filter
-      v_src2_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j));
-      v_src3_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1));
-      v_filtered1_w = xfilter_fn(v_src2_w, v_src3_w, v_filterx_w);
-      // Load the dst and msk for the variance calculation
-      v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j));
-      v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j));
-      // Complete the calculation for this row and add it to the running total
-      v_res_w = yfilter_fn(v_filtered0_w, v_filtered1_w, v_filtery_w);
-      highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
-
-      // Load the next row & apply the filter
-      v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j));
-      v_src1_w =
-          _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1));
-      v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w);
-      // Load the dst and msk for the variance calculation
-      v_dst_w = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j));
-      v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + msk_stride + j));
-      // Complete the calculation for this row and add it to the running total
-      v_res_w = yfilter_fn(v_filtered1_w, v_filtered0_w, v_filtery_w);
-      highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
-      // Move onto the next block of rows
-      src += src_stride * 2;
-      dst += dst_stride * 2;
-      msk += msk_stride * 2;
-    }
-    // Reset to the top of the block
-    src -= src_stride * h;
-    dst -= dst_stride * h;
-    msk -= msk_stride * h;
-  }
-  return calc_var(v_sum_d, v_sse_q, sse, w, h);
-}
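[The loop structure shared by the WxH kernels above: rows are filtered two at a time, and the data fetched for the second output row stays in a register and becomes the top input of the next iteration, so each source row is loaded only once per 8-wide column strip. A pointer-level sketch of that structure; process_row is a hypothetical stand-in for the filter-and-accumulate step.]

#include <stdint.h>

static void vertical_pass_sketch(const uint16_t *src, int src_stride, int h,
                                 void (*process_row)(const uint16_t *top,
                                                     const uint16_t *bot)) {
  const uint16_t *top = src;
  for (int i = 0; i < h; i += 2) {
    const uint16_t *mid = top + src_stride;
    const uint16_t *bot = mid + src_stride;
    process_row(top, mid); /* output row i                             */
    process_row(mid, bot); /* output row i + 1                         */
    top = bot;             /* reused as the top row of the next pair   */
  }
}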
-
-// Note order in which rows loaded xmm[127:64] = row 1, xmm[63:0] = row 2
-unsigned int aom_highbd_masked_subpel_var4xH_xzero(
-    const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst,
-    int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse,
-    int h, highbd_calc_masked_var_t calc_var) {
-  int i;
-  __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_res_w;
-  __m128i v_dst_w, v_msk_b;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-  __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) +
-                                      bilinear_filters_2t[yoffset][0]);
-  assert(yoffset < BIL_SUBPEL_SHIFTS);
-  // Load the first row of src data ready
-  v_src0_w = _mm_loadl_epi64((const __m128i *)src);
-  for (i = 0; i < h; i += 2) {
-    if (yoffset == HALF_PIXEL_OFFSET) {
-      // Load the rest of the source data for these rows
-      v_src1_w = _mm_or_si128(
-          _mm_slli_si128(v_src0_w, 8),
-          _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)));
-      v_src0_w = _mm_or_si128(
-          _mm_slli_si128(v_src1_w, 8),
-          _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)));
-      // Apply the y filter
-      v_res_w = _mm_avg_epu16(v_src1_w, v_src0_w);
-    } else {
-      // Load the data and apply the y filter
-      v_src1_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
-      highbd_apply_filter_lo(v_src0_w, v_src1_w, v_filter_w, &v_filtered0_d);
-      v_src0_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
-      highbd_apply_filter_lo(v_src1_w, v_src0_w, v_filter_w, &v_filtered1_d);
-      v_res_w = _mm_packs_epi32(v_filtered1_d, v_filtered0_d);
-    }
-    // Load the dst data
-    v_dst_w = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)),
-        _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)));
-    // Load the mask data
-    v_msk_b = _mm_unpacklo_epi32(
-        _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)),
-        _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)));
-    // Compute the sum and SSE
-    highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
-    // Move onto the next set of rows
-    src += src_stride * 2;
-    dst += dst_stride * 2;
-    msk += msk_stride * 2;
-  }
-  return calc_var(v_sum_d, v_sse_q, sse, 4, h);
-}
-
-unsigned int aom_highbd_masked_subpel_var4xH_yzero(
-    const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst,
-    int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse,
-    int h, highbd_calc_masked_var_t calc_var) {
-  int i;
-  __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d;
-  __m128i v_src0_shift_w, v_src1_shift_w, v_res_w, v_dst_w, v_msk_b;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-  __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) +
-                                      bilinear_filters_2t[xoffset][0]);
-  assert(xoffset < BIL_SUBPEL_SHIFTS);
-  for (i = 0; i < h; i += 2) {
-    // Load the src data
-    v_src0_w = _mm_loadu_si128((const __m128i *)(src));
-    v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
-    v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride));
-    v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
-    // Apply the x filter
-    if (xoffset == HALF_PIXEL_OFFSET) {
-      v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
-      v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
-      v_res_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
-    } else {
-      highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filter_w,
-                             &v_filtered0_d);
-      highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filter_w,
-                             &v_filtered1_d);
-      v_res_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
-    }
-    // Load the dst data
-    v_dst_w = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
-        _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
-    // Load the mask data
-    v_msk_b = _mm_unpacklo_epi32(
-        _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
-        _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
-    // Compute the sum and SSE
-    highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
-    // Move onto the next set of rows
-    src += src_stride * 2;
-    dst += dst_stride * 2;
-    msk += msk_stride * 2;
-  }
-  return calc_var(v_sum_d, v_sse_q, sse, 4, h);
-}
-
-unsigned int aom_highbd_masked_subpel_var4xH_xnonzero_ynonzero(
-    const uint16_t *src, int src_stride, int xoffset, int yoffset,
-    const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
-    unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) {
-  int i;
-  __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_dst_w, v_msk_b;
-  __m128i v_src0_shift_w, v_src1_shift_w;
-  __m128i v_xres0_w, v_xres1_w, v_res_w, v_temp_w;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_q = _mm_setzero_si128();
-  __m128i v_filterx_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) +
-                                       bilinear_filters_2t[xoffset][0]);
-  __m128i v_filtery_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) +
-                                       bilinear_filters_2t[yoffset][0]);
-  assert(xoffset < BIL_SUBPEL_SHIFTS);
-  assert(yoffset < BIL_SUBPEL_SHIFTS);
-  // Load the first block of src data
-  v_src0_w = _mm_loadu_si128((const __m128i *)(src));
-  v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
-  v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride));
-  v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
-  // Apply the x filter
-  if (xoffset == HALF_PIXEL_OFFSET) {
-    v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
-    v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
-    v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
-  } else {
-    highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w,
-                           &v_filtered0_d);
-    highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w,
-                           &v_filtered1_d);
-    v_xres0_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
-  }
-  for (i = 0; i < h; i += 4) {
-    // Load the next block of src data
-    v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2));
-    v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
-    v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 3));
-    v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
-    // Apply the x filter
-    if (xoffset == HALF_PIXEL_OFFSET) {
-      v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
-      v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
-      v_xres1_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
-    } else {
-      highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w,
-                             &v_filtered0_d);
-      highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w,
-                             &v_filtered1_d);
-      v_xres1_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
-    }
-    // Apply the y filter to the previous block
-    v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres0_w, 8),
-                            _mm_slli_si128(v_xres1_w, 8));
-    if (yoffset == HALF_PIXEL_OFFSET) {
-      v_res_w = _mm_avg_epu16(v_xres0_w, v_temp_w);
-    } else {
-      v_res_w = highbd_apply_filter(v_xres0_w, v_temp_w, v_filtery_w);
-    }
-    // Load the dst data
-    v_dst_w = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
-        _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
-    // Load the mask data
-    v_msk_b = _mm_unpacklo_epi32(
-        _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
-        _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
-    // Compute the sum and SSE
-    highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
-
-    // Load the next block of src data
-    v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 4));
-    v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
-    v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 5));
-    v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
-    // Apply the x filter
-    if (xoffset == HALF_PIXEL_OFFSET) {
-      v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
-      v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
-      v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
-    } else {
-      highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w,
-                             &v_filtered0_d);
-      highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w,
-                             &v_filtered1_d);
-      v_xres0_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
-    }
-    // Apply the y filter to the previous block
-    v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres1_w, 8),
-                            _mm_slli_si128(v_xres0_w, 8));
-    if (yoffset == HALF_PIXEL_OFFSET) {
-      v_res_w = _mm_avg_epu16(v_xres1_w, v_temp_w);
-    } else {
-      v_res_w = highbd_apply_filter(v_xres1_w, v_temp_w, v_filtery_w);
-    }
-    // Load the dst data
-    v_dst_w = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)),
-        _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3)));
-    // Load the mask data
-    v_msk_b = _mm_unpacklo_epi32(
-        _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)),
-        _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3)));
-    // Compute the sum and SSE
-    highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
-    // Move onto the next set of rows
-    src += src_stride * 4;
-    dst += dst_stride * 4;
-    msk += msk_stride * 4;
-  }
-  return calc_var(v_sum_d, v_sse_q, sse, 4, h);
-}
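[The 4xH kernels above differ from the wider ones mainly in how data reaches the registers: two 4-pixel rows (4 x 16 bits = 64 bits each) are packed into a single 128-bit register so the same 8-lane arithmetic can be reused. An illustrative load helper (not a library function; the row order per register varies between the kernels above):]

#include <emmintrin.h>
#include <stdint.h>

static __m128i load_two_4wide_rows(const uint16_t *row0, const uint16_t *row1) {
  const __m128i v_r0 = _mm_loadl_epi64((const __m128i *)row0); /* 4 pixels */
  const __m128i v_r1 = _mm_loadl_epi64((const __m128i *)row1); /* 4 pixels */
  return _mm_unpacklo_epi64(v_r0, v_r1); /* row0 in bits 63:0, row1 in 127:64 */
}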
-
-// For W >=8
-#define HIGHBD_MASK_SUBPIX_VAR_LARGE(W, H)                                     \
-  unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3(              \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,           \
-      const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
-      unsigned int *sse, highbd_calc_masked_var_t calc_var,                    \
-      highbd_variance_fn_t full_variance_function) {                           \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
-    assert(W % 8 == 0);                                                        \
-    if (xoffset == 0) {                                                        \
-      if (yoffset == 0)                                                        \
-        return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \
-                                      msk_stride, sse);                        \
-      else if (yoffset == HALF_PIXEL_OFFSET)                                   \
-        return aom_highbd_masked_subpel_varWxH_xzero(                          \
-            src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk,          \
-            msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var);         \
-      else                                                                     \
-        return aom_highbd_masked_subpel_varWxH_xzero(                          \
-            src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse,   \
-            W, H, highbd_apply_filter, calc_var);                              \
-    } else if (yoffset == 0) {                                                 \
-      if (xoffset == HALF_PIXEL_OFFSET)                                        \
-        return aom_highbd_masked_subpel_varWxH_yzero(                          \
-            src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk,          \
-            msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var);         \
-      else                                                                     \
-        return aom_highbd_masked_subpel_varWxH_yzero(                          \
-            src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse,   \
-            W, H, highbd_apply_filter, calc_var);                              \
-    } else if (xoffset == HALF_PIXEL_OFFSET) {                                 \
-      if (yoffset == HALF_PIXEL_OFFSET)                                        \
-        return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero(              \
-            src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst,        \
-            dst_stride, msk, msk_stride, sse, W, H, highbd_apply_filter_avg,   \
-            highbd_apply_filter_avg, calc_var);                                \
-      else                                                                     \
-        return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero(              \
-            src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \
-            msk_stride, sse, W, H, highbd_apply_filter_avg,                    \
-            highbd_apply_filter, calc_var);                                    \
-    } else {                                                                   \
-      if (yoffset == HALF_PIXEL_OFFSET)                                        \
-        return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero(              \
-            src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
-            msk_stride, sse, W, H, highbd_apply_filter,                        \
-            highbd_apply_filter_avg, calc_var);                                \
-      else                                                                     \
-        return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero(              \
-            src, src_stride, xoffset, yoffset, dst, dst_stride, msk,           \
-            msk_stride, sse, W, H, highbd_apply_filter, highbd_apply_filter,   \
-            calc_var);                                                         \
-    }                                                                          \
-  }
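[The HALF_PIXEL_OFFSET branches above exist because, with the assumed 7-bit bilinear taps, the half-pel entry is {64, 64} and the 2-tap filter collapses to a rounded average — exactly what _mm_avg_epu16 computes per lane — so the multiply/round sequence can be skipped. The identity, as a self-checking snippet:]

#include <assert.h>

static void halfpel_identity_check(void) {
  for (int a = 0; a < 1 << 12; a += 37) {
    for (int b = 0; b < 1 << 12; b += 41) {
      /* (a*64 + b*64 + 64) >> 7  ==  (a + b + 1) >> 1 for all a, b >= 0 */
      assert(((a * 64 + b * 64 + 64) >> 7) == ((a + b + 1) >> 1));
    }
  }
}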
-
-// For W < 8
-#define HIGHBD_MASK_SUBPIX_VAR_SMALL(W, H)                                     \
-  unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3(              \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,           \
-      const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
-      unsigned int *sse, highbd_calc_masked_var_t calc_var,                    \
-      highbd_variance_fn_t full_variance_function) {                           \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
-    assert(W == 4);                                                            \
-    if (xoffset == 0 && yoffset == 0)                                          \
-      return full_variance_function(src8, src_stride, dst8, dst_stride, msk,   \
-                                    msk_stride, sse);                          \
-    else if (xoffset == 0)                                                     \
-      return aom_highbd_masked_subpel_var4xH_xzero(                            \
-          src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H,  \
-          calc_var);                                                           \
-    else if (yoffset == 0)                                                     \
-      return aom_highbd_masked_subpel_var4xH_yzero(                            \
-          src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H,  \
-          calc_var);                                                           \
-    else                                                                       \
-      return aom_highbd_masked_subpel_var4xH_xnonzero_ynonzero(                \
-          src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \
-          sse, H, calc_var);                                                   \
-  }
-
-#define HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(W, H)                                  \
-  unsigned int aom_highbd_masked_sub_pixel_variance##W##x##H##_ssse3(          \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,           \
-      const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
-      unsigned int *sse) {                                                     \
-    return highbd_masked_sub_pixel_variance##W##x##H##_ssse3(                  \
-        src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \
-        sse, calc_masked_variance,                                             \
-        aom_highbd_masked_variance##W##x##H##_ssse3);                          \
-  }                                                                            \
-  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3(       \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,           \
-      const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
-      unsigned int *sse) {                                                     \
-    return highbd_masked_sub_pixel_variance##W##x##H##_ssse3(                  \
-        src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \
-        sse, highbd_10_calc_masked_variance,                                   \
-        aom_highbd_10_masked_variance##W##x##H##_ssse3);                       \
-  }                                                                            \
-  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3(       \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,           \
-      const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
-      unsigned int *sse) {                                                     \
-    return highbd_masked_sub_pixel_variance##W##x##H##_ssse3(                  \
-        src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \
-        sse, highbd_12_calc_masked_variance,                                   \
-        aom_highbd_12_masked_variance##W##x##H##_ssse3);                       \
-  }
-
-HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 4)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 4)
-HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 8)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 8)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 4)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 4)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 8)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 8)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 16)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 16)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 8)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 8)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 16)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 16)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 32)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 32)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 16)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 16)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 32)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 32)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 64)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 64)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 32)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 32)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 64)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 64)
-#if CONFIG_EXT_PARTITION
-HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 128)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 128)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 64)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 64)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 128)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 128)
-#endif  // CONFIG_EXT_PARTITION
-#endif
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index cfa0557..5b2dcef 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -1171,33 +1171,10 @@
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
 
 #if CONFIG_EXT_INTER
-#define HIGHBD_MBFP(BT, MSDF, MVF, MSVF, MCSDF, MCVF, MCSVF) \
-  cpi->fn_ptr[BT].msdf = MSDF;                               \
-  cpi->fn_ptr[BT].mvf = MVF;                                 \
-  cpi->fn_ptr[BT].msvf = MSVF;                               \
-  cpi->fn_ptr[BT].mcsdf = MCSDF;                             \
-  cpi->fn_ptr[BT].mcvf = MCVF;                               \
+#define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
+  cpi->fn_ptr[BT].mcsdf = MCSDF;      \
   cpi->fn_ptr[BT].mcsvf = MCSVF;
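[With the msdf/mvf/msvf assignments gone, the slimmed-down macro wires up only the compound masked SAD and sub-pixel variance pointers; for example, the BLOCK_64X64 8-bit instantiation further down expands to:]

  cpi->fn_ptr[BLOCK_64X64].mcsdf = aom_highbd_masked_compound_sad64x64_bits8;
  cpi->fn_ptr[BLOCK_64X64].mcsvf =
      aom_highbd_masked_compound_sub_pixel_variance64x64;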
 
-#define MAKE_MBFP_SAD_WRAPPER(fnname)                                          \
-  static unsigned int fnname##_bits8(                                          \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
-      int ref_stride, const uint8_t *m, int m_stride) {                        \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride);   \
-  }                                                                            \
-  static unsigned int fnname##_bits10(                                         \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
-      int ref_stride, const uint8_t *m, int m_stride) {                        \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride) >> \
-           2;                                                                  \
-  }                                                                            \
-  static unsigned int fnname##_bits12(                                         \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
-      int ref_stride, const uint8_t *m, int m_stride) {                        \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride) >> \
-           4;                                                                  \
-  }
-
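[For reference, the generator removed above scaled the high-bit-depth masked SAD by >> 2 for 10-bit and >> 4 for 12-bit input so that SAD costs computed at different bit depths stay on a comparable scale; the surviving MAKE_MBFP_COMPOUND_SAD_WRAPPER follows the same convention. The scaling on its own, as a trivial sketch:]

static unsigned int sad_bits10_to_bits8(unsigned int sad) { return sad >> 2; }
static unsigned int sad_bits12_to_bits8(unsigned int sad) { return sad >> 4; }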
 #define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname)                           \
   static unsigned int fnname##_bits8(                                    \
       const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
@@ -1224,26 +1201,10 @@
   }
 
 #if CONFIG_EXT_PARTITION
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad128x128)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad128x64)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x128)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_compound_sad128x128)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_compound_sad128x64)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_compound_sad64x128)
 #endif  // CONFIG_EXT_PARTITION
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x64)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x32)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x64)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x32)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x16)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x32)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x16)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x8)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x16)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x8)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x4)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad4x8)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad4x4)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_compound_sad64x64)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_compound_sad64x32)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_compound_sad32x64)
@@ -1421,102 +1382,38 @@
 
 #if CONFIG_EXT_INTER
 #if CONFIG_EXT_PARTITION
-        HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8,
-                    aom_highbd_masked_variance128x128,
-                    aom_highbd_masked_sub_pixel_variance128x128,
-                    aom_highbd_masked_compound_sad128x128_bits8,
-                    aom_highbd_masked_compound_variance128x128,
+        HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_compound_sad128x128_bits8,
                     aom_highbd_masked_compound_sub_pixel_variance128x128)
-        HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits8,
-                    aom_highbd_masked_variance128x64,
-                    aom_highbd_masked_sub_pixel_variance128x64,
-                    aom_highbd_masked_compound_sad128x64_bits8,
-                    aom_highbd_masked_compound_variance128x64,
+        HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_compound_sad128x64_bits8,
                     aom_highbd_masked_compound_sub_pixel_variance128x64)
-        HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits8,
-                    aom_highbd_masked_variance64x128,
-                    aom_highbd_masked_sub_pixel_variance64x128,
-                    aom_highbd_masked_compound_sad64x128_bits8,
-                    aom_highbd_masked_compound_variance64x128,
+        HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_compound_sad64x128_bits8,
                     aom_highbd_masked_compound_sub_pixel_variance64x128)
 #endif  // CONFIG_EXT_PARTITION
-        HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits8,
-                    aom_highbd_masked_variance64x64,
-                    aom_highbd_masked_sub_pixel_variance64x64,
-                    aom_highbd_masked_compound_sad64x64_bits8,
-                    aom_highbd_masked_compound_variance64x64,
+        HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_compound_sad64x64_bits8,
                     aom_highbd_masked_compound_sub_pixel_variance64x64)
-        HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits8,
-                    aom_highbd_masked_variance64x32,
-                    aom_highbd_masked_sub_pixel_variance64x32,
-                    aom_highbd_masked_compound_sad64x32_bits8,
-                    aom_highbd_masked_compound_variance64x32,
+        HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_compound_sad64x32_bits8,
                     aom_highbd_masked_compound_sub_pixel_variance64x32)
-        HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits8,
-                    aom_highbd_masked_variance32x64,
-                    aom_highbd_masked_sub_pixel_variance32x64,
-                    aom_highbd_masked_compound_sad32x64_bits8,
-                    aom_highbd_masked_compound_variance32x64,
+        HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_compound_sad32x64_bits8,
                     aom_highbd_masked_compound_sub_pixel_variance32x64)
-        HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits8,
-                    aom_highbd_masked_variance32x32,
-                    aom_highbd_masked_sub_pixel_variance32x32,
-                    aom_highbd_masked_compound_sad32x32_bits8,
-                    aom_highbd_masked_compound_variance32x32,
+        HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_compound_sad32x32_bits8,
                     aom_highbd_masked_compound_sub_pixel_variance32x32)
-        HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits8,
-                    aom_highbd_masked_variance32x16,
-                    aom_highbd_masked_sub_pixel_variance32x16,
-                    aom_highbd_masked_compound_sad32x16_bits8,
-                    aom_highbd_masked_compound_variance32x16,
+        HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_compound_sad32x16_bits8,
                     aom_highbd_masked_compound_sub_pixel_variance32x16)
-        HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits8,
-                    aom_highbd_masked_variance16x32,
-                    aom_highbd_masked_sub_pixel_variance16x32,
-                    aom_highbd_masked_compound_sad16x32_bits8,
-                    aom_highbd_masked_compound_variance16x32,
+        HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_compound_sad16x32_bits8,
                     aom_highbd_masked_compound_sub_pixel_variance16x32)
-        HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits8,
-                    aom_highbd_masked_variance16x16,
-                    aom_highbd_masked_sub_pixel_variance16x16,
-                    aom_highbd_masked_compound_sad16x16_bits8,
-                    aom_highbd_masked_compound_variance16x16,
+        HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_compound_sad16x16_bits8,
                     aom_highbd_masked_compound_sub_pixel_variance16x16)
-        HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits8,
-                    aom_highbd_masked_variance8x16,
-                    aom_highbd_masked_sub_pixel_variance8x16,
-                    aom_highbd_masked_compound_sad8x16_bits8,
-                    aom_highbd_masked_compound_variance8x16,
+        HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_compound_sad8x16_bits8,
                     aom_highbd_masked_compound_sub_pixel_variance8x16)
-        HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits8,
-                    aom_highbd_masked_variance16x8,
-                    aom_highbd_masked_sub_pixel_variance16x8,
-                    aom_highbd_masked_compound_sad16x8_bits8,
-                    aom_highbd_masked_compound_variance16x8,
+        HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_compound_sad16x8_bits8,
                     aom_highbd_masked_compound_sub_pixel_variance16x8)
-        HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits8,
-                    aom_highbd_masked_variance8x8,
-                    aom_highbd_masked_sub_pixel_variance8x8,
-                    aom_highbd_masked_compound_sad8x8_bits8,
-                    aom_highbd_masked_compound_variance8x8,
+        HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_compound_sad8x8_bits8,
                     aom_highbd_masked_compound_sub_pixel_variance8x8)
-        HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits8,
-                    aom_highbd_masked_variance4x8,
-                    aom_highbd_masked_sub_pixel_variance4x8,
-                    aom_highbd_masked_compound_sad4x8_bits8,
-                    aom_highbd_masked_compound_variance4x8,
+        HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_compound_sad4x8_bits8,
                     aom_highbd_masked_compound_sub_pixel_variance4x8)
-        HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits8,
-                    aom_highbd_masked_variance8x4,
-                    aom_highbd_masked_sub_pixel_variance8x4,
-                    aom_highbd_masked_compound_sad8x4_bits8,
-                    aom_highbd_masked_compound_variance8x4,
+        HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_compound_sad8x4_bits8,
                     aom_highbd_masked_compound_sub_pixel_variance8x4)
-        HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8,
-                    aom_highbd_masked_variance4x4,
-                    aom_highbd_masked_sub_pixel_variance4x4,
-                    aom_highbd_masked_compound_sad4x4_bits8,
-                    aom_highbd_masked_compound_variance4x4,
+        HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_compound_sad4x4_bits8,
                     aom_highbd_masked_compound_sub_pixel_variance4x4)
 #endif  // CONFIG_EXT_INTER
 #if CONFIG_MOTION_VAR
@@ -1692,102 +1589,38 @@
 
 #if CONFIG_EXT_INTER
 #if CONFIG_EXT_PARTITION
-        HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10,
-                    aom_highbd_10_masked_variance128x128,
-                    aom_highbd_10_masked_sub_pixel_variance128x128,
-                    aom_highbd_masked_compound_sad128x128_bits10,
-                    aom_highbd_10_masked_compound_variance128x128,
+        HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_compound_sad128x128_bits10,
                     aom_highbd_10_masked_compound_sub_pixel_variance128x128)
-        HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits10,
-                    aom_highbd_10_masked_variance128x64,
-                    aom_highbd_10_masked_sub_pixel_variance128x64,
-                    aom_highbd_masked_compound_sad128x64_bits10,
-                    aom_highbd_10_masked_compound_variance128x64,
+        HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_compound_sad128x64_bits10,
                     aom_highbd_10_masked_compound_sub_pixel_variance128x64)
-        HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits10,
-                    aom_highbd_10_masked_variance64x128,
-                    aom_highbd_10_masked_sub_pixel_variance64x128,
-                    aom_highbd_masked_compound_sad64x128_bits10,
-                    aom_highbd_10_masked_compound_variance64x128,
+        HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_compound_sad64x128_bits10,
                     aom_highbd_10_masked_compound_sub_pixel_variance64x128)
 #endif  // CONFIG_EXT_PARTITION
-        HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits10,
-                    aom_highbd_10_masked_variance64x64,
-                    aom_highbd_10_masked_sub_pixel_variance64x64,
-                    aom_highbd_masked_compound_sad64x64_bits10,
-                    aom_highbd_10_masked_compound_variance64x64,
+        HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_compound_sad64x64_bits10,
                     aom_highbd_10_masked_compound_sub_pixel_variance64x64)
-        HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits10,
-                    aom_highbd_10_masked_variance64x32,
-                    aom_highbd_10_masked_sub_pixel_variance64x32,
-                    aom_highbd_masked_compound_sad64x32_bits10,
-                    aom_highbd_10_masked_compound_variance64x32,
+        HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_compound_sad64x32_bits10,
                     aom_highbd_10_masked_compound_sub_pixel_variance64x32)
-        HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits10,
-                    aom_highbd_10_masked_variance32x64,
-                    aom_highbd_10_masked_sub_pixel_variance32x64,
-                    aom_highbd_masked_compound_sad32x64_bits10,
-                    aom_highbd_10_masked_compound_variance32x64,
+        HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_compound_sad32x64_bits10,
                     aom_highbd_10_masked_compound_sub_pixel_variance32x64)
-        HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits10,
-                    aom_highbd_10_masked_variance32x32,
-                    aom_highbd_10_masked_sub_pixel_variance32x32,
-                    aom_highbd_masked_compound_sad32x32_bits10,
-                    aom_highbd_10_masked_compound_variance32x32,
+        HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_compound_sad32x32_bits10,
                     aom_highbd_10_masked_compound_sub_pixel_variance32x32)
-        HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits10,
-                    aom_highbd_10_masked_variance32x16,
-                    aom_highbd_10_masked_sub_pixel_variance32x16,
-                    aom_highbd_masked_compound_sad32x16_bits10,
-                    aom_highbd_10_masked_compound_variance32x16,
+        HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_compound_sad32x16_bits10,
                     aom_highbd_10_masked_compound_sub_pixel_variance32x16)
-        HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits10,
-                    aom_highbd_10_masked_variance16x32,
-                    aom_highbd_10_masked_sub_pixel_variance16x32,
-                    aom_highbd_masked_compound_sad16x32_bits10,
-                    aom_highbd_10_masked_compound_variance16x32,
+        HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_compound_sad16x32_bits10,
                     aom_highbd_10_masked_compound_sub_pixel_variance16x32)
-        HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits10,
-                    aom_highbd_10_masked_variance16x16,
-                    aom_highbd_10_masked_sub_pixel_variance16x16,
-                    aom_highbd_masked_compound_sad16x16_bits10,
-                    aom_highbd_10_masked_compound_variance16x16,
+        HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_compound_sad16x16_bits10,
                     aom_highbd_10_masked_compound_sub_pixel_variance16x16)
-        HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits10,
-                    aom_highbd_10_masked_variance8x16,
-                    aom_highbd_10_masked_sub_pixel_variance8x16,
-                    aom_highbd_masked_compound_sad8x16_bits10,
-                    aom_highbd_10_masked_compound_variance8x16,
+        HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_compound_sad8x16_bits10,
                     aom_highbd_10_masked_compound_sub_pixel_variance8x16)
-        HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits10,
-                    aom_highbd_10_masked_variance16x8,
-                    aom_highbd_10_masked_sub_pixel_variance16x8,
-                    aom_highbd_masked_compound_sad16x8_bits10,
-                    aom_highbd_10_masked_compound_variance16x8,
+        HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_compound_sad16x8_bits10,
                     aom_highbd_10_masked_compound_sub_pixel_variance16x8)
-        HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits10,
-                    aom_highbd_10_masked_variance8x8,
-                    aom_highbd_10_masked_sub_pixel_variance8x8,
-                    aom_highbd_masked_compound_sad8x8_bits10,
-                    aom_highbd_10_masked_compound_variance8x8,
+        HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_compound_sad8x8_bits10,
                     aom_highbd_10_masked_compound_sub_pixel_variance8x8)
-        HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits10,
-                    aom_highbd_10_masked_variance4x8,
-                    aom_highbd_10_masked_sub_pixel_variance4x8,
-                    aom_highbd_masked_compound_sad4x8_bits10,
-                    aom_highbd_10_masked_compound_variance4x8,
+        HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_compound_sad4x8_bits10,
                     aom_highbd_10_masked_compound_sub_pixel_variance4x8)
-        HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits10,
-                    aom_highbd_10_masked_variance8x4,
-                    aom_highbd_10_masked_sub_pixel_variance8x4,
-                    aom_highbd_masked_compound_sad8x4_bits10,
-                    aom_highbd_10_masked_compound_variance8x4,
+        HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_compound_sad8x4_bits10,
                     aom_highbd_10_masked_compound_sub_pixel_variance8x4)
-        HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10,
-                    aom_highbd_10_masked_variance4x4,
-                    aom_highbd_10_masked_sub_pixel_variance4x4,
-                    aom_highbd_masked_compound_sad4x4_bits10,
-                    aom_highbd_10_masked_compound_variance4x4,
+        HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_compound_sad4x4_bits10,
                     aom_highbd_10_masked_compound_sub_pixel_variance4x4)
 #endif  // CONFIG_EXT_INTER
 #if CONFIG_MOTION_VAR
@@ -1963,102 +1796,38 @@
 
 #if CONFIG_EXT_INTER
 #if CONFIG_EXT_PARTITION
-        HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12,
-                    aom_highbd_12_masked_variance128x128,
-                    aom_highbd_12_masked_sub_pixel_variance128x128,
-                    aom_highbd_masked_compound_sad128x128_bits12,
-                    aom_highbd_12_masked_compound_variance128x128,
+        HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_compound_sad128x128_bits12,
                     aom_highbd_12_masked_compound_sub_pixel_variance128x128)
-        HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits12,
-                    aom_highbd_12_masked_variance128x64,
-                    aom_highbd_12_masked_sub_pixel_variance128x64,
-                    aom_highbd_masked_compound_sad128x64_bits12,
-                    aom_highbd_12_masked_compound_variance128x64,
+        HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_compound_sad128x64_bits12,
                     aom_highbd_12_masked_compound_sub_pixel_variance128x64)
-        HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits12,
-                    aom_highbd_12_masked_variance64x128,
-                    aom_highbd_12_masked_sub_pixel_variance64x128,
-                    aom_highbd_masked_compound_sad64x128_bits12,
-                    aom_highbd_12_masked_compound_variance64x128,
+        HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_compound_sad64x128_bits12,
                     aom_highbd_12_masked_compound_sub_pixel_variance64x128)
 #endif  // CONFIG_EXT_PARTITION
-        HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits12,
-                    aom_highbd_12_masked_variance64x64,
-                    aom_highbd_12_masked_sub_pixel_variance64x64,
-                    aom_highbd_masked_compound_sad64x64_bits12,
-                    aom_highbd_12_masked_compound_variance64x64,
+        HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_compound_sad64x64_bits12,
                     aom_highbd_12_masked_compound_sub_pixel_variance64x64)
-        HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits12,
-                    aom_highbd_12_masked_variance64x32,
-                    aom_highbd_12_masked_sub_pixel_variance64x32,
-                    aom_highbd_masked_compound_sad64x32_bits12,
-                    aom_highbd_12_masked_compound_variance64x32,
+        HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_compound_sad64x32_bits12,
                     aom_highbd_12_masked_compound_sub_pixel_variance64x32)
-        HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits12,
-                    aom_highbd_12_masked_variance32x64,
-                    aom_highbd_12_masked_sub_pixel_variance32x64,
-                    aom_highbd_masked_compound_sad32x64_bits12,
-                    aom_highbd_12_masked_compound_variance32x64,
+        HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_compound_sad32x64_bits12,
                     aom_highbd_12_masked_compound_sub_pixel_variance32x64)
-        HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits12,
-                    aom_highbd_12_masked_variance32x32,
-                    aom_highbd_12_masked_sub_pixel_variance32x32,
-                    aom_highbd_masked_compound_sad32x32_bits12,
-                    aom_highbd_12_masked_compound_variance32x32,
+        HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_compound_sad32x32_bits12,
                     aom_highbd_12_masked_compound_sub_pixel_variance32x32)
-        HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits12,
-                    aom_highbd_12_masked_variance32x16,
-                    aom_highbd_12_masked_sub_pixel_variance32x16,
-                    aom_highbd_masked_compound_sad32x16_bits12,
-                    aom_highbd_12_masked_compound_variance32x16,
+        HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_compound_sad32x16_bits12,
                     aom_highbd_12_masked_compound_sub_pixel_variance32x16)
-        HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits12,
-                    aom_highbd_12_masked_variance16x32,
-                    aom_highbd_12_masked_sub_pixel_variance16x32,
-                    aom_highbd_masked_compound_sad16x32_bits12,
-                    aom_highbd_12_masked_compound_variance16x32,
+        HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_compound_sad16x32_bits12,
                     aom_highbd_12_masked_compound_sub_pixel_variance16x32)
-        HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits12,
-                    aom_highbd_12_masked_variance16x16,
-                    aom_highbd_12_masked_sub_pixel_variance16x16,
-                    aom_highbd_masked_compound_sad16x16_bits12,
-                    aom_highbd_12_masked_compound_variance16x16,
+        HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_compound_sad16x16_bits12,
                     aom_highbd_12_masked_compound_sub_pixel_variance16x16)
-        HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits12,
-                    aom_highbd_12_masked_variance8x16,
-                    aom_highbd_12_masked_sub_pixel_variance8x16,
-                    aom_highbd_masked_compound_sad8x16_bits12,
-                    aom_highbd_12_masked_compound_variance8x16,
+        HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_compound_sad8x16_bits12,
                     aom_highbd_12_masked_compound_sub_pixel_variance8x16)
-        HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits12,
-                    aom_highbd_12_masked_variance16x8,
-                    aom_highbd_12_masked_sub_pixel_variance16x8,
-                    aom_highbd_masked_compound_sad16x8_bits12,
-                    aom_highbd_12_masked_compound_variance16x8,
+        HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_compound_sad16x8_bits12,
                     aom_highbd_12_masked_compound_sub_pixel_variance16x8)
-        HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits12,
-                    aom_highbd_12_masked_variance8x8,
-                    aom_highbd_12_masked_sub_pixel_variance8x8,
-                    aom_highbd_masked_compound_sad8x8_bits12,
-                    aom_highbd_12_masked_compound_variance8x8,
+        HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_compound_sad8x8_bits12,
                     aom_highbd_12_masked_compound_sub_pixel_variance8x8)
-        HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits12,
-                    aom_highbd_12_masked_variance4x8,
-                    aom_highbd_12_masked_sub_pixel_variance4x8,
-                    aom_highbd_masked_compound_sad4x8_bits12,
-                    aom_highbd_12_masked_compound_variance4x8,
+        HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_compound_sad4x8_bits12,
                     aom_highbd_12_masked_compound_sub_pixel_variance4x8)
-        HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits12,
-                    aom_highbd_12_masked_variance8x4,
-                    aom_highbd_12_masked_sub_pixel_variance8x4,
-                    aom_highbd_masked_compound_sad8x4_bits12,
-                    aom_highbd_12_masked_compound_variance8x4,
+        HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_compound_sad8x4_bits12,
                     aom_highbd_12_masked_compound_sub_pixel_variance8x4)
-        HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12,
-                    aom_highbd_12_masked_variance4x4,
-                    aom_highbd_12_masked_sub_pixel_variance4x4,
-                    aom_highbd_masked_compound_sad4x4_bits12,
-                    aom_highbd_12_masked_compound_variance4x4,
+        HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_compound_sad4x4_bits12,
                     aom_highbd_12_masked_compound_sub_pixel_variance4x4)
 #endif  // CONFIG_EXT_INTER
 
@@ -2642,79 +2411,43 @@
 #endif  // CONFIG_MOTION_VAR
 
 #if CONFIG_EXT_INTER
-#define MBFP(BT, MSDF, MVF, MSVF, MCSDF, MCVF, MCSVF) \
-  cpi->fn_ptr[BT].msdf = MSDF;                        \
-  cpi->fn_ptr[BT].mvf = MVF;                          \
-  cpi->fn_ptr[BT].msvf = MSVF;                        \
-  cpi->fn_ptr[BT].mcsdf = MCSDF;                      \
-  cpi->fn_ptr[BT].mcvf = MCVF;                        \
+#define MBFP(BT, MCSDF, MCSVF)   \
+  cpi->fn_ptr[BT].mcsdf = MCSDF; \
   cpi->fn_ptr[BT].mcsvf = MCSVF;
 
 #if CONFIG_EXT_PARTITION
-  MBFP(BLOCK_128X128, aom_masked_sad128x128, aom_masked_variance128x128,
-       aom_masked_sub_pixel_variance128x128, aom_masked_compound_sad128x128,
-       aom_masked_compound_variance128x128,
+  MBFP(BLOCK_128X128, aom_masked_compound_sad128x128,
        aom_masked_compound_sub_pixel_variance128x128)
-  MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_variance128x64,
-       aom_masked_sub_pixel_variance128x64, aom_masked_compound_sad128x64,
-       aom_masked_compound_variance128x64,
+  MBFP(BLOCK_128X64, aom_masked_compound_sad128x64,
        aom_masked_compound_sub_pixel_variance128x64)
-  MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_variance64x128,
-       aom_masked_sub_pixel_variance64x128, aom_masked_compound_sad64x128,
-       aom_masked_compound_variance64x128,
+  MBFP(BLOCK_64X128, aom_masked_compound_sad64x128,
        aom_masked_compound_sub_pixel_variance64x128)
 #endif  // CONFIG_EXT_PARTITION
-  MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_variance64x64,
-       aom_masked_sub_pixel_variance64x64, aom_masked_compound_sad64x64,
-       aom_masked_compound_variance64x64,
+  MBFP(BLOCK_64X64, aom_masked_compound_sad64x64,
        aom_masked_compound_sub_pixel_variance64x64)
-  MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_variance64x32,
-       aom_masked_sub_pixel_variance64x32, aom_masked_compound_sad64x32,
-       aom_masked_compound_variance64x32,
+  MBFP(BLOCK_64X32, aom_masked_compound_sad64x32,
        aom_masked_compound_sub_pixel_variance64x32)
-  MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_variance32x64,
-       aom_masked_sub_pixel_variance32x64, aom_masked_compound_sad32x64,
-       aom_masked_compound_variance32x64,
+  MBFP(BLOCK_32X64, aom_masked_compound_sad32x64,
        aom_masked_compound_sub_pixel_variance32x64)
-  MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_variance32x32,
-       aom_masked_sub_pixel_variance32x32, aom_masked_compound_sad32x32,
-       aom_masked_compound_variance32x32,
+  MBFP(BLOCK_32X32, aom_masked_compound_sad32x32,
        aom_masked_compound_sub_pixel_variance32x32)
-  MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_variance32x16,
-       aom_masked_sub_pixel_variance32x16, aom_masked_compound_sad32x16,
-       aom_masked_compound_variance32x16,
+  MBFP(BLOCK_32X16, aom_masked_compound_sad32x16,
        aom_masked_compound_sub_pixel_variance32x16)
-  MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_variance16x32,
-       aom_masked_sub_pixel_variance16x32, aom_masked_compound_sad16x32,
-       aom_masked_compound_variance16x32,
+  MBFP(BLOCK_16X32, aom_masked_compound_sad16x32,
        aom_masked_compound_sub_pixel_variance16x32)
-  MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_variance16x16,
-       aom_masked_sub_pixel_variance16x16, aom_masked_compound_sad16x16,
-       aom_masked_compound_variance16x16,
+  MBFP(BLOCK_16X16, aom_masked_compound_sad16x16,
        aom_masked_compound_sub_pixel_variance16x16)
-  MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_variance16x8,
-       aom_masked_sub_pixel_variance16x8, aom_masked_compound_sad16x8,
-       aom_masked_compound_variance16x8,
+  MBFP(BLOCK_16X8, aom_masked_compound_sad16x8,
        aom_masked_compound_sub_pixel_variance16x8)
-  MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_variance8x16,
-       aom_masked_sub_pixel_variance8x16, aom_masked_compound_sad8x16,
-       aom_masked_compound_variance8x16,
+  MBFP(BLOCK_8X16, aom_masked_compound_sad8x16,
        aom_masked_compound_sub_pixel_variance8x16)
-  MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_variance8x8,
-       aom_masked_sub_pixel_variance8x8, aom_masked_compound_sad8x8,
-       aom_masked_compound_variance8x8,
+  MBFP(BLOCK_8X8, aom_masked_compound_sad8x8,
        aom_masked_compound_sub_pixel_variance8x8)
-  MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_variance4x8,
-       aom_masked_sub_pixel_variance4x8, aom_masked_compound_sad4x8,
-       aom_masked_compound_variance4x8,
+  MBFP(BLOCK_4X8, aom_masked_compound_sad4x8,
        aom_masked_compound_sub_pixel_variance4x8)
-  MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_variance8x4,
-       aom_masked_sub_pixel_variance8x4, aom_masked_compound_sad8x4,
-       aom_masked_compound_variance8x4,
+  MBFP(BLOCK_8X4, aom_masked_compound_sad8x4,
        aom_masked_compound_sub_pixel_variance8x4)
-  MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_variance4x4,
-       aom_masked_sub_pixel_variance4x4, aom_masked_compound_sad4x4,
-       aom_masked_compound_variance4x4,
+  MBFP(BLOCK_4X4, aom_masked_compound_sad4x4,
        aom_masked_compound_sub_pixel_variance4x4)
 #endif  // CONFIG_EXT_INTER
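
For orientation: the mcsdf entries bound above now point at the aom_masked_compound_sad* family, which blends the two predictions under a 0..64 mask before measuring distortion. The following is a minimal sketch of that idea, assuming the usual round-to-nearest 6-bit blend and a contiguous second_pred buffer; it is not the actual aom_masked_compound_sad*_c code, and width/height are explicit parameters here instead of being baked into per-block-size wrappers.

#include <stdint.h>
#include <stdlib.h>

/* Illustrative sketch only: not the actual aom_masked_compound_sad*_c.
 * Assumes 0..64 mask weights with a round-to-nearest 6-bit blend;
 * invert_mask swaps which prediction receives the mask weight. */
static unsigned int masked_compound_sad_sketch(
    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
    const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
    int invert_mask, int width, int height) {
  unsigned int sad = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      const int m = invert_mask ? 64 - msk[r * msk_stride + c]
                                : msk[r * msk_stride + c];
      /* Weight m goes to ref, (64 - m) to second_pred (stride == width). */
      const int comp = (m * ref[r * ref_stride + c] +
                        (64 - m) * second_pred[r * width + c] + 32) >> 6;
      sad += abs(comp - src[r * src_stride + c]);
    }
  }
  return sad;
}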
 
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index cbdfc8f..b727739 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -2524,598 +2524,6 @@
   return var;
 }
 
-#if CONFIG_EXT_INTER
-/* returns subpixel variance error function */
-#define DIST(r, c)                                                         \
-  vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, src_stride, \
-            mask, mask_stride, &sse)
-
-/* checks if (r, c) has better score than previous best */
-
-#define MVC(r, c)                                                              \
-  (unsigned int)(mvcost                                                        \
-                     ? ((mvjcost[((r) != rr) * 2 + ((c) != rc)] +              \
-                         mvcost[0][((r)-rr)] + (int64_t)mvcost[1][((c)-rc)]) * \
-                            error_per_bit +                                    \
-                        4096) >>                                               \
-                           13                                                  \
-                     : 0)
-
-#define CHECK_BETTER(v, r, c)                             \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
-    thismse = (DIST(r, c));                               \
-    if ((v = MVC(r, c) + thismse) < besterr) {            \
-      besterr = v;                                        \
-      br = r;                                             \
-      bc = c;                                             \
-      *distortion = thismse;                              \
-      *sse1 = sse;                                        \
-    }                                                     \
-  } else {                                                \
-    v = INT_MAX;                                          \
-  }
-
-#undef CHECK_BETTER0
-#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
-
-#undef CHECK_BETTER1
-#define CHECK_BETTER1(v, r, c)                                                 \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                      \
-    thismse = upsampled_masked_pref_error(xd, mask, mask_stride, vfp, z,       \
-                                          src_stride, upre(y, y_stride, r, c), \
-                                          y_stride, w, h, &sse);               \
-    if ((v = MVC(r, c) + thismse) < besterr) {                                 \
-      besterr = v;                                                             \
-      br = r;                                                                  \
-      bc = c;                                                                  \
-      *distortion = thismse;                                                   \
-      *sse1 = sse;                                                             \
-    }                                                                          \
-  } else {                                                                     \
-    v = INT_MAX;                                                               \
-  }
-
-int av1_find_best_masked_sub_pixel_tree(
-    const MACROBLOCK *x, const uint8_t *mask, int mask_stride, MV *bestmv,
-    const MV *ref_mv, int allow_hp, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
-    int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
-    int is_second) {
-  const uint8_t *const z = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  const MACROBLOCKD *xd = &x->e_mbd;
-  unsigned int besterr = INT_MAX;
-  unsigned int sse;
-  int thismse;
-  unsigned int whichdir;
-  unsigned int halfiters = iters_per_step;
-  unsigned int quarteriters = iters_per_step;
-  unsigned int eighthiters = iters_per_step;
-
-  const int y_stride = xd->plane[0].pre[is_second].stride;
-  const int offset = bestmv->row * y_stride + bestmv->col;
-  const uint8_t *const y = xd->plane[0].pre[is_second].buf;
-
-  int rr = ref_mv->row;
-  int rc = ref_mv->col;
-  int br = bestmv->row * 8;
-  int bc = bestmv->col * 8;
-  int hstep = 4;
-  int tr = br;
-  int tc = bc;
-  int minc, maxc, minr, maxr;
-
-  av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
-                                 ref_mv);
-
-  // central mv
-  bestmv->row *= 8;
-  bestmv->col *= 8;
-
-  // calculate central point error
-  besterr =
-      vfp->mvf(y + offset, y_stride, z, src_stride, mask, mask_stride, sse1);
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-
-  // 1/2 pel
-  FIRST_LEVEL_CHECKS;
-  if (halfiters > 1) {
-    SECOND_LEVEL_CHECKS;
-  }
-  tr = br;
-  tc = bc;
-
-  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
-  if (forced_stop != 2) {
-    hstep >>= 1;
-    FIRST_LEVEL_CHECKS;
-    if (quarteriters > 1) {
-      SECOND_LEVEL_CHECKS;
-    }
-    tr = br;
-    tc = bc;
-  }
-
-  if (allow_hp && forced_stop == 0) {
-    hstep >>= 1;
-    FIRST_LEVEL_CHECKS;
-    if (eighthiters > 1) {
-      SECOND_LEVEL_CHECKS;
-    }
-    tr = br;
-    tc = bc;
-  }
-  // These lines insure static analysis doesn't warn that
-  // tr and tc aren't used after the above point.
-  (void)tr;
-  (void)tc;
-
-  bestmv->row = br;
-  bestmv->col = bc;
-
-  return besterr;
-}
-
-static unsigned int setup_masked_center_error(
-    const uint8_t *mask, int mask_stride, const MV *bestmv, const MV *ref_mv,
-    int error_per_bit, const aom_variance_fn_ptr_t *vfp,
-    const uint8_t *const src, const int src_stride, const uint8_t *const y,
-    int y_stride, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
-    int *distortion) {
-  unsigned int besterr;
-  besterr =
-      vfp->mvf(y + offset, y_stride, src, src_stride, mask, mask_stride, sse1);
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-  return besterr;
-}
-
-static int upsampled_masked_pref_error(const MACROBLOCKD *xd,
-                                       const uint8_t *mask, int mask_stride,
-                                       const aom_variance_fn_ptr_t *vfp,
-                                       const uint8_t *const src,
-                                       const int src_stride,
-                                       const uint8_t *const y, int y_stride,
-                                       int w, int h, unsigned int *sse) {
-  unsigned int besterr;
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
-    aom_highbd_upsampled_pred(pred16, w, h, y, y_stride);
-
-    besterr = vfp->mvf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, mask,
-                       mask_stride, sse);
-  } else {
-    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
-#else
-  DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
-  (void)xd;
-#endif  // CONFIG_HIGHBITDEPTH
-    aom_upsampled_pred(pred, w, h, y, y_stride);
-
-    besterr = vfp->mvf(pred, w, src, src_stride, mask, mask_stride, sse);
-#if CONFIG_HIGHBITDEPTH
-  }
-#endif
-  return besterr;
-}
-
-static unsigned int upsampled_setup_masked_center_error(
-    const MACROBLOCKD *xd, const uint8_t *mask, int mask_stride,
-    const MV *bestmv, const MV *ref_mv, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, const uint8_t *const src,
-    const int src_stride, const uint8_t *const y, int y_stride, int w, int h,
-    int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
-    int *distortion) {
-  unsigned int besterr =
-      upsampled_masked_pref_error(xd, mask, mask_stride, vfp, src, src_stride,
-                                  y + offset, y_stride, w, h, sse1);
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-  return besterr;
-}
-
-int av1_find_best_masked_sub_pixel_tree_up(
-    const AV1_COMP *cpi, MACROBLOCK *x, const uint8_t *mask, int mask_stride,
-    int mi_row, int mi_col, MV *bestmv, const MV *ref_mv, int allow_hp,
-    int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop,
-    int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, int is_second, int use_upsampled_ref) {
-  const uint8_t *const z = x->plane[0].src.buf;
-  const uint8_t *const src_address = z;
-  const int src_stride = x->plane[0].src.stride;
-  MACROBLOCKD *xd = &x->e_mbd;
-  struct macroblockd_plane *const pd = &xd->plane[0];
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  unsigned int besterr = INT_MAX;
-  unsigned int sse;
-  unsigned int thismse;
-
-  int rr = ref_mv->row;
-  int rc = ref_mv->col;
-  int br = bestmv->row * 8;
-  int bc = bestmv->col * 8;
-  int hstep = 4;
-  int iter;
-  int round = 3 - forced_stop;
-  int tr = br;
-  int tc = bc;
-  const MV *search_step = search_step_table;
-  int idx, best_idx = -1;
-  unsigned int cost_array[5];
-  int kr, kc;
-  const int w = block_size_wide[mbmi->sb_type];
-  const int h = block_size_high[mbmi->sb_type];
-  int offset;
-  int y_stride;
-  const uint8_t *y;
-
-  const struct buf_2d backup_pred = pd->pre[is_second];
-  int minc, maxc, minr, maxr;
-
-  av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
-                                 ref_mv);
-
-  if (use_upsampled_ref) {
-    int ref = xd->mi[0]->mbmi.ref_frame[is_second];
-    const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
-    setup_pred_plane(&pd->pre[is_second], mbmi->sb_type,
-                     upsampled_ref->y_buffer, upsampled_ref->y_crop_width,
-                     upsampled_ref->y_crop_height, upsampled_ref->y_stride,
-                     (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x,
-                     pd->subsampling_y);
-  }
-  y = pd->pre[is_second].buf;
-  y_stride = pd->pre[is_second].stride;
-  offset = bestmv->row * y_stride + bestmv->col;
-
-  if (!allow_hp)
-    if (round == 3) round = 2;
-
-  bestmv->row *= 8;
-  bestmv->col *= 8;
-
-  // use_upsampled_ref can be 0 or 1
-  if (use_upsampled_ref)
-    besterr = upsampled_setup_masked_center_error(
-        xd, mask, mask_stride, bestmv, ref_mv, error_per_bit, vfp, z,
-        src_stride, y, y_stride, w, h, (offset * 8), mvjcost, mvcost, sse1,
-        distortion);
-  else
-    besterr = setup_masked_center_error(
-        mask, mask_stride, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y,
-        y_stride, offset, mvjcost, mvcost, sse1, distortion);
-
-  for (iter = 0; iter < round; ++iter) {
-    // Check vertical and horizontal sub-pixel positions.
-    for (idx = 0; idx < 4; ++idx) {
-      tr = br + search_step[idx].row;
-      tc = bc + search_step[idx].col;
-      if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
-        MV this_mv = { tr, tc };
-
-        if (use_upsampled_ref) {
-          const uint8_t *const pre_address = y + tr * y_stride + tc;
-
-          thismse = upsampled_masked_pref_error(
-              xd, mask, mask_stride, vfp, src_address, src_stride, pre_address,
-              y_stride, w, h, &sse);
-        } else {
-          const uint8_t *const pre_address =
-              y + (tr >> 3) * y_stride + (tc >> 3);
-          thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr),
-                              src_address, src_stride, mask, mask_stride, &sse);
-        }
-
-        cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
-                                                mvcost, error_per_bit);
-
-        if (cost_array[idx] < besterr) {
-          best_idx = idx;
-          besterr = cost_array[idx];
-          *distortion = thismse;
-          *sse1 = sse;
-        }
-      } else {
-        cost_array[idx] = INT_MAX;
-      }
-    }
-
-    // Check diagonal sub-pixel position
-    kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
-    kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
-
-    tc = bc + kc;
-    tr = br + kr;
-    if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
-      MV this_mv = { tr, tc };
-
-      if (use_upsampled_ref) {
-        const uint8_t *const pre_address = y + tr * y_stride + tc;
-
-        thismse = upsampled_masked_pref_error(
-            xd, mask, mask_stride, vfp, src_address, src_stride, pre_address,
-            y_stride, w, h, &sse);
-      } else {
-        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
-
-        thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr), src_address,
-                            src_stride, mask, mask_stride, &sse);
-      }
-
-      cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
-                                            error_per_bit);
-
-      if (cost_array[4] < besterr) {
-        best_idx = 4;
-        besterr = cost_array[4];
-        *distortion = thismse;
-        *sse1 = sse;
-      }
-    } else {
-      cost_array[idx] = INT_MAX;
-    }
-
-    if (best_idx < 4 && best_idx >= 0) {
-      br += search_step[best_idx].row;
-      bc += search_step[best_idx].col;
-    } else if (best_idx == 4) {
-      br = tr;
-      bc = tc;
-    }
-
-    if (iters_per_step > 1 && best_idx != -1) {
-      if (use_upsampled_ref) {
-        SECOND_LEVEL_CHECKS_BEST(1);
-      } else {
-        SECOND_LEVEL_CHECKS_BEST(0);
-      }
-    }
-
-    tr = br;
-    tc = bc;
-
-    search_step += 4;
-    hstep >>= 1;
-    best_idx = -1;
-  }
-
-  // These lines insure static analysis doesn't warn that
-  // tr and tc aren't used after the above point.
-  (void)tr;
-  (void)tc;
-
-  bestmv->row = br;
-  bestmv->col = bc;
-
-  if (use_upsampled_ref) {
-    pd->pre[is_second] = backup_pred;
-  }
-
-  return besterr;
-}
-
-#undef DIST
-#undef MVC
-#undef CHECK_BETTER
-
-static int get_masked_mvpred_var(const MACROBLOCK *x, const uint8_t *mask,
-                                 int mask_stride, const MV *best_mv,
-                                 const MV *center_mv,
-                                 const aom_variance_fn_ptr_t *vfp,
-                                 int use_mvcost, int is_second) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
-  const MV mv = { best_mv->row * 8, best_mv->col * 8 };
-  unsigned int unused;
-
-  return vfp->mvf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
-                  in_what->stride, mask, mask_stride, &unused) +
-         (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
-                                   x->errorperbit)
-                     : 0);
-}
-
-int masked_refining_search_sad(const MACROBLOCK *x, const uint8_t *mask,
-                               int mask_stride, MV *ref_mv, int error_per_bit,
-                               int search_range,
-                               const aom_variance_fn_ptr_t *fn_ptr,
-                               const MV *center_mv, int is_second) {
-  const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
-  const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
-  unsigned int best_sad =
-      fn_ptr->msdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
-                   in_what->stride, mask, mask_stride) +
-      mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
-  int i, j;
-
-  for (i = 0; i < search_range; i++) {
-    int best_site = -1;
-
-    for (j = 0; j < 4; j++) {
-      const MV mv = { ref_mv->row + neighbors[j].row,
-                      ref_mv->col + neighbors[j].col };
-      if (is_mv_in(&x->mv_limits, &mv)) {
-        unsigned int sad =
-            fn_ptr->msdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
-                         in_what->stride, mask, mask_stride);
-        if (sad < best_sad) {
-          sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
-          if (sad < best_sad) {
-            best_sad = sad;
-            best_site = j;
-          }
-        }
-      }
-    }
-
-    if (best_site == -1) {
-      break;
-    } else {
-      ref_mv->row += neighbors[best_site].row;
-      ref_mv->col += neighbors[best_site].col;
-    }
-  }
-  return best_sad;
-}
-
-int masked_diamond_search_sad(const MACROBLOCK *x,
-                              const search_site_config *cfg,
-                              const uint8_t *mask, int mask_stride, MV *ref_mv,
-                              MV *best_mv, int search_param, int sad_per_bit,
-                              int *num00, const aom_variance_fn_ptr_t *fn_ptr,
-                              const MV *center_mv, int is_second) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
-  // search_param determines the length of the initial step and hence the number
-  // of iterations
-  // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
-  // (MAX_FIRST_STEP/4) pel... etc.
-  const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
-  const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
-  const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
-  const uint8_t *best_address, *in_what_ref;
-  int best_sad = INT_MAX;
-  int best_site = 0;
-  int last_site = 0;
-  int i, j, step;
-
-  clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
-           x->mv_limits.row_min, x->mv_limits.row_max);
-  in_what_ref = get_buf_from_mv(in_what, ref_mv);
-  best_address = in_what_ref;
-  *num00 = 0;
-  *best_mv = *ref_mv;
-
-  // Check the starting position
-  best_sad = fn_ptr->msdf(what->buf, what->stride, best_address,
-                          in_what->stride, mask, mask_stride) +
-             mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
-
-  i = 1;
-
-  for (step = 0; step < tot_steps; step++) {
-    for (j = 0; j < cfg->searches_per_step; j++) {
-      const MV mv = { best_mv->row + ss[i].mv.row,
-                      best_mv->col + ss[i].mv.col };
-      if (is_mv_in(&x->mv_limits, &mv)) {
-        int sad =
-            fn_ptr->msdf(what->buf, what->stride, best_address + ss[i].offset,
-                         in_what->stride, mask, mask_stride);
-        if (sad < best_sad) {
-          sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-          if (sad < best_sad) {
-            best_sad = sad;
-            best_site = i;
-          }
-        }
-      }
-
-      i++;
-    }
-
-    if (best_site != last_site) {
-      best_mv->row += ss[best_site].mv.row;
-      best_mv->col += ss[best_site].mv.col;
-      best_address += ss[best_site].offset;
-      last_site = best_site;
-#if defined(NEW_DIAMOND_SEARCH)
-      while (1) {
-        const MV this_mv = { best_mv->row + ss[best_site].mv.row,
-                             best_mv->col + ss[best_site].mv.col };
-        if (is_mv_in(&x->mv_limits, &this_mv)) {
-          int sad = fn_ptr->msdf(what->buf, what->stride,
-                                 best_address + ss[best_site].offset,
-                                 in_what->stride, mask, mask_stride);
-          if (sad < best_sad) {
-            sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
-            if (sad < best_sad) {
-              best_sad = sad;
-              best_mv->row += ss[best_site].mv.row;
-              best_mv->col += ss[best_site].mv.col;
-              best_address += ss[best_site].offset;
-              continue;
-            }
-          }
-        }
-        break;
-      }
-#endif
-    } else if (best_address == in_what_ref) {
-      (*num00)++;
-    }
-  }
-  return best_sad;
-}
-
-int av1_masked_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
-                                  const uint8_t *mask, int mask_stride,
-                                  MV *mvp_full, int step_param, int sadpb,
-                                  int further_steps, int do_refine,
-                                  const aom_variance_fn_ptr_t *fn_ptr,
-                                  const MV *ref_mv, MV *dst_mv, int is_second) {
-  MV temp_mv;
-  int thissme, n, num00 = 0;
-  int bestsme = masked_diamond_search_sad(x, &cpi->ss_cfg, mask, mask_stride,
-                                          mvp_full, &temp_mv, step_param, sadpb,
-                                          &n, fn_ptr, ref_mv, is_second);
-  if (bestsme < INT_MAX)
-    bestsme = get_masked_mvpred_var(x, mask, mask_stride, &temp_mv, ref_mv,
-                                    fn_ptr, 1, is_second);
-  *dst_mv = temp_mv;
-
-  // If there won't be more n-step search, check to see if refining search is
-  // needed.
-  if (n > further_steps) do_refine = 0;
-
-  while (n < further_steps) {
-    ++n;
-
-    if (num00) {
-      num00--;
-    } else {
-      thissme = masked_diamond_search_sad(
-          x, &cpi->ss_cfg, mask, mask_stride, mvp_full, &temp_mv,
-          step_param + n, sadpb, &num00, fn_ptr, ref_mv, is_second);
-      if (thissme < INT_MAX)
-        thissme = get_masked_mvpred_var(x, mask, mask_stride, &temp_mv, ref_mv,
-                                        fn_ptr, 1, is_second);
-
-      // check to see if refining search is needed.
-      if (num00 > further_steps - n) do_refine = 0;
-
-      if (thissme < bestsme) {
-        bestsme = thissme;
-        *dst_mv = temp_mv;
-      }
-    }
-  }
-
-  // final 1-away diamond refining search
-  if (do_refine) {
-    const int search_range = 8;
-    MV best_mv = *dst_mv;
-    thissme =
-        masked_refining_search_sad(x, mask, mask_stride, &best_mv, sadpb,
-                                   search_range, fn_ptr, ref_mv, is_second);
-    if (thissme < INT_MAX)
-      thissme = get_masked_mvpred_var(x, mask, mask_stride, &best_mv, ref_mv,
-                                      fn_ptr, 1, is_second);
-    if (thissme < bestsme) {
-      bestsme = thissme;
-      *dst_mv = best_mv;
-    }
-  }
-  return bestsme;
-}
-#endif  // CONFIG_EXT_INTER
-
 #if CONFIG_MOTION_VAR
 /* returns subpixel variance error function */
 #define DIST(r, c) \
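
For reference, the MVC macro deleted above computed the motion-vector rate cost that gets added to each candidate's distortion. Written out as a plain function it is just a joint-plus-component table lookup scaled by error_per_bit with fixed-point rounding; the sketch below mirrors the macro's captured variables and is illustrative only, not a drop-in libaom helper.

#include <stdint.h>

/* Sketch of the rate cost the removed MVC macro computed for a candidate
 * (r, c) against the reference (rr, rc). Illustrative only. */
static unsigned int mvc_cost_sketch(int r, int c, int rr, int rc,
                                    const int *mvjcost, int *const mvcost[2],
                                    int error_per_bit) {
  if (!mvcost) return 0;
  /* Joint cost keyed on which components differ from the reference, plus
   * per-component costs; mvcost[0]/mvcost[1] are assumed to point at the
   * centre of their tables so negative deltas are valid indices. */
  const int64_t rate = mvjcost[(r != rr) * 2 + (c != rc)] +
                       mvcost[0][r - rr] + (int64_t)mvcost[1][c - rc];
  /* Scale the rate by error_per_bit; the "+ 4096, >> 13" pair rounds the
   * fixed-point product to nearest. */
  return (unsigned int)((rate * error_per_bit + 4096) >> 13);
}
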
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index eb989e8..9ed0817 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -136,27 +136,6 @@
                           int error_per_bit, int *cost_list, const MV *ref_mv,
                           int var_max, int rd);
 
-#if CONFIG_EXT_INTER
-int av1_find_best_masked_sub_pixel_tree(
-    const MACROBLOCK *x, const uint8_t *mask, int mask_stride, MV *bestmv,
-    const MV *ref_mv, int allow_hp, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
-    int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
-    int is_second);
-int av1_find_best_masked_sub_pixel_tree_up(
-    const struct AV1_COMP *cpi, MACROBLOCK *x, const uint8_t *mask,
-    int mask_stride, int mi_row, int mi_col, MV *bestmv, const MV *ref_mv,
-    int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp,
-    int forced_stop, int iters_per_step, int *mvjcost, int *mvcost[2],
-    int *distortion, unsigned int *sse1, int is_second, int use_upsampled_ref);
-int av1_masked_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
-                                  const uint8_t *mask, int mask_stride,
-                                  MV *mvp_full, int step_param, int sadpb,
-                                  int further_steps, int do_refine,
-                                  const aom_variance_fn_ptr_t *fn_ptr,
-                                  const MV *ref_mv, MV *dst_mv, int is_second);
-#endif  // CONFIG_EXT_INTER
-
 #if CONFIG_MOTION_VAR
 int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
                                 MV *mvp_full, int step_param, int sadpb,
diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc
index 53f85ee..c0b6eb2 100644
--- a/test/masked_sad_test.cc
+++ b/test/masked_sad_test.cc
@@ -27,9 +27,11 @@
 namespace {
 const int number_of_iterations = 500;
 
-typedef unsigned int (*MaskedSADFunc)(const uint8_t *a, int a_stride,
-                                      const uint8_t *b, int b_stride,
-                                      const uint8_t *m, int m_stride);
+typedef unsigned int (*MaskedSADFunc)(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride,
+                                      const uint8_t *second_pred,
+                                      const uint8_t *msk, int msk_stride,
+                                      int invert_mask);
 typedef std::tr1::tuple<MaskedSADFunc, MaskedSADFunc> MaskedSADParam;
 
 class MaskedSADTest : public ::testing::TestWithParam<MaskedSADParam> {
@@ -52,6 +54,7 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
   DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t, second_pred_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
   DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
   int err_count = 0;
   int first_failure = -1;
@@ -62,18 +65,23 @@
     for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
       src_ptr[j] = rnd.Rand8();
       ref_ptr[j] = rnd.Rand8();
+      second_pred_ptr[j] = rnd.Rand8();
       msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? rnd.Rand8() & 0x3f : 64;
       assert(msk_ptr[j] <= 64);
     }
 
-    ref_ret = ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride,
-                                msk_ptr, msk_stride);
-    ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src_ptr, src_stride, ref_ptr,
-                                                 ref_stride, msk_ptr,
-                                                 msk_stride));
-    if (ret != ref_ret) {
-      err_count++;
-      if (first_failure == -1) first_failure = i;
+    for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+      ref_ret =
+          ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride,
+                            second_pred_ptr, msk_ptr, msk_stride, invert_mask);
+      ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src_ptr, src_stride, ref_ptr,
+                                                   ref_stride, second_pred_ptr,
+                                                   msk_ptr, msk_stride,
+                                                   invert_mask));
+      if (ret != ref_ret) {
+        err_count++;
+        if (first_failure == -1) first_failure = i;
+      }
     }
   }
   EXPECT_EQ(0, err_count)
@@ -82,9 +90,11 @@
 }
 
 #if CONFIG_HIGHBITDEPTH
-typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *a, int a_stride,
-                                            const uint8_t *b, int b_stride,
-                                            const uint8_t *m, int m_stride);
+typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *src, int src_stride,
+                                            const uint8_t *ref, int ref_stride,
+                                            const uint8_t *second_pred,
+                                            const uint8_t *msk, int msk_stride,
+                                            int invert_mask);
 typedef std::tr1::tuple<HighbdMaskedSADFunc, HighbdMaskedSADFunc>
     HighbdMaskedSADParam;
 
@@ -109,9 +119,11 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
   DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t, second_pred_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
   DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
   uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
   uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+  uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr);
   int err_count = 0;
   int first_failure = -1;
   int src_stride = MAX_SB_SIZE;
@@ -121,17 +133,22 @@
     for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
       src_ptr[j] = rnd.Rand16() & 0xfff;
       ref_ptr[j] = rnd.Rand16() & 0xfff;
+      second_pred_ptr[j] = rnd.Rand16() & 0xfff;
       msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? rnd.Rand8() & 0x3f : 64;
     }
 
-    ref_ret = ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
-                                msk_ptr, msk_stride);
-    ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride, ref8_ptr,
-                                                 ref_stride, msk_ptr,
-                                                 msk_stride));
-    if (ret != ref_ret) {
-      err_count++;
-      if (first_failure == -1) first_failure = i;
+    for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+      ref_ret =
+          ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
+                            second_pred8_ptr, msk_ptr, msk_stride, invert_mask);
+      ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride,
+                                                   ref8_ptr, ref_stride,
+                                                   second_pred8_ptr, msk_ptr,
+                                                   msk_stride, invert_mask));
+      if (ret != ref_ret) {
+        err_count++;
+        if (first_failure == -1) first_failure = i;
+      }
     }
   }
   EXPECT_EQ(0, err_count)
@@ -142,65 +159,83 @@
 
 using std::tr1::make_tuple;
 
-#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
-    SSSE3_C_COMPARE, MaskedSADTest,
-    ::testing::Values(
-#if CONFIG_EXT_PARTITION
-        make_tuple(&aom_masked_sad128x128_ssse3, &aom_masked_sad128x128_c),
-        make_tuple(&aom_masked_sad128x64_ssse3, &aom_masked_sad128x64_c),
-        make_tuple(&aom_masked_sad64x128_ssse3, &aom_masked_sad64x128_c),
-#endif  // CONFIG_EXT_PARTITION
-        make_tuple(&aom_masked_sad64x64_ssse3, &aom_masked_sad64x64_c),
-        make_tuple(&aom_masked_sad64x32_ssse3, &aom_masked_sad64x32_c),
-        make_tuple(&aom_masked_sad32x64_ssse3, &aom_masked_sad32x64_c),
-        make_tuple(&aom_masked_sad32x32_ssse3, &aom_masked_sad32x32_c),
-        make_tuple(&aom_masked_sad32x16_ssse3, &aom_masked_sad32x16_c),
-        make_tuple(&aom_masked_sad16x32_ssse3, &aom_masked_sad16x32_c),
-        make_tuple(&aom_masked_sad16x16_ssse3, &aom_masked_sad16x16_c),
-        make_tuple(&aom_masked_sad16x8_ssse3, &aom_masked_sad16x8_c),
-        make_tuple(&aom_masked_sad8x16_ssse3, &aom_masked_sad8x16_c),
-        make_tuple(&aom_masked_sad8x8_ssse3, &aom_masked_sad8x8_c),
-        make_tuple(&aom_masked_sad8x4_ssse3, &aom_masked_sad8x4_c),
-        make_tuple(&aom_masked_sad4x8_ssse3, &aom_masked_sad4x8_c),
-        make_tuple(&aom_masked_sad4x4_ssse3, &aom_masked_sad4x4_c)));
-#if CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, HighbdMaskedSADTest,
+// TODO(david.barker): Re-enable this once we have vectorized
+// versions of the masked_compound_* functions
+#if 0 && HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, MaskedSADTest,
                         ::testing::Values(
 #if CONFIG_EXT_PARTITION
-                            make_tuple(&aom_highbd_masked_sad128x128_ssse3,
-                                       &aom_highbd_masked_sad128x128_c),
-                            make_tuple(&aom_highbd_masked_sad128x64_ssse3,
-                                       &aom_highbd_masked_sad128x64_c),
-                            make_tuple(&aom_highbd_masked_sad64x128_ssse3,
-                                       &aom_highbd_masked_sad64x128_c),
+                            make_tuple(&aom_masked_compound_sad128x128_ssse3,
+                                       &aom_masked_compound_sad128x128_c),
+                            make_tuple(&aom_masked_compound_sad128x64_ssse3,
+                                       &aom_masked_compound_sad128x64_c),
+                            make_tuple(&aom_masked_compound_sad64x128_ssse3,
+                                       &aom_masked_compound_sad64x128_c),
 #endif  // CONFIG_EXT_PARTITION
-                            make_tuple(&aom_highbd_masked_sad64x64_ssse3,
-                                       &aom_highbd_masked_sad64x64_c),
-                            make_tuple(&aom_highbd_masked_sad64x32_ssse3,
-                                       &aom_highbd_masked_sad64x32_c),
-                            make_tuple(&aom_highbd_masked_sad32x64_ssse3,
-                                       &aom_highbd_masked_sad32x64_c),
-                            make_tuple(&aom_highbd_masked_sad32x32_ssse3,
-                                       &aom_highbd_masked_sad32x32_c),
-                            make_tuple(&aom_highbd_masked_sad32x16_ssse3,
-                                       &aom_highbd_masked_sad32x16_c),
-                            make_tuple(&aom_highbd_masked_sad16x32_ssse3,
-                                       &aom_highbd_masked_sad16x32_c),
-                            make_tuple(&aom_highbd_masked_sad16x16_ssse3,
-                                       &aom_highbd_masked_sad16x16_c),
-                            make_tuple(&aom_highbd_masked_sad16x8_ssse3,
-                                       &aom_highbd_masked_sad16x8_c),
-                            make_tuple(&aom_highbd_masked_sad8x16_ssse3,
-                                       &aom_highbd_masked_sad8x16_c),
-                            make_tuple(&aom_highbd_masked_sad8x8_ssse3,
-                                       &aom_highbd_masked_sad8x8_c),
-                            make_tuple(&aom_highbd_masked_sad8x4_ssse3,
-                                       &aom_highbd_masked_sad8x4_c),
-                            make_tuple(&aom_highbd_masked_sad4x8_ssse3,
-                                       &aom_highbd_masked_sad4x8_c),
-                            make_tuple(&aom_highbd_masked_sad4x4_ssse3,
-                                       &aom_highbd_masked_sad4x4_c)));
+                            make_tuple(&aom_masked_compound_sad64x64_ssse3,
+                                       &aom_masked_compound_sad64x64_c),
+                            make_tuple(&aom_masked_compound_sad64x32_ssse3,
+                                       &aom_masked_compound_sad64x32_c),
+                            make_tuple(&aom_masked_compound_sad32x64_ssse3,
+                                       &aom_masked_compound_sad32x64_c),
+                            make_tuple(&aom_masked_compound_sad32x32_ssse3,
+                                       &aom_masked_compound_sad32x32_c),
+                            make_tuple(&aom_masked_compound_sad32x16_ssse3,
+                                       &aom_masked_compound_sad32x16_c),
+                            make_tuple(&aom_masked_compound_sad16x32_ssse3,
+                                       &aom_masked_compound_sad16x32_c),
+                            make_tuple(&aom_masked_compound_sad16x16_ssse3,
+                                       &aom_masked_compound_sad16x16_c),
+                            make_tuple(&aom_masked_compound_sad16x8_ssse3,
+                                       &aom_masked_compound_sad16x8_c),
+                            make_tuple(&aom_masked_compound_sad8x16_ssse3,
+                                       &aom_masked_compound_sad8x16_c),
+                            make_tuple(&aom_masked_compound_sad8x8_ssse3,
+                                       &aom_masked_compound_sad8x8_c),
+                            make_tuple(&aom_masked_compound_sad8x4_ssse3,
+                                       &aom_masked_compound_sad8x4_c),
+                            make_tuple(&aom_masked_compound_sad4x8_ssse3,
+                                       &aom_masked_compound_sad4x8_c),
+                            make_tuple(&aom_masked_compound_sad4x4_ssse3,
+                                       &aom_masked_compound_sad4x4_c)));
+#if CONFIG_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSSE3_C_COMPARE, HighbdMaskedSADTest,
+    ::testing::Values(
+#if CONFIG_EXT_PARTITION
+        make_tuple(&aom_highbd_masked_compound_sad128x128_ssse3,
+                   &aom_highbd_masked_compound_sad128x128_c),
+        make_tuple(&aom_highbd_masked_compound_sad128x64_ssse3,
+                   &aom_highbd_masked_compound_sad128x64_c),
+        make_tuple(&aom_highbd_masked_compound_sad64x128_ssse3,
+                   &aom_highbd_masked_compound_sad64x128_c),
+#endif  // CONFIG_EXT_PARTITION
+        make_tuple(&aom_highbd_masked_compound_sad64x64_ssse3,
+                   &aom_highbd_masked_compound_sad64x64_c),
+        make_tuple(&aom_highbd_masked_compound_sad64x32_ssse3,
+                   &aom_highbd_masked_compound_sad64x32_c),
+        make_tuple(&aom_highbd_masked_compound_sad32x64_ssse3,
+                   &aom_highbd_masked_compound_sad32x64_c),
+        make_tuple(&aom_highbd_masked_compound_sad32x32_ssse3,
+                   &aom_highbd_masked_compound_sad32x32_c),
+        make_tuple(&aom_highbd_masked_compound_sad32x16_ssse3,
+                   &aom_highbd_masked_compound_sad32x16_c),
+        make_tuple(&aom_highbd_masked_compound_sad16x32_ssse3,
+                   &aom_highbd_masked_compound_sad16x32_c),
+        make_tuple(&aom_highbd_masked_compound_sad16x16_ssse3,
+                   &aom_highbd_masked_compound_sad16x16_c),
+        make_tuple(&aom_highbd_masked_compound_sad16x8_ssse3,
+                   &aom_highbd_masked_compound_sad16x8_c),
+        make_tuple(&aom_highbd_masked_compound_sad8x16_ssse3,
+                   &aom_highbd_masked_compound_sad8x16_c),
+        make_tuple(&aom_highbd_masked_compound_sad8x8_ssse3,
+                   &aom_highbd_masked_compound_sad8x8_c),
+        make_tuple(&aom_highbd_masked_compound_sad8x4_ssse3,
+                   &aom_highbd_masked_compound_sad8x4_c),
+        make_tuple(&aom_highbd_masked_compound_sad4x8_ssse3,
+                   &aom_highbd_masked_compound_sad4x8_c),
+        make_tuple(&aom_highbd_masked_compound_sad4x4_ssse3,
+                   &aom_highbd_masked_compound_sad4x4_c)));
 #endif  // CONFIG_HIGHBITDEPTH
-#endif  // HAVE_SSSE3
+#endif  // 0 && HAVE_SSSE3
 }  // namespace
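
Both reworked test files feed the new second_pred buffer and the invert_mask flag through to the functions under test. That is also why the extreme-value sweep in masked_variance_test.cc below grows from 8 to 16 iterations: each of the four input buffers contributes one selector bit. A compact sketch of that enumeration pattern, using hypothetical buffer names and a stand-in buffer size:

#include <stdint.h>
#include <string.h>

#define BUF_SIZE (128 * 128) /* stand-in for the test's MAX_SB_SIZE buffers */

/* Sketch of the extreme-value enumeration: the loop index acts as a 4-bit
 * selector, one bit per buffer, hence 2^4 = 16 iterations. Names are
 * illustrative, not the test's identifiers. */
static void sweep_extreme_inputs(void) {
  static uint8_t src[BUF_SIZE], ref[BUF_SIZE];
  static uint8_t second_pred[BUF_SIZE], msk[BUF_SIZE];
  for (int i = 0; i < 16; ++i) {
    memset(src, (i & 0x1) ? 255 : 0, BUF_SIZE);
    memset(ref, (i & 0x2) ? 255 : 0, BUF_SIZE);
    memset(second_pred, (i & 0x4) ? 255 : 0, BUF_SIZE);
    memset(msk, (i & 0x8) ? 64 : 0, BUF_SIZE); /* mask weights top out at 64 */
    for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
      /* Run the C reference and, once written, the SIMD implementation on
       * these inputs and compare their outputs here. */
      (void)invert_mask;
    }
  }
}
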
diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc
index 65e852a..e0fc010 100644
--- a/test/masked_variance_test.cc
+++ b/test/masked_variance_test.cc
@@ -31,105 +31,10 @@
 namespace {
 const int number_of_iterations = 500;
 
-typedef unsigned int (*MaskedVarianceFunc)(const uint8_t *a, int a_stride,
-                                           const uint8_t *b, int b_stride,
-                                           const uint8_t *m, int m_stride,
-                                           unsigned int *sse);
-
-typedef std::tr1::tuple<MaskedVarianceFunc, MaskedVarianceFunc>
-    MaskedVarianceParam;
-
-class MaskedVarianceTest
-    : public ::testing::TestWithParam<MaskedVarianceParam> {
- public:
-  virtual ~MaskedVarianceTest() {}
-  virtual void SetUp() {
-    opt_func_ = GET_PARAM(0);
-    ref_func_ = GET_PARAM(1);
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  MaskedVarianceFunc opt_func_;
-  MaskedVarianceFunc ref_func_;
-};
-
-TEST_P(MaskedVarianceTest, OperationCheck) {
-  unsigned int ref_ret, opt_ret;
-  unsigned int ref_sse, opt_sse;
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
-  int err_count = 0;
-  int first_failure = -1;
-  int src_stride = MAX_SB_SIZE;
-  int ref_stride = MAX_SB_SIZE;
-  int msk_stride = MAX_SB_SIZE;
-
-  for (int i = 0; i < number_of_iterations; ++i) {
-    for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
-      src_ptr[j] = rnd.Rand8();
-      ref_ptr[j] = rnd.Rand8();
-      msk_ptr[j] = rnd(65);
-    }
-
-    ref_ret = ref_func_(src_ptr, src_stride, ref_ptr, ref_stride, msk_ptr,
-                        msk_stride, &ref_sse);
-    ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src_ptr, src_stride, ref_ptr,
-                                                 ref_stride, msk_ptr,
-                                                 msk_stride, &opt_sse));
-
-    if (opt_ret != ref_ret || opt_sse != ref_sse) {
-      err_count++;
-      if (first_failure == -1) first_failure = i;
-    }
-  }
-
-  EXPECT_EQ(0, err_count) << "Error: Masked Variance Test OperationCheck,"
-                          << "C output doesn't match SSSE3 output. "
-                          << "First failed at test case " << first_failure;
-}
-
-TEST_P(MaskedVarianceTest, ExtremeValues) {
-  unsigned int ref_ret, opt_ret;
-  unsigned int ref_sse, opt_sse;
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
-  int err_count = 0;
-  int first_failure = -1;
-  int src_stride = MAX_SB_SIZE;
-  int ref_stride = MAX_SB_SIZE;
-  int msk_stride = MAX_SB_SIZE;
-
-  for (int i = 0; i < 8; ++i) {
-    memset(src_ptr, (i & 0x1) ? 255 : 0, MAX_SB_SIZE * MAX_SB_SIZE);
-    memset(ref_ptr, (i & 0x2) ? 255 : 0, MAX_SB_SIZE * MAX_SB_SIZE);
-    memset(msk_ptr, (i & 0x4) ? 64 : 0, MAX_SB_SIZE * MAX_SB_SIZE);
-
-    ref_ret = ref_func_(src_ptr, src_stride, ref_ptr, ref_stride, msk_ptr,
-                        msk_stride, &ref_sse);
-    ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src_ptr, src_stride, ref_ptr,
-                                                 ref_stride, msk_ptr,
-                                                 msk_stride, &opt_sse));
-
-    if (opt_ret != ref_ret || opt_sse != ref_sse) {
-      err_count++;
-      if (first_failure == -1) first_failure = i;
-    }
-  }
-
-  EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues,"
-                          << "C output doesn't match SSSE3 output. "
-                          << "First failed at test case " << first_failure;
-}
-
 typedef unsigned int (*MaskedSubPixelVarianceFunc)(
-    const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
-    int b_stride, const uint8_t *m, int m_stride, unsigned int *sse);
+    const uint8_t *src, int src_stride, int xoffset, int yoffset,
+    const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
+    const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
 
 typedef std::tr1::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc>
     MaskedSubPixelVarianceParam;
@@ -156,6 +61,8 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
   DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+  DECLARE_ALIGNED(16, uint8_t,
+                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
   DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
   int err_count = 0;
   int first_failure = -1;
@@ -171,6 +78,7 @@
     for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1); j++) {
       src_ptr[j] = rnd.Rand8();
       ref_ptr[j] = rnd.Rand8();
+      second_pred_ptr[j] = rnd.Rand8();
       msk_ptr[j] = rnd(65);
     }
     for (int k = 0; k < 3; k++) {
@@ -178,16 +86,23 @@
       for (int l = 0; l < 3; l++) {
         xoffset = xoffsets[k];
         yoffset = yoffsets[l];
+        for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+          // Check each (xoffset, yoffset) pair with both mask orientations:
+          // the loop runs invert_mask = 0 and 1, and the optimized output
+          // must match the C reference for every combination of sub-pel
+          // offset and mask orientation.
+          ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
+                              ref_stride, second_pred_ptr, msk_ptr, msk_stride,
+                              invert_mask, &ref_sse);
+          ASM_REGISTER_STATE_CHECK(
+              opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset,
+                                  ref_ptr, ref_stride, second_pred_ptr, msk_ptr,
+                                  msk_stride, invert_mask, &opt_sse));
 
-        ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
-                            ref_stride, msk_ptr, msk_stride, &ref_sse);
-        ASM_REGISTER_STATE_CHECK(
-            opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
-                                ref_stride, msk_ptr, msk_stride, &opt_sse));
-
-        if (opt_ret != ref_ret || opt_sse != ref_sse) {
-          err_count++;
-          if (first_failure == -1) first_failure = i;
+          if (opt_ret != ref_ret || opt_sse != ref_sse) {
+            err_count++;
+            if (first_failure == -1) first_failure = i;
+          }
         }
       }
     }
@@ -205,6 +120,8 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
   DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+  DECLARE_ALIGNED(16, uint8_t,
+                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
   DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
   int first_failure_x = -1;
   int first_failure_y = -1;
@@ -216,26 +133,32 @@
 
   for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
     for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
-      for (int i = 0; i < 8; ++i) {
+      for (int i = 0; i < 16; ++i) {
         memset(src_ptr, (i & 0x1) ? 255 : 0,
                (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
         memset(ref_ptr, (i & 0x2) ? 255 : 0,
                (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
-        memset(msk_ptr, (i & 0x4) ? 64 : 0,
+        memset(second_pred_ptr, (i & 0x4) ? 255 : 0,
+               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+        memset(msk_ptr, (i & 0x8) ? 64 : 0,
                (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
 
-        ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
-                            ref_stride, msk_ptr, msk_stride, &ref_sse);
-        ASM_REGISTER_STATE_CHECK(
-            opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
-                                ref_stride, msk_ptr, msk_stride, &opt_sse));
+        for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+          ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
+                              ref_stride, second_pred_ptr, msk_ptr, msk_stride,
+                              invert_mask, &ref_sse);
+          ASM_REGISTER_STATE_CHECK(
+              opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset,
+                                  ref_ptr, ref_stride, second_pred_ptr, msk_ptr,
+                                  msk_stride, invert_mask, &opt_sse));
 
-        if (opt_ret != ref_ret || opt_sse != ref_sse) {
-          err_count++;
-          if (first_failure == -1) {
-            first_failure = i;
-            first_failure_x = xoffset;
-            first_failure_y = yoffset;
+          if (opt_ret != ref_ret || opt_sse != ref_sse) {
+            err_count++;
+            if (first_failure == -1) {
+              first_failure = i;
+              first_failure_x = xoffset;
+              first_failure_y = yoffset;
+            }
           }
         }
       }
@@ -250,105 +173,6 @@
 }
 
 #if CONFIG_HIGHBITDEPTH
-typedef std::tr1::tuple<MaskedVarianceFunc, MaskedVarianceFunc, aom_bit_depth_t>
-    HighbdMaskedVarianceParam;
-
-class HighbdMaskedVarianceTest
-    : public ::testing::TestWithParam<HighbdMaskedVarianceParam> {
- public:
-  virtual ~HighbdMaskedVarianceTest() {}
-  virtual void SetUp() {
-    opt_func_ = GET_PARAM(0);
-    ref_func_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(2);
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  MaskedVarianceFunc opt_func_;
-  MaskedVarianceFunc ref_func_;
-  aom_bit_depth_t bit_depth_;
-};
-
-TEST_P(HighbdMaskedVarianceTest, OperationCheck) {
-  unsigned int ref_ret, opt_ret;
-  unsigned int ref_sse, opt_sse;
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
-  DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
-  uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
-  uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
-  int err_count = 0;
-  int first_failure = -1;
-  int src_stride = MAX_SB_SIZE;
-  int ref_stride = MAX_SB_SIZE;
-  int msk_stride = MAX_SB_SIZE;
-
-  for (int i = 0; i < number_of_iterations; ++i) {
-    for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
-      src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
-      ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
-      msk_ptr[j] = rnd(65);
-    }
-
-    ref_ret = ref_func_(src8_ptr, src_stride, ref8_ptr, ref_stride, msk_ptr,
-                        msk_stride, &ref_sse);
-    ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src8_ptr, src_stride, ref8_ptr,
-                                                 ref_stride, msk_ptr,
-                                                 msk_stride, &opt_sse));
-
-    if (opt_ret != ref_ret || opt_sse != ref_sse) {
-      err_count++;
-      if (first_failure == -1) first_failure = i;
-    }
-  }
-
-  EXPECT_EQ(0, err_count) << "Error: Masked Variance Test OperationCheck,"
-                          << "C output doesn't match SSSE3 output. "
-                          << "First failed at test case " << first_failure;
-}
-
-TEST_P(HighbdMaskedVarianceTest, ExtremeValues) {
-  unsigned int ref_ret, opt_ret;
-  unsigned int ref_sse, opt_sse;
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
-  DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
-  uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
-  uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
-  int err_count = 0;
-  int first_failure = -1;
-  int src_stride = MAX_SB_SIZE;
-  int ref_stride = MAX_SB_SIZE;
-  int msk_stride = MAX_SB_SIZE;
-
-  for (int i = 0; i < 8; ++i) {
-    aom_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
-                 MAX_SB_SIZE * MAX_SB_SIZE);
-    aom_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
-                 MAX_SB_SIZE * MAX_SB_SIZE);
-    memset(msk_ptr, (i & 0x4) ? 64 : 0, MAX_SB_SIZE * MAX_SB_SIZE);
-
-    ref_ret = ref_func_(src8_ptr, src_stride, ref8_ptr, ref_stride, msk_ptr,
-                        msk_stride, &ref_sse);
-    ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src8_ptr, src_stride, ref8_ptr,
-                                                 ref_stride, msk_ptr,
-                                                 msk_stride, &opt_sse));
-
-    if (opt_ret != ref_ret || opt_sse != ref_sse) {
-      err_count++;
-      if (first_failure == -1) first_failure = i;
-    }
-  }
-
-  EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues,"
-                          << "C output doesn't match SSSE3 output. "
-                          << "First failed at test case " << first_failure;
-}
-
 typedef std::tr1::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc,
                         aom_bit_depth_t>
     HighbdMaskedSubPixelVarianceParam;
@@ -377,9 +201,12 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
   DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+  DECLARE_ALIGNED(16, uint16_t,
+                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
   DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
   uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
   uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+  uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr);
   int err_count = 0;
   int first_failure = -1;
   int first_failure_x = -1;
@@ -395,22 +222,26 @@
         for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1); j++) {
           src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
           ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+          second_pred_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
           msk_ptr[j] = rnd(65);
         }
 
-        ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
-                            ref_stride, msk_ptr, msk_stride, &ref_sse);
-        ASM_REGISTER_STATE_CHECK(opt_ret =
-                                     opt_func_(src8_ptr, src_stride, xoffset,
-                                               yoffset, ref8_ptr, ref_stride,
-                                               msk_ptr, msk_stride, &opt_sse));
+        for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+          ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
+                              ref_stride, second_pred8_ptr, msk_ptr, msk_stride,
+                              invert_mask, &ref_sse);
+          ASM_REGISTER_STATE_CHECK(
+              opt_ret = opt_func_(src8_ptr, src_stride, xoffset, yoffset,
+                                  ref8_ptr, ref_stride, second_pred8_ptr,
+                                  msk_ptr, msk_stride, invert_mask, &opt_sse));
 
-        if (opt_ret != ref_ret || opt_sse != ref_sse) {
-          err_count++;
-          if (first_failure == -1) {
-            first_failure = i;
-            first_failure_x = xoffset;
-            first_failure_y = yoffset;
+          if (opt_ret != ref_ret || opt_sse != ref_sse) {
+            err_count++;
+            if (first_failure == -1) {
+              first_failure = i;
+              first_failure_x = xoffset;
+              first_failure_y = yoffset;
+            }
           }
         }
       }
@@ -431,8 +262,11 @@
   DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
   DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
   DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+  DECLARE_ALIGNED(16, uint16_t,
+                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
   uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
   uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+  uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr);
   int first_failure_x = -1;
   int first_failure_y = -1;
   int err_count = 0;
@@ -443,27 +277,32 @@
 
   for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
     for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
-      for (int i = 0; i < 8; ++i) {
+      for (int i = 0; i < 16; ++i) {
         aom_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
                      (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
         aom_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
                      (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
-        memset(msk_ptr, (i & 0x4) ? 64 : 0,
+        aom_memset16(second_pred_ptr, (i & 0x4) ? ((1 << bit_depth_) - 1) : 0,
+                     (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+        memset(msk_ptr, (i & 0x8) ? 64 : 0,
                (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
 
-        ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
-                            ref_stride, msk_ptr, msk_stride, &ref_sse);
-        ASM_REGISTER_STATE_CHECK(opt_ret =
-                                     opt_func_(src8_ptr, src_stride, xoffset,
-                                               yoffset, ref8_ptr, ref_stride,
-                                               msk_ptr, msk_stride, &opt_sse));
+        for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+          ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
+                              ref_stride, second_pred8_ptr, msk_ptr, msk_stride,
+                              invert_mask, &ref_sse);
+          ASM_REGISTER_STATE_CHECK(
+              opt_ret = opt_func_(src8_ptr, src_stride, xoffset, yoffset,
+                                  ref8_ptr, ref_stride, second_pred8_ptr,
+                                  msk_ptr, msk_stride, invert_mask, &opt_sse));
 
-        if (opt_ret != ref_ret || opt_sse != ref_sse) {
-          err_count++;
-          if (first_failure == -1) {
-            first_failure = i;
-            first_failure_x = xoffset;
-            first_failure_y = yoffset;
+          if (opt_ret != ref_ret || opt_sse != ref_sse) {
+            err_count++;
+            if (first_failure == -1) {
+              first_failure = i;
+              first_failure_x = xoffset;
+              first_failure_y = yoffset;
+            }
           }
         }
       }
@@ -480,311 +319,208 @@
 
 using std::tr1::make_tuple;
 
-#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
-    SSSE3_C_COMPARE, MaskedVarianceTest,
-    ::testing::Values(
-#if CONFIG_EXT_PARTITION
-        make_tuple(&aom_masked_variance128x128_ssse3,
-                   &aom_masked_variance128x128_c),
-        make_tuple(&aom_masked_variance128x64_ssse3,
-                   &aom_masked_variance128x64_c),
-        make_tuple(&aom_masked_variance64x128_ssse3,
-                   &aom_masked_variance64x128_c),
-#endif  // CONFIG_EXT_PARTITION
-        make_tuple(&aom_masked_variance64x64_ssse3,
-                   &aom_masked_variance64x64_c),
-        make_tuple(&aom_masked_variance64x32_ssse3,
-                   &aom_masked_variance64x32_c),
-        make_tuple(&aom_masked_variance32x64_ssse3,
-                   &aom_masked_variance32x64_c),
-        make_tuple(&aom_masked_variance32x32_ssse3,
-                   &aom_masked_variance32x32_c),
-        make_tuple(&aom_masked_variance32x16_ssse3,
-                   &aom_masked_variance32x16_c),
-        make_tuple(&aom_masked_variance16x32_ssse3,
-                   &aom_masked_variance16x32_c),
-        make_tuple(&aom_masked_variance16x16_ssse3,
-                   &aom_masked_variance16x16_c),
-        make_tuple(&aom_masked_variance16x8_ssse3, &aom_masked_variance16x8_c),
-        make_tuple(&aom_masked_variance8x16_ssse3, &aom_masked_variance8x16_c),
-        make_tuple(&aom_masked_variance8x8_ssse3, &aom_masked_variance8x8_c),
-        make_tuple(&aom_masked_variance8x4_ssse3, &aom_masked_variance8x4_c),
-        make_tuple(&aom_masked_variance4x8_ssse3, &aom_masked_variance4x8_c),
-        make_tuple(&aom_masked_variance4x4_ssse3, &aom_masked_variance4x4_c)));
-
+// TODO(david.barker): Re-enable these tests once vectorized versions of the
+// masked_compound_* functions are available.
+#if 0 && HAVE_SSSE3
 INSTANTIATE_TEST_CASE_P(
     SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
     ::testing::Values(
 #if CONFIG_EXT_PARTITION
-        make_tuple(&aom_masked_sub_pixel_variance128x128_ssse3,
-                   &aom_masked_sub_pixel_variance128x128_c),
-        make_tuple(&aom_masked_sub_pixel_variance128x64_ssse3,
-                   &aom_masked_sub_pixel_variance128x64_c),
-        make_tuple(&aom_masked_sub_pixel_variance64x128_ssse3,
-                   &aom_masked_sub_pixel_variance64x128_c),
+        make_tuple(&aom_masked_compound_sub_pixel_variance128x128_ssse3,
+                   &aom_masked_compound_sub_pixel_variance128x128_c),
+        make_tuple(&aom_masked_compound_sub_pixel_variance128x64_ssse3,
+                   &aom_masked_compound_sub_pixel_variance128x64_c),
+        make_tuple(&aom_masked_compound_sub_pixel_variance64x128_ssse3,
+                   &aom_masked_compound_sub_pixel_variance64x128_c),
 #endif  // CONFIG_EXT_PARTITION
-        make_tuple(&aom_masked_sub_pixel_variance64x64_ssse3,
-                   &aom_masked_sub_pixel_variance64x64_c),
-        make_tuple(&aom_masked_sub_pixel_variance64x32_ssse3,
-                   &aom_masked_sub_pixel_variance64x32_c),
-        make_tuple(&aom_masked_sub_pixel_variance32x64_ssse3,
-                   &aom_masked_sub_pixel_variance32x64_c),
-        make_tuple(&aom_masked_sub_pixel_variance32x32_ssse3,
-                   &aom_masked_sub_pixel_variance32x32_c),
-        make_tuple(&aom_masked_sub_pixel_variance32x16_ssse3,
-                   &aom_masked_sub_pixel_variance32x16_c),
-        make_tuple(&aom_masked_sub_pixel_variance16x32_ssse3,
-                   &aom_masked_sub_pixel_variance16x32_c),
-        make_tuple(&aom_masked_sub_pixel_variance16x16_ssse3,
-                   &aom_masked_sub_pixel_variance16x16_c),
-        make_tuple(&aom_masked_sub_pixel_variance16x8_ssse3,
-                   &aom_masked_sub_pixel_variance16x8_c),
-        make_tuple(&aom_masked_sub_pixel_variance8x16_ssse3,
-                   &aom_masked_sub_pixel_variance8x16_c),
-        make_tuple(&aom_masked_sub_pixel_variance8x8_ssse3,
-                   &aom_masked_sub_pixel_variance8x8_c),
-        make_tuple(&aom_masked_sub_pixel_variance8x4_ssse3,
-                   &aom_masked_sub_pixel_variance8x4_c),
-        make_tuple(&aom_masked_sub_pixel_variance4x8_ssse3,
-                   &aom_masked_sub_pixel_variance4x8_c),
-        make_tuple(&aom_masked_sub_pixel_variance4x4_ssse3,
-                   &aom_masked_sub_pixel_variance4x4_c)));
+        make_tuple(&aom_masked_compound_sub_pixel_variance64x64_ssse3,
+                   &aom_masked_compound_sub_pixel_variance64x64_c),
+        make_tuple(&aom_masked_compound_sub_pixel_variance64x32_ssse3,
+                   &aom_masked_compound_sub_pixel_variance64x32_c),
+        make_tuple(&aom_masked_compound_sub_pixel_variance32x64_ssse3,
+                   &aom_masked_compound_sub_pixel_variance32x64_c),
+        make_tuple(&aom_masked_compound_sub_pixel_variance32x32_ssse3,
+                   &aom_masked_compound_sub_pixel_variance32x32_c),
+        make_tuple(&aom_masked_compound_sub_pixel_variance32x16_ssse3,
+                   &aom_masked_compound_sub_pixel_variance32x16_c),
+        make_tuple(&aom_masked_compound_sub_pixel_variance16x32_ssse3,
+                   &aom_masked_compound_sub_pixel_variance16x32_c),
+        make_tuple(&aom_masked_compound_sub_pixel_variance16x16_ssse3,
+                   &aom_masked_compound_sub_pixel_variance16x16_c),
+        make_tuple(&aom_masked_compound_sub_pixel_variance16x8_ssse3,
+                   &aom_masked_compound_sub_pixel_variance16x8_c),
+        make_tuple(&aom_masked_compound_sub_pixel_variance8x16_ssse3,
+                   &aom_masked_compound_sub_pixel_variance8x16_c),
+        make_tuple(&aom_masked_compound_sub_pixel_variance8x8_ssse3,
+                   &aom_masked_compound_sub_pixel_variance8x8_c),
+        make_tuple(&aom_masked_compound_sub_pixel_variance8x4_ssse3,
+                   &aom_masked_compound_sub_pixel_variance8x4_c),
+        make_tuple(&aom_masked_compound_sub_pixel_variance4x8_ssse3,
+                   &aom_masked_compound_sub_pixel_variance4x8_c),
+        make_tuple(&aom_masked_compound_sub_pixel_variance4x4_ssse3,
+                   &aom_masked_compound_sub_pixel_variance4x4_c)));
 
 #if CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
-    SSSE3_C_COMPARE, HighbdMaskedVarianceTest,
-    ::testing::Values(
-#if CONFIG_EXT_PARTITION
-        make_tuple(&aom_highbd_masked_variance128x128_ssse3,
-                   &aom_highbd_masked_variance128x128_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_variance128x64_ssse3,
-                   &aom_highbd_masked_variance128x64_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_variance64x128_ssse3,
-                   &aom_highbd_masked_variance64x128_c, AOM_BITS_8),
-#endif  // CONFIG_EXT_PARTITION
-        make_tuple(&aom_highbd_masked_variance64x64_ssse3,
-                   &aom_highbd_masked_variance64x64_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_variance64x32_ssse3,
-                   &aom_highbd_masked_variance64x32_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_variance32x64_ssse3,
-                   &aom_highbd_masked_variance32x64_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_variance32x32_ssse3,
-                   &aom_highbd_masked_variance32x32_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_variance32x16_ssse3,
-                   &aom_highbd_masked_variance32x16_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_variance16x32_ssse3,
-                   &aom_highbd_masked_variance16x32_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_variance16x16_ssse3,
-                   &aom_highbd_masked_variance16x16_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_variance16x8_ssse3,
-                   &aom_highbd_masked_variance16x8_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_variance8x16_ssse3,
-                   &aom_highbd_masked_variance8x16_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_variance8x8_ssse3,
-                   &aom_highbd_masked_variance8x8_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_variance8x4_ssse3,
-                   &aom_highbd_masked_variance8x4_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_variance4x8_ssse3,
-                   &aom_highbd_masked_variance4x8_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_variance4x4_ssse3,
-                   &aom_highbd_masked_variance4x4_c, AOM_BITS_8),
-#if CONFIG_EXT_PARTITION
-        make_tuple(&aom_highbd_10_masked_variance128x128_ssse3,
-                   &aom_highbd_10_masked_variance128x128_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_variance128x64_ssse3,
-                   &aom_highbd_10_masked_variance128x64_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_variance64x128_ssse3,
-                   &aom_highbd_10_masked_variance64x128_c, AOM_BITS_10),
-#endif  // CONFIG_EXT_PARTITION
-        make_tuple(&aom_highbd_10_masked_variance64x64_ssse3,
-                   &aom_highbd_10_masked_variance64x64_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_variance64x32_ssse3,
-                   &aom_highbd_10_masked_variance64x32_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_variance32x64_ssse3,
-                   &aom_highbd_10_masked_variance32x64_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_variance32x32_ssse3,
-                   &aom_highbd_10_masked_variance32x32_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_variance32x16_ssse3,
-                   &aom_highbd_10_masked_variance32x16_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_variance16x32_ssse3,
-                   &aom_highbd_10_masked_variance16x32_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_variance16x16_ssse3,
-                   &aom_highbd_10_masked_variance16x16_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_variance16x8_ssse3,
-                   &aom_highbd_10_masked_variance16x8_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_variance8x16_ssse3,
-                   &aom_highbd_10_masked_variance8x16_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_variance8x8_ssse3,
-                   &aom_highbd_10_masked_variance8x8_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_variance8x4_ssse3,
-                   &aom_highbd_10_masked_variance8x4_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_variance4x8_ssse3,
-                   &aom_highbd_10_masked_variance4x8_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_variance4x4_ssse3,
-                   &aom_highbd_10_masked_variance4x4_c, AOM_BITS_10),
-#if CONFIG_EXT_PARTITION
-        make_tuple(&aom_highbd_12_masked_variance128x128_ssse3,
-                   &aom_highbd_12_masked_variance128x128_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_variance128x64_ssse3,
-                   &aom_highbd_12_masked_variance128x64_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_variance64x128_ssse3,
-                   &aom_highbd_12_masked_variance64x128_c, AOM_BITS_12),
-#endif  // CONFIG_EXT_PARTITION
-        make_tuple(&aom_highbd_12_masked_variance64x64_ssse3,
-                   &aom_highbd_12_masked_variance64x64_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_variance64x32_ssse3,
-                   &aom_highbd_12_masked_variance64x32_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_variance32x64_ssse3,
-                   &aom_highbd_12_masked_variance32x64_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_variance32x32_ssse3,
-                   &aom_highbd_12_masked_variance32x32_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_variance32x16_ssse3,
-                   &aom_highbd_12_masked_variance32x16_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_variance16x32_ssse3,
-                   &aom_highbd_12_masked_variance16x32_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_variance16x16_ssse3,
-                   &aom_highbd_12_masked_variance16x16_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_variance16x8_ssse3,
-                   &aom_highbd_12_masked_variance16x8_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_variance8x16_ssse3,
-                   &aom_highbd_12_masked_variance8x16_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_variance8x8_ssse3,
-                   &aom_highbd_12_masked_variance8x8_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_variance8x4_ssse3,
-                   &aom_highbd_12_masked_variance8x4_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_variance4x8_ssse3,
-                   &aom_highbd_12_masked_variance4x8_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_variance4x4_ssse3,
-                   &aom_highbd_12_masked_variance4x4_c, AOM_BITS_12)));
-
-INSTANTIATE_TEST_CASE_P(
     SSSE3_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
     ::testing::Values(
 #if CONFIG_EXT_PARTITION
-        make_tuple(&aom_highbd_masked_sub_pixel_variance128x128_ssse3,
-                   &aom_highbd_masked_sub_pixel_variance128x128_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_sub_pixel_variance128x64_ssse3,
-                   &aom_highbd_masked_sub_pixel_variance128x64_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_sub_pixel_variance64x128_ssse3,
-                   &aom_highbd_masked_sub_pixel_variance64x128_c, AOM_BITS_8),
+        make_tuple(&aom_highbd_masked_compound_sub_pixel_variance128x128_ssse3,
+                   &aom_highbd_masked_compound_sub_pixel_variance128x128_c,
+                   AOM_BITS_8),
+        make_tuple(&aom_highbd_masked_compound_sub_pixel_variance128x64_ssse3,
+                   &aom_highbd_masked_compound_sub_pixel_variance128x64_c,
+                   AOM_BITS_8),
+        make_tuple(&aom_highbd_masked_compound_sub_pixel_variance64x128_ssse3,
+                   &aom_highbd_masked_compound_sub_pixel_variance64x128_c,
+                   AOM_BITS_8),
 #endif  // CONFIG_EXT_PARTITION
-        make_tuple(&aom_highbd_masked_sub_pixel_variance64x64_ssse3,
-                   &aom_highbd_masked_sub_pixel_variance64x64_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_sub_pixel_variance64x32_ssse3,
-                   &aom_highbd_masked_sub_pixel_variance64x32_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_sub_pixel_variance32x64_ssse3,
-                   &aom_highbd_masked_sub_pixel_variance32x64_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_sub_pixel_variance32x32_ssse3,
-                   &aom_highbd_masked_sub_pixel_variance32x32_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_sub_pixel_variance32x16_ssse3,
-                   &aom_highbd_masked_sub_pixel_variance32x16_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_sub_pixel_variance16x32_ssse3,
-                   &aom_highbd_masked_sub_pixel_variance16x32_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_sub_pixel_variance16x16_ssse3,
-                   &aom_highbd_masked_sub_pixel_variance16x16_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_sub_pixel_variance16x8_ssse3,
-                   &aom_highbd_masked_sub_pixel_variance16x8_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_sub_pixel_variance8x16_ssse3,
-                   &aom_highbd_masked_sub_pixel_variance8x16_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_sub_pixel_variance8x8_ssse3,
-                   &aom_highbd_masked_sub_pixel_variance8x8_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_sub_pixel_variance8x4_ssse3,
-                   &aom_highbd_masked_sub_pixel_variance8x4_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_sub_pixel_variance4x8_ssse3,
-                   &aom_highbd_masked_sub_pixel_variance4x8_c, AOM_BITS_8),
-        make_tuple(&aom_highbd_masked_sub_pixel_variance4x4_ssse3,
-                   &aom_highbd_masked_sub_pixel_variance4x4_c, AOM_BITS_8),
+        make_tuple(&aom_highbd_masked_compound_sub_pixel_variance64x64_ssse3,
+                   &aom_highbd_masked_compound_sub_pixel_variance64x64_c,
+                   AOM_BITS_8),
+        make_tuple(&aom_highbd_masked_compound_sub_pixel_variance64x32_ssse3,
+                   &aom_highbd_masked_compound_sub_pixel_variance64x32_c,
+                   AOM_BITS_8),
+        make_tuple(&aom_highbd_masked_compound_sub_pixel_variance32x64_ssse3,
+                   &aom_highbd_masked_compound_sub_pixel_variance32x64_c,
+                   AOM_BITS_8),
+        make_tuple(&aom_highbd_masked_compound_sub_pixel_variance32x32_ssse3,
+                   &aom_highbd_masked_compound_sub_pixel_variance32x32_c,
+                   AOM_BITS_8),
+        make_tuple(&aom_highbd_masked_compound_sub_pixel_variance32x16_ssse3,
+                   &aom_highbd_masked_compound_sub_pixel_variance32x16_c,
+                   AOM_BITS_8),
+        make_tuple(&aom_highbd_masked_compound_sub_pixel_variance16x32_ssse3,
+                   &aom_highbd_masked_compound_sub_pixel_variance16x32_c,
+                   AOM_BITS_8),
+        make_tuple(&aom_highbd_masked_compound_sub_pixel_variance16x16_ssse3,
+                   &aom_highbd_masked_compound_sub_pixel_variance16x16_c,
+                   AOM_BITS_8),
+        make_tuple(&aom_highbd_masked_compound_sub_pixel_variance16x8_ssse3,
+                   &aom_highbd_masked_compound_sub_pixel_variance16x8_c,
+                   AOM_BITS_8),
+        make_tuple(&aom_highbd_masked_compound_sub_pixel_variance8x16_ssse3,
+                   &aom_highbd_masked_compound_sub_pixel_variance8x16_c,
+                   AOM_BITS_8),
+        make_tuple(&aom_highbd_masked_compound_sub_pixel_variance8x8_ssse3,
+                   &aom_highbd_masked_compound_sub_pixel_variance8x8_c,
+                   AOM_BITS_8),
+        make_tuple(&aom_highbd_masked_compound_sub_pixel_variance8x4_ssse3,
+                   &aom_highbd_masked_compound_sub_pixel_variance8x4_c,
+                   AOM_BITS_8),
+        make_tuple(&aom_highbd_masked_compound_sub_pixel_variance4x8_ssse3,
+                   &aom_highbd_masked_compound_sub_pixel_variance4x8_c,
+                   AOM_BITS_8),
+        make_tuple(&aom_highbd_masked_compound_sub_pixel_variance4x4_ssse3,
+                   &aom_highbd_masked_compound_sub_pixel_variance4x4_c,
+                   AOM_BITS_8),
 #if CONFIG_EXT_PARTITION
-        make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x128_ssse3,
-                   &aom_highbd_10_masked_sub_pixel_variance128x128_c,
-                   AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x64_ssse3,
-                   &aom_highbd_10_masked_sub_pixel_variance128x64_c,
-                   AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x128_ssse3,
-                   &aom_highbd_10_masked_sub_pixel_variance64x128_c,
-                   AOM_BITS_10),
+        make_tuple(
+            &aom_highbd_10_masked_compound_sub_pixel_variance128x128_ssse3,
+            &aom_highbd_10_masked_compound_sub_pixel_variance128x128_c,
+            AOM_BITS_10),
+        make_tuple(
+            &aom_highbd_10_masked_compound_sub_pixel_variance128x64_ssse3,
+            &aom_highbd_10_masked_compound_sub_pixel_variance128x64_c,
+            AOM_BITS_10),
+        make_tuple(
+            &aom_highbd_10_masked_compound_sub_pixel_variance64x128_ssse3,
+            &aom_highbd_10_masked_compound_sub_pixel_variance64x128_c,
+            AOM_BITS_10),
 #endif  // CONFIG_EXT_PARTITION
-        make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x64_ssse3,
-                   &aom_highbd_10_masked_sub_pixel_variance64x64_c,
+        make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance64x64_ssse3,
+                   &aom_highbd_10_masked_compound_sub_pixel_variance64x64_c,
                    AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x32_ssse3,
-                   &aom_highbd_10_masked_sub_pixel_variance64x32_c,
+        make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance64x32_ssse3,
+                   &aom_highbd_10_masked_compound_sub_pixel_variance64x32_c,
                    AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x64_ssse3,
-                   &aom_highbd_10_masked_sub_pixel_variance32x64_c,
+        make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance32x64_ssse3,
+                   &aom_highbd_10_masked_compound_sub_pixel_variance32x64_c,
                    AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x32_ssse3,
-                   &aom_highbd_10_masked_sub_pixel_variance32x32_c,
+        make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance32x32_ssse3,
+                   &aom_highbd_10_masked_compound_sub_pixel_variance32x32_c,
                    AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x16_ssse3,
-                   &aom_highbd_10_masked_sub_pixel_variance32x16_c,
+        make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance32x16_ssse3,
+                   &aom_highbd_10_masked_compound_sub_pixel_variance32x16_c,
                    AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x32_ssse3,
-                   &aom_highbd_10_masked_sub_pixel_variance16x32_c,
+        make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance16x32_ssse3,
+                   &aom_highbd_10_masked_compound_sub_pixel_variance16x32_c,
                    AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x16_ssse3,
-                   &aom_highbd_10_masked_sub_pixel_variance16x16_c,
+        make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance16x16_ssse3,
+                   &aom_highbd_10_masked_compound_sub_pixel_variance16x16_c,
                    AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x8_ssse3,
-                   &aom_highbd_10_masked_sub_pixel_variance16x8_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x16_ssse3,
-                   &aom_highbd_10_masked_sub_pixel_variance8x16_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x8_ssse3,
-                   &aom_highbd_10_masked_sub_pixel_variance8x8_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x4_ssse3,
-                   &aom_highbd_10_masked_sub_pixel_variance8x4_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x8_ssse3,
-                   &aom_highbd_10_masked_sub_pixel_variance4x8_c, AOM_BITS_10),
-        make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x4_ssse3,
-                   &aom_highbd_10_masked_sub_pixel_variance4x4_c, AOM_BITS_10),
+        make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance16x8_ssse3,
+                   &aom_highbd_10_masked_compound_sub_pixel_variance16x8_c,
+                   AOM_BITS_10),
+        make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance8x16_ssse3,
+                   &aom_highbd_10_masked_compound_sub_pixel_variance8x16_c,
+                   AOM_BITS_10),
+        make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance8x8_ssse3,
+                   &aom_highbd_10_masked_compound_sub_pixel_variance8x8_c,
+                   AOM_BITS_10),
+        make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance8x4_ssse3,
+                   &aom_highbd_10_masked_compound_sub_pixel_variance8x4_c,
+                   AOM_BITS_10),
+        make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance4x8_ssse3,
+                   &aom_highbd_10_masked_compound_sub_pixel_variance4x8_c,
+                   AOM_BITS_10),
+        make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance4x4_ssse3,
+                   &aom_highbd_10_masked_compound_sub_pixel_variance4x4_c,
+                   AOM_BITS_10),
 #if CONFIG_EXT_PARTITION
-        make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x128_ssse3,
-                   &aom_highbd_12_masked_sub_pixel_variance128x128_c,
-                   AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x64_ssse3,
-                   &aom_highbd_12_masked_sub_pixel_variance128x64_c,
-                   AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x128_ssse3,
-                   &aom_highbd_12_masked_sub_pixel_variance64x128_c,
-                   AOM_BITS_12),
+        make_tuple(
+            &aom_highbd_12_masked_compound_sub_pixel_variance128x128_ssse3,
+            &aom_highbd_12_masked_compound_sub_pixel_variance128x128_c,
+            AOM_BITS_12),
+        make_tuple(
+            &aom_highbd_12_masked_compound_sub_pixel_variance128x64_ssse3,
+            &aom_highbd_12_masked_compound_sub_pixel_variance128x64_c,
+            AOM_BITS_12),
+        make_tuple(
+            &aom_highbd_12_masked_compound_sub_pixel_variance64x128_ssse3,
+            &aom_highbd_12_masked_compound_sub_pixel_variance64x128_c,
+            AOM_BITS_12),
 #endif  // CONFIG_EXT_PARTITION
-        make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x64_ssse3,
-                   &aom_highbd_12_masked_sub_pixel_variance64x64_c,
+        make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance64x64_ssse3,
+                   &aom_highbd_12_masked_compound_sub_pixel_variance64x64_c,
                    AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x32_ssse3,
-                   &aom_highbd_12_masked_sub_pixel_variance64x32_c,
+        make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance64x32_ssse3,
+                   &aom_highbd_12_masked_compound_sub_pixel_variance64x32_c,
                    AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x64_ssse3,
-                   &aom_highbd_12_masked_sub_pixel_variance32x64_c,
+        make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance32x64_ssse3,
+                   &aom_highbd_12_masked_compound_sub_pixel_variance32x64_c,
                    AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x32_ssse3,
-                   &aom_highbd_12_masked_sub_pixel_variance32x32_c,
+        make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance32x32_ssse3,
+                   &aom_highbd_12_masked_compound_sub_pixel_variance32x32_c,
                    AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x16_ssse3,
-                   &aom_highbd_12_masked_sub_pixel_variance32x16_c,
+        make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance32x16_ssse3,
+                   &aom_highbd_12_masked_compound_sub_pixel_variance32x16_c,
                    AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x32_ssse3,
-                   &aom_highbd_12_masked_sub_pixel_variance16x32_c,
+        make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance16x32_ssse3,
+                   &aom_highbd_12_masked_compound_sub_pixel_variance16x32_c,
                    AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x16_ssse3,
-                   &aom_highbd_12_masked_sub_pixel_variance16x16_c,
+        make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance16x16_ssse3,
+                   &aom_highbd_12_masked_compound_sub_pixel_variance16x16_c,
                    AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x8_ssse3,
-                   &aom_highbd_12_masked_sub_pixel_variance16x8_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x16_ssse3,
-                   &aom_highbd_12_masked_sub_pixel_variance8x16_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x8_ssse3,
-                   &aom_highbd_12_masked_sub_pixel_variance8x8_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x4_ssse3,
-                   &aom_highbd_12_masked_sub_pixel_variance8x4_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x8_ssse3,
-                   &aom_highbd_12_masked_sub_pixel_variance4x8_c, AOM_BITS_12),
-        make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x4_ssse3,
-                   &aom_highbd_12_masked_sub_pixel_variance4x4_c,
+        make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance16x8_ssse3,
+                   &aom_highbd_12_masked_compound_sub_pixel_variance16x8_c,
+                   AOM_BITS_12),
+        make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance8x16_ssse3,
+                   &aom_highbd_12_masked_compound_sub_pixel_variance8x16_c,
+                   AOM_BITS_12),
+        make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance8x8_ssse3,
+                   &aom_highbd_12_masked_compound_sub_pixel_variance8x8_c,
+                   AOM_BITS_12),
+        make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance8x4_ssse3,
+                   &aom_highbd_12_masked_compound_sub_pixel_variance8x4_c,
+                   AOM_BITS_12),
+        make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance4x8_ssse3,
+                   &aom_highbd_12_masked_compound_sub_pixel_variance4x8_c,
+                   AOM_BITS_12),
+        make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance4x4_ssse3,
+                   &aom_highbd_12_masked_compound_sub_pixel_variance4x4_c,
                    AOM_BITS_12)));
 #endif  // CONFIG_HIGHBITDEPTH
 
-#endif  // HAVE_SSSE3
+#endif  // 0 && HAVE_SSSE3
 }  // namespace