Facilitate SIMD optimization of subpel motion search

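To facilitate SIMD optimization of the 4-tap filter used in
subpel motion search, the unit tests are corrected to do
a C vs. SIMD comparison. The C reference functions now call
the _c routines directly, and aom_comp_mask_upsampled_pred
is dispatched through RTCD with an SSE2 version.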

Change-Id: I502dae35bb17d5ce9cf47e49bc6b46ba73deb37c
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 040ac16..27e95b9 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -909,6 +909,13 @@
                                                        int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search";
   specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/;
 
+  add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                       const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+                                                       int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+                                                       int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+                                                       int subpel_search";
+  specialize qw/aom_comp_mask_upsampled_pred sse2/;
+
 
   add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
                                                  const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index db179d6..23b7153 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -382,13 +382,13 @@
   } else if (!subpel_y_q3) {
     const int16_t *const kernel =
         av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
-                        width, height);
+    aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
+                          -1, width, height);
   } else if (!subpel_x_q3) {
     const int16_t *const kernel =
         av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
-                       width, height);
+    aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
+                         16, width, height);
   } else {
     DECLARE_ALIGNED(16, uint8_t,
                     temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
@@ -399,12 +399,12 @@
     const int intermediate_height =
         (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
     assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1),
-                        ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
-                        width, intermediate_height);
-    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
-                       MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
-                       width, height);
+    aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
+                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
+                          width, intermediate_height);
+    aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
+                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
+                         width, height);
   }
 }
 
@@ -455,8 +455,9 @@
   const int fwd_offset = jcp_param->fwd_offset;
   const int bck_offset = jcp_param->bck_offset;
 
-  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
+  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                       subpel_search);
 
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
@@ -1092,23 +1093,23 @@
   }
 }
 
-void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm,
-                                  int mi_row, int mi_col, const MV *const mv,
-                                  uint8_t *comp_pred, const uint8_t *pred,
-                                  int width, int height, int subpel_x_q3,
-                                  int subpel_y_q3, const uint8_t *ref,
-                                  int ref_stride, const uint8_t *mask,
-                                  int mask_stride, int invert_mask,
-                                  int subpel_search) {
+void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                    int mi_row, int mi_col, const MV *const mv,
+                                    uint8_t *comp_pred, const uint8_t *pred,
+                                    int width, int height, int subpel_x_q3,
+                                    int subpel_y_q3, const uint8_t *ref,
+                                    int ref_stride, const uint8_t *mask,
+                                    int mask_stride, int invert_mask,
+                                    int subpel_search) {
   if (subpel_x_q3 | subpel_y_q3) {
-    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                       subpel_search);
+    aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                         subpel_search);
     ref = comp_pred;
     ref_stride = width;
   }
-  aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
-                     mask_stride, invert_mask);
+  aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask,
+                       mask_stride, invert_mask);
 }
 
 #define MASK_SUBPIX_VAR(W, H)                                                  \
diff --git a/aom_dsp/variance.h b/aom_dsp/variance.h
index d29c6e4..77ad261 100644
--- a/aom_dsp/variance.h
+++ b/aom_dsp/variance.h
@@ -70,13 +70,6 @@
     const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
     const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
 
-void aom_comp_mask_upsampled_pred(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-    int subpel_search);
-
 void aom_highbd_comp_mask_upsampled_pred(
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h
index b7c6980..001ed47 100644
--- a/aom_dsp/x86/convolve.h
+++ b/aom_dsp/x86/convolve.h
@@ -33,7 +33,30 @@
     (void)y_step_q4;                                                         \
     assert((-128 <= filter[3]) && (filter[3] <= 127));                       \
     assert(step_q4 == 16);                                                   \
-    if (filter[0] | filter[1] | filter[2]) {                                 \
+    if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) &&            \
+        (filter[2] | filter[5])) {                                           \
+      while (w >= 16) {                                                      \
+        aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
+                                                 dst_stride, h, filter);     \
+        src += 16;                                                           \
+        dst += 16;                                                           \
+        w -= 16;                                                             \
+      }                                                                      \
+      while (w >= 8) {                                                       \
+        aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \
+                                                dst_stride, h, filter);      \
+        src += 8;                                                            \
+        dst += 8;                                                            \
+        w -= 8;                                                              \
+      }                                                                      \
+      while (w >= 4) {                                                       \
+        aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
+                                                dst_stride, h, filter);      \
+        src += 4;                                                            \
+        dst += 4;                                                            \
+        w -= 4;                                                              \
+      }                                                                      \
+    } else if (filter[0] | filter[1] | filter[2]) {                          \
       while (w >= 16) {                                                      \
         aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
                                                  dst_stride, h, filter);     \
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index 1a27fd2..051dff6 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -575,6 +575,7 @@
       (subpel_search == 1)
           ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
           : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+  int filter_taps = SUBPEL_TAPS;
 
   if (!subpel_x_q3 && !subpel_y_q3) {
     if (width >= 16) {
@@ -636,15 +637,16 @@
         av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
     const int16_t *const kernel_y =
         av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+    const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
+    uint8_t *temp_start_horiz = temp;
+    uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+    int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
     assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1),
-                        ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
-                        width, intermediate_height);
-    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
-                       MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
-                       width, height);
+    aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
+                        kernel_x, 16, NULL, -1, width, intermediate_height);
+    aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
+                       kernel_y, 16, width, height);
   }
 }
 
@@ -669,6 +671,23 @@
   }
 }
 
+void aom_comp_mask_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+    int subpel_search) {
+  if (subpel_x_q3 | subpel_y_q3) {
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                       subpel_search);
+    ref = comp_pred;
+    ref_stride = width;
+  }
+  aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
+                     mask_stride, invert_mask);
+}
+
 static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
                                                       const __m128i s1,
                                                       const __m128i a) {
diff --git a/test/comp_mask_variance_test.cc b/test/comp_mask_variance_test.cc
index e663469..34be2aa 100644
--- a/test/comp_mask_variance_test.cc
+++ b/test/comp_mask_variance_test.cc
@@ -200,10 +200,10 @@
         const uint8_t *mask =
             av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
 
-        aom_comp_mask_pred = aom_comp_mask_pred_c;  // ref
-        aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred1_, pred_,
-                                     w, h, subx, suby, ref_, MAX_SB_SIZE, mask,
-                                     w, inv, subpel_search);
+        // ref
+        aom_comp_mask_upsampled_pred_c(
+            NULL, NULL, 0, 0, NULL, comp_pred1_, pred_, w, h, subx, suby, ref_,
+            MAX_SB_SIZE, mask, w, inv, subpel_search);
 
         aom_comp_mask_pred = test_impl;  // test
         aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred2_, pred_,