Apply the interpolation filter with fewer taps first

This changes BD-rate by 0.003% on the lowres dataset.
Change-Id: Ie62a5ea07bdcfe0a62f37e8f981382df4cc59918
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 975c759..bb0f4d3 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -146,48 +146,85 @@
     av1_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
                       subpel_y_q4, y_step_q4, ref_idx);
   } else {
-    // temp's size is set to (maximum possible intermediate_height) *
-    // MAX_BLOCK_WIDTH
-    uint8_t temp[((((MAX_BLOCK_HEIGHT - 1) * MAX_STEP + 15) >> SUBPEL_BITS) +
+    // temp's size is set to (maximum possible intermediate height or width) *
+    // MAX_SB_SIZE
+    uint8_t temp[((((MAX_SB_SIZE - 1) * MAX_STEP + 15) >> SUBPEL_BITS) +
                   MAX_FILTER_TAP) *
-                 MAX_BLOCK_WIDTH];
-    int temp_stride = MAX_BLOCK_WIDTH;
+                 MAX_SB_SIZE];
+    int filter_size;
+    InterpFilterParams filter_params;
 #if CONFIG_DUAL_FILTER
     InterpFilterParams filter_params_x =
         av1_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
     InterpFilterParams filter_params_y =
         av1_get_interp_filter_params(interp_filter[0 + 2 * ref_idx]);
-    InterpFilterParams filter_params = filter_params_x;
-
-    // The filter size implies the required number of reference pixels for
-    // the second stage filtering. It is possible that the two directions
-    // require different filter sizes.
-    int filter_size = filter_params_y.taps;
-#else
-    InterpFilterParams filter_params =
-        av1_get_interp_filter_params(interp_filter);
-    int filter_size = filter_params.taps;
 #endif
-    int intermediate_height =
-        (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
-
-    assert(filter_params.taps <= MAX_FILTER_TAP);
-
-    av1_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride,
-                       temp, temp_stride, w, intermediate_height, filter_params,
-                       subpel_x_q4, x_step_q4, 0);
 
 #if CONFIG_DUAL_FILTER
-    filter_params = filter_params_y;
+    // Apply the filter with fewer taps first to reduce hardware implementation
+    // complexity.
+    if (filter_params_y.taps < filter_params_x.taps) {
+      int intermediate_width;
+      int temp_stride;
+#if CONFIG_DUAL_FILTER
+      filter_params = filter_params_y;
+      filter_size = filter_params_x.taps;
 #else
-    filter_params = av1_get_interp_filter_params(interp_filter);
+      filter_params = av1_get_interp_filter_params(interp_filter);
+      filter_size = filter_params.taps;
 #endif
-    filter_size = filter_params.taps;
-    assert(filter_params.taps <= MAX_FILTER_TAP);
+      intermediate_width =
+          (((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + filter_size;
+      temp_stride = intermediate_width;
 
-    av1_convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride,
-                      dst, dst_stride, w, h, filter_params, subpel_y_q4,
-                      y_step_q4, ref_idx);
+      assert(filter_params.taps <= MAX_FILTER_TAP);
+
+      av1_convolve_vert(src - (filter_size / 2 - 1), src_stride, temp,
+                        temp_stride, intermediate_width, h, filter_params,
+                        subpel_y_q4, y_step_q4, 0);
+
+#if CONFIG_DUAL_FILTER
+      filter_params = filter_params_x;
+#else
+      filter_params = av1_get_interp_filter_params(interp_filter);
+#endif
+      assert(filter_params.taps <= MAX_FILTER_TAP);
+
+      av1_convolve_horiz(temp + (filter_size / 2 - 1), temp_stride, dst,
+                         dst_stride, w, h, filter_params, subpel_x_q4,
+                         x_step_q4, ref_idx);
+    } else
+#endif
+    {
+      int intermediate_height;
+      int temp_stride = w;
+#if CONFIG_DUAL_FILTER
+      filter_params = filter_params_x;
+      filter_size = filter_params_y.taps;
+#else
+      filter_params = av1_get_interp_filter_params(interp_filter);
+      filter_size = filter_params.taps;
+#endif
+      intermediate_height =
+          (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
+
+      assert(filter_params.taps <= MAX_FILTER_TAP);
+
+      av1_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride,
+                         temp, temp_stride, w, intermediate_height,
+                         filter_params, subpel_x_q4, x_step_q4, 0);
+
+#if CONFIG_DUAL_FILTER
+      filter_params = filter_params_y;
+#else
+      filter_params = av1_get_interp_filter_params(interp_filter);
+#endif
+      assert(filter_params.taps <= MAX_FILTER_TAP);
+
+      av1_convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride,
+                        dst, dst_stride, w, h, filter_params, subpel_y_q4,
+                        y_step_q4, ref_idx);
+    }
   }
 }
 
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index fda5b80..2350182 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -137,6 +137,68 @@
   }
 }
 
+#if CONFIG_EXT_INTERP && CONFIG_DUAL_FILTER
+TEST(AV1ConvolveTest, av1_convolve_vert_first) {  // vert-then-horiz reference
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  InterpFilter interp_filter[4] = { EIGHTTAP_REGULAR, MULTITAP_SHARP,
+                                    EIGHTTAP_REGULAR, MULTITAP_SHARP };
+  InterpFilterParams filter_params_x =
+      av1_get_interp_filter_params(interp_filter[1]);  // x: horizontal taps
+  InterpFilterParams filter_params_y =
+      av1_get_interp_filter_params(interp_filter[0]);  // y: vertical taps
+  int filter_size_x = filter_params_x.taps;
+  int filter_size_y = filter_params_y.taps;
+  int filter_center_x = filter_size_x / 2 - 1;  // samples before center tap
+  int filter_center_y = filter_size_y / 2 - 1;  // rows before center tap
+  uint8_t src[12 * 12];  // fits a 12x12 tap window (see ASSERT_LE below)
+  int src_stride = filter_size_x;  // rows are filter_size_x samples apart
+  uint8_t dst[1] = { 0 };  // single output pixel (w = h = 1)
+  int dst_stride = 1;
+  int x_step_q4 = 16;  // presumably a unit step in 1/16 pel -- TODO confirm
+  int y_step_q4 = 16;
+  int avg = 0;
+  int w = 1;
+  int h = 1;
+
+  int subpel_x_q4;
+  int subpel_y_q4;
+
+  ASSERT_LE(filter_size_x, 12);  // src[] and temp[] assume at most 12 taps
+  ASSERT_LE(filter_size_y, 12);
+  setup_convolve();
+
+  for (int i = 0; i < static_cast<int>(sizeof(src) / sizeof(src[0])); i++) {
+    src[i] = rnd.Rand16() % (1 << 8);  // random 8-bit sample
+  }
+
+  for (subpel_x_q4 = 1; subpel_x_q4 < 2; subpel_x_q4++) {  // only offset 1
+    for (subpel_y_q4 = 1; subpel_y_q4 < 2; subpel_y_q4++) {
+      av1_convolve(src + src_stride * filter_center_y + filter_center_x,
+                   src_stride, dst, dst_stride, w, h, interp_filter,
+                   subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, avg);
+
+      const int16_t *x_filter =
+          av1_get_interp_filter_subpel_kernel(filter_params_x, subpel_x_q4);
+      const int16_t *y_filter =
+          av1_get_interp_filter_subpel_kernel(filter_params_y, subpel_y_q4);
+
+      int temp[12];  // one vertically filtered value per column
+      int dst_ref = 0;
+      for (int c = 0; c < filter_size_x; c++) {
+        temp[c] = 0;
+        for (int r = 0; r < filter_size_y; r++) {
+          temp[c] += y_filter[r] * src[r * filter_size_x + c];  // vertical pass
+        }
+        temp[c] = clip_pixel(ROUND_POWER_OF_TWO(temp[c], FILTER_BITS));
+        dst_ref += temp[c] * x_filter[c];  // horizontal pass
+      }
+      dst_ref = clip_pixel(ROUND_POWER_OF_TWO(dst_ref, FILTER_BITS));
+      EXPECT_EQ(dst[0], dst_ref);  // library output must match the reference
+    }
+  }
+}
+#endif
+
 TEST(AV1ConvolveTest, av1_convolve_avg) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
 #if CONFIG_DUAL_FILTER