Add av1_convolve_2d_facade

When convolve_round is on, av1_convolve_2d_facade will be used for
interpolation rather than av1_convolve. The convolve_round experiment
code will be removed from av1_convolve in another CL.

So far we use 4-bit rounding in the intermediate stage on top of using
post rounding for compound mode after the last stage.

This will give us roughly a 0.45% gain on lowres, 0.39% on midres and
roughly 0.6-0.7% on hdres.
Altogether, the gain is 1.15% on lowres, 0.74% on midres and roughly
1.7-1.8% on hdres.

Note that there is no restriction on the usage of the 12-tap filter in
this CL. Adding that restriction, we would lose roughly 0.1% again on
lowres.

Change-Id: I6332e1d888e28a3b3ddc29711817d66e52cb5cdf
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index a1a266b..97ce6ba 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -199,15 +199,79 @@
 
 #if CONFIG_CONVOLVE_ROUND
 void av1_convolve_rounding(const int32_t *src, int src_stride, uint8_t *dst,
-                           int dst_stride, int w, int h) {
+                           int dst_stride, int w, int h, int bits) {
   int r, c;
   for (r = 0; r < h; ++r) {
     for (c = 0; c < w; ++c) {
       dst[r * dst_stride + c] =
-          clip_pixel(ROUND_POWER_OF_TWO(src[r * src_stride + c], FILTER_BITS));
+          clip_pixel(ROUND_POWER_OF_TWO_SIGNED(src[r * src_stride + c], bits));
     }
   }
 }
+
+void av1_convolve_2d(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
+                     int dst_stride, int w, int h,
+                     InterpFilterParams *filter_params_x,
+                     InterpFilterParams *filter_params_y, const int subpel_x_q4,
+                     const int subpel_y_q4, ConvolveParams *conv_params) {
+  int x, y, k;
+  CONV_BUF_TYPE im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = w;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  (void)conv_params;
+
+  // horizontal filter
+  const uint8_t *src_horiz = src - fo_vert * src_stride;
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+  for (y = 0; y < im_h; ++y) {
+    for (x = 0; x < w; ++x) {
+      CONV_BUF_TYPE sum = 0;
+      for (k = 0; k < filter_params_x->taps; ++k) {
+        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+      }
+      im_block[y * im_stride + x] =
+          ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_0);
+    }
+  }
+
+  // vertical filter
+  CONV_BUF_TYPE *src_vert = im_block + fo_vert * im_stride;
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      CONV_BUF_TYPE sum = 0;
+      for (k = 0; k < filter_params_y->taps; ++k) {
+        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+      }
+      dst[y * dst_stride + x] +=
+          ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_1);
+    }
+  }
+}
+
+void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            const InterpFilter *interp_filter,
+                            const int subpel_x_q4, int x_step_q4,
+                            const int subpel_y_q4, int y_step_q4,
+                            ConvolveParams *conv_params) {
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)dst;
+  (void)dst_stride;
+  InterpFilterParams filter_params_x =
+      av1_get_interp_filter_params(interp_filter[1 + 2 * conv_params->ref]);
+  InterpFilterParams filter_params_y =
+      av1_get_interp_filter_params(interp_filter[0 + 2 * conv_params->ref]);
+  av1_convolve_2d(src, src_stride, conv_params->dst, conv_params->dst_stride, w,
+                  h, &filter_params_x, &filter_params_y, subpel_x_q4,
+                  subpel_y_q4, conv_params);
+}
+
 #endif  // CONFIG_CONVOLVE_ROUND
 
 void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
@@ -294,7 +358,6 @@
 
       filter_params = filter_params_x;
       assert(filter_params.taps <= MAX_FILTER_TAP);
-
       av1_convolve_horiz_facade(temp + (filter_size / 2 - 1), temp_stride, dst,
                                 dst_stride, w, h, filter_params, subpel_x_q4,
                                 x_step_q4, conv_params);
diff --git a/av1/common/convolve.h b/av1/common/convolve.h
index 349e9ac..88b413d 100644
--- a/av1/common/convolve.h
+++ b/av1/common/convolve.h
@@ -23,11 +23,15 @@
   CONVOLVE_OPT_NO_ROUND,
 } CONVOLVE_OPT;
 
+typedef int32_t CONV_BUF_TYPE;
+
 typedef struct ConvolveParams {
   int ref;
   CONVOLVE_OPT round;
-  int32_t *dst;
+  CONV_BUF_TYPE *dst;
   int dst_stride;
+  int round_0;
+  int round_1;
 } ConvolveParams;
 
 static INLINE ConvolveParams get_conv_params(int ref) {
@@ -38,18 +42,33 @@
 }
 
 #if CONFIG_CONVOLVE_ROUND
+void av1_convolve_2d(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
+                     int dst_stride, int w, int h,
+                     InterpFilterParams *filter_params_x,
+                     InterpFilterParams *filter_params_y, const int subpel_x_q4,
+                     const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            const InterpFilter *interp_filter,
+                            const int subpel_x_q4, int x_step_q4,
+                            const int subpel_y_q4, int y_step_q4,
+                            ConvolveParams *conv_params);
+
 static INLINE ConvolveParams get_conv_params_no_round(int ref, int32_t *dst,
                                                       int dst_stride) {
   ConvolveParams conv_params;
   conv_params.ref = ref;
   conv_params.round = CONVOLVE_OPT_NO_ROUND;
+  conv_params.round_0 = 5;
+  conv_params.round_1 = 1;
   conv_params.dst = dst;
   conv_params.dst_stride = dst_stride;
   return conv_params;
 }
 
 void av1_convolve_rounding(const int32_t *src, int src_stride, uint8_t *dst,
-                           int dst_stride, int w, int h);
+                           int dst_stride, int w, int h, int bits);
 #endif  // CONFIG_CONVOLVE_ROUND
 
 void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index 682d002..566a911 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -918,7 +918,8 @@
     MV32 scaled_mv[2];
     SubpelParams subpel_params[2];
 #if CONFIG_CONVOLVE_ROUND
-    int32_t tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE];
+    DECLARE_ALIGNED(16, int32_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
+    av1_zero(tmp_dst);
 #endif  // CONFIG_CONVOLVE_ROUND
 
     for (ref = 0; ref < 1 + is_compound; ++ref) {
@@ -968,15 +969,16 @@
                   (scaled_mv[ref].col >> SUBPEL_BITS);
     }
 
+#if CONFIG_CONVOLVE_ROUND
+    ConvolveParams conv_params =
+        get_conv_params_no_round(ref, tmp_dst, MAX_SB_SIZE);
+#else
+    ConvolveParams conv_params = get_conv_params(ref);
+#endif  // CONFIG_CONVOLVE_ROUND
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
       struct buf_2d *const pre_buf = &pd->pre[ref];
-#if CONFIG_CONVOLVE_ROUND
-      ConvolveParams conv_params =
-          get_conv_params_no_round(ref, tmp_dst, MAX_SB_SIZE);
-#else
-      ConvolveParams conv_params = get_conv_params(ref);
-#endif  // CONFIG_CONVOLVE_ROUND
+      conv_params.ref = ref;
 #if CONFIG_EXT_INTER
       if (ref &&
           is_masked_compound_type(mi->mbmi.interinter_compound_data.type))
@@ -1014,7 +1016,9 @@
 #if CONFIG_AOM_HIGHBITDEPTH
     if (!(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH))
 #endif  // CONFIG_AOM_HIGHBITDEPTH
-      av1_convolve_rounding(tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride, w, h);
+      av1_convolve_rounding(tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride, w, h,
+                            FILTER_BITS * 2 + is_compound -
+                                conv_params.round_0 - conv_params.round_1);
 #endif  // CONFIG_CONVOLVE_ROUND
   }
 }
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index 519a1e3..cc4c858 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -64,11 +64,18 @@
     sf->predict[subpel_x != 0][subpel_y != 0][conv_params->ref](
         src, src_stride, dst, dst_stride, kernel_x, xs, kernel_y, ys, w, h);
   } else {
-    // ref_idx > 0 means this is the second reference frame
-    // first reference frame's prediction result is already in dst
-    // therefore we need to average the first and second results
-    av1_convolve(src, src_stride, dst, dst_stride, w, h, interp_filter,
-                 subpel_x, xs, subpel_y, ys, conv_params);
+// ref_idx > 0 means this is the second reference frame
+// first reference frame's prediction result is already in dst
+// therefore we need to average the first and second results
+#if CONFIG_CONVOLVE_ROUND
+    if (conv_params->round == CONVOLVE_OPT_NO_ROUND)
+      av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+                             interp_filter, subpel_x, xs, subpel_y, ys,
+                             conv_params);
+    else
+#endif
+      av1_convolve(src, src_stride, dst, dst_stride, w, h, interp_filter,
+                   subpel_x, xs, subpel_y, ys, conv_params);
   }
 }