Add CONVOLVE_POST_ROUNDING flag

When CONVOLVE_POST_ROUNDING is turned on, the FILTER_BITS rounding in
compound inter prediction mode is moved to after the summation of the
two predictions.

Note that post rounding is only applied to non-sub8x8 blocks.

       PSNR     BDRate
lowres -0.808%  -0.673%

Change-Id: Ib91304e6122c24d832a582ab9f5757d33eac876c
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 58c6fd3..dbbd40b 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -53,7 +53,8 @@
           tmp = ROUND_POWER_OF_TWO(tmp + sum, 1);
         else
           tmp = sum;
-        conv_params->dst[y * conv_params->dst_stride + x] = tmp;
+        conv_params->dst[y * conv_params->dst_stride + x] =
+            clamp(tmp, INT32_MIN, INT32_MAX);
       }
 
       x_q4 += x_step_q4;
@@ -95,7 +96,8 @@
           tmp = ROUND_POWER_OF_TWO(tmp + sum, 1);
         else
           tmp = sum;
-        conv_params->dst[y * conv_params->dst_stride + x] = tmp;
+        conv_params->dst[y * conv_params->dst_stride + x] =
+            clamp(tmp, INT32_MIN, INT32_MAX);
       }
 
       y_q4 += y_step_q4;
@@ -129,9 +131,9 @@
           dst[c] = clip_pixel(ROUND_POWER_OF_TWO(dst[c] + src[c], 1));
         } else {
           int tmp = conv_params->dst[r * conv_params->dst_stride + c];
-          tmp =
-              ROUND_POWER_OF_TWO(tmp + (((uint16_t)src[c]) << FILTER_BITS), 1);
-          conv_params->dst[r * conv_params->dst_stride + c] = tmp;
+          tmp = ROUND_POWER_OF_TWO(tmp + (((int32_t)src[c]) << FILTER_BITS), 1);
+          conv_params->dst[r * conv_params->dst_stride + c] =
+              clamp(tmp, INT32_MIN, INT32_MAX);
         }
       }
       src += src_stride;
@@ -195,6 +197,19 @@
   }
 }
 
+#if CONVOLVE_POST_ROUNDING
+void av1_convolve_rounding(const int32_t *src, int src_stride, uint8_t *dst,
+                           int dst_stride, int w, int h) {
+  int r, c;
+  for (r = 0; r < h; ++r) {
+    for (c = 0; c < w; ++c) {
+      dst[r * dst_stride + c] =
+          clip_pixel(ROUND_POWER_OF_TWO(src[r * src_stride + c], FILTER_BITS));
+    }
+  }
+}
+#endif  // CONVOLVE_POST_ROUNDING
+
 void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
                   int dst_stride, int w, int h,
 #if CONFIG_DUAL_FILTER
diff --git a/av1/common/convolve.h b/av1/common/convolve.h
index e62bcea..0f1dd9c 100644
--- a/av1/common/convolve.h
+++ b/av1/common/convolve.h
@@ -17,6 +17,10 @@
 extern "C" {
 #endif
 
+#if CONFIG_DUAL_FILTER
+#define CONVOLVE_POST_ROUNDING 1
+#endif
+
 typedef enum CONVOLVE_OPT {
   // indicate the results in dst buf is rounded by FILTER_BITS or not
   CONVOLVE_OPT_ROUND,
@@ -26,7 +30,7 @@
 typedef struct ConvolveParams {
   int ref;
   CONVOLVE_OPT round;
-  uint16_t *dst;
+  int32_t *dst;
   int dst_stride;
 } ConvolveParams;
 
@@ -37,6 +41,21 @@
   return conv_params;
 }
 
+#if CONVOLVE_POST_ROUNDING
+static INLINE ConvolveParams get_conv_params_no_round(int ref, int32_t *dst,
+                                                      int dst_stride) {
+  ConvolveParams conv_params;
+  conv_params.ref = ref;
+  conv_params.round = CONVOLVE_OPT_NO_ROUND;
+  conv_params.dst = dst;
+  conv_params.dst_stride = dst_stride;
+  return conv_params;
+}
+
+void av1_convolve_rounding(const int32_t *src, int src_stride, uint8_t *dst,
+                           int dst_stride, int w, int h);
+#endif  // CONVOLVE_POST_ROUNDING
+
 void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
                   int dst_stride, int w, int h,
 #if CONFIG_DUAL_FILTER
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index eaff6c8..be4e886 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -866,6 +866,9 @@
     uint8_t *pre[2];
     MV32 scaled_mv[2];
     SubpelParams subpel_params[2];
+#if CONVOLVE_POST_ROUNDING
+    int32_t tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE];
+#endif  // CONVOLVE_POST_ROUNDING
 
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
@@ -916,8 +919,13 @@
 
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
-      ConvolveParams conv_params = get_conv_params(ref);
       struct buf_2d *const pre_buf = &pd->pre[ref];
+#if CONVOLVE_POST_ROUNDING
+      ConvolveParams conv_params =
+          get_conv_params_no_round(ref, tmp_dst, MAX_SB_SIZE);
+#else
+      ConvolveParams conv_params = get_conv_params(ref);
+#endif  // CONVOLVE_POST_ROUNDING
 #if CONFIG_EXT_INTER
       if (ref &&
           is_masked_compound_type(mi->mbmi.interinter_compound_data.type))
@@ -955,6 +963,11 @@
             &conv_params, mi->mbmi.interp_filter, subpel_params[ref].xs,
             subpel_params[ref].ys, xd);
     }
+
+#if CONVOLVE_POST_ROUNDING
+    // TODO(angiebird): This part needs optimization
+    av1_convolve_rounding(tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride, w, h);
+#endif  // CONVOLVE_POST_ROUNDING
   }
 }
 
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index 19c8032..a5d5422 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -44,13 +44,15 @@
 
 #if CONFIG_DUAL_FILTER
   if (interp_filter_params_x.taps == SUBPEL_TAPS &&
-      interp_filter_params_y.taps == SUBPEL_TAPS && w > 2 && h > 2) {
+      interp_filter_params_y.taps == SUBPEL_TAPS && w > 2 && h > 2 &&
+      conv_params->round == CONVOLVE_OPT_ROUND) {
     const int16_t *kernel_x =
         av1_get_interp_filter_subpel_kernel(interp_filter_params_x, subpel_x);
     const int16_t *kernel_y =
         av1_get_interp_filter_subpel_kernel(interp_filter_params_y, subpel_y);
 #else
-  if (interp_filter_params.taps == SUBPEL_TAPS && w > 2 && h > 2) {
+  if (interp_filter_params.taps == SUBPEL_TAPS && w > 2 && h > 2 &&
+      conv_params->round == CONVOLVE_OPT_ROUND) {
     const int16_t *kernel_x =
         av1_get_interp_filter_subpel_kernel(interp_filter_params, subpel_x);
     const int16_t *kernel_y =