Stop using VP9 convolve scheme in AV1 encoder.

Discontinue all VP9-style convolve rounding operations in the
non-normative parts of the encoder.

The C implementation av1_convolve_2d_sr_c is used directly instead of
the SIMD versions of the same function, because the SIMD versions are
incompatible with round_1 > 0.

With the -DCONFIG_LOWPRECISION_BLEND=2 -DCONFIG_HIGHPRECISION_INTBUF=1
setting, the result on 15 frames of lowres (cpu-used=1) is a change of
-0.019% (i.e. slightly better).

Change-Id: I72154bd896357c352c944fb2cd3b25bafafba46a
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 2114451..a56eb94 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -453,19 +453,16 @@
 add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
 
 if (aom_config("CONFIG_JNT_COMP") eq "yes") {
-  if (aom_config("CONFIG_JNT_COMP") eq "yes") {
-    specialize qw/av1_warp_affine sse4_1/;
-  }
+  specialize qw/av1_warp_affine sse4_1/;
 } else {
   specialize qw/av1_warp_affine sse2 ssse3/;
 }
 
   add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
 
+
 if (aom_config("CONFIG_JNT_COMP") eq "yes") {
-  if (aom_config("CONFIG_JNT_COMP") eq "yes") {
-    specialize qw/av1_highbd_warp_affine sse4_1/;
-  }
+  specialize qw/av1_highbd_warp_affine sse4_1/;
 } else {
   specialize qw/av1_highbd_warp_affine ssse3/;
 }
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 95458c1..8bc6f6d 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -1159,9 +1159,6 @@
                                    const int subpel_y_q4, int y_step_q4,
                                    int scaled, ConvolveParams *conv_params,
                                    int bd) {
-  (void)dst;
-  (void)dst_stride;
-
   InterpFilterParams filter_params_x, filter_params_y;
 #if CONFIG_SHORT_FILTER
   av1_get_convolve_filter_params(interp_filters, &filter_params_x,
@@ -1172,71 +1169,101 @@
 #endif
 
   const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  if (filter_params_y.taps < filter_params_x.taps) {
-    uint16_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
-                    (MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
-    int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
-    CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE];
-    int tr_dst_stride = MAX_SB_SIZE;
-    int fo_vert = filter_params_y.taps / 2 - 1;
-    int fo_horiz = filter_params_x.taps / 2 - 1;
+  if (conv_params->dst) {
+    if (filter_params_y.taps < filter_params_x.taps) {
+      uint16_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
+                      (MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
+      int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
+      CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE];
+      int tr_dst_stride = MAX_SB_SIZE;
+      int fo_vert = filter_params_y.taps / 2 - 1;
+      int fo_horiz = filter_params_x.taps / 2 - 1;
 
-    transpose_uint16(
-        tr_src, tr_src_stride, src - fo_vert * src_stride - fo_horiz,
-        src_stride, w + filter_params_x.taps - 1, h + filter_params_y.taps - 1);
-    transpose_int32(tr_dst, tr_dst_stride, conv_params->dst,
-                    conv_params->dst_stride, w, h);
+      transpose_uint16(tr_src, tr_src_stride,
+                       src - fo_vert * src_stride - fo_horiz, src_stride,
+                       w + filter_params_x.taps - 1,
+                       h + filter_params_y.taps - 1);
+      transpose_int32(tr_dst, tr_dst_stride, conv_params->dst,
+                      conv_params->dst_stride, w, h);
 
-// horizontal and vertical parameters are swapped because of the transpose
+      // horizontal and vertical parameters are swapped because of the transpose
 #if CONFIG_JNT_COMP
-    if (scaled)
-      av1_highbd_convolve_2d_scale(
-          tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst,
-          tr_dst_stride, h, w, &filter_params_y, &filter_params_x, subpel_y_q4,
-          y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd);
-    else
-      av1_highbd_jnt_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
-                                 tr_src_stride, tr_dst, tr_dst_stride, h, w,
-                                 &filter_params_y, &filter_params_x,
-                                 subpel_y_q4, subpel_x_q4, conv_params, bd);
+      if (scaled)
+        av1_highbd_convolve_2d_scale(
+            tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst,
+            tr_dst_stride, h, w, &filter_params_y, &filter_params_x,
+            subpel_y_q4, y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd);
+      else
+        av1_highbd_jnt_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
+                                   tr_src_stride, tr_dst, tr_dst_stride, h, w,
+                                   &filter_params_y, &filter_params_x,
+                                   subpel_y_q4, subpel_x_q4, conv_params, bd);
 #else
-    if (scaled)
-      av1_highbd_convolve_2d_scale(
-          tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst,
-          tr_dst_stride, h, w, &filter_params_y, &filter_params_x, subpel_y_q4,
-          y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd);
-    else
-      av1_highbd_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
-                             tr_src_stride, tr_dst, tr_dst_stride, h, w,
-                             &filter_params_y, &filter_params_x, subpel_y_q4,
-                             subpel_x_q4, conv_params, bd);
+      if (scaled)
+        av1_highbd_convolve_2d_scale(
+            tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst,
+            tr_dst_stride, h, w, &filter_params_y, &filter_params_x,
+            subpel_y_q4, y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd);
+      else
+        av1_highbd_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
+                               tr_src_stride, tr_dst, tr_dst_stride, h, w,
+                               &filter_params_y, &filter_params_x, subpel_y_q4,
+                               subpel_x_q4, conv_params, bd);
 #endif  // CONFIG_JNT_COMP
-    transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst,
-                    tr_dst_stride, h, w);
+      transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst,
+                      tr_dst_stride, h, w);
+    } else {
+#if CONFIG_JNT_COMP
+      if (scaled)
+        av1_highbd_convolve_2d_scale(
+            src, src_stride, conv_params->dst, conv_params->dst_stride, w, h,
+            &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4,
+            subpel_y_q4, y_step_q4, conv_params, bd);
+      else
+        av1_highbd_jnt_convolve_2d(src, src_stride, conv_params->dst,
+                                   conv_params->dst_stride, w, h,
+                                   &filter_params_x, &filter_params_y,
+                                   subpel_x_q4, subpel_y_q4, conv_params, bd);
+#else
+      if (scaled)
+        av1_highbd_convolve_2d_scale(
+            src, src_stride, conv_params->dst, conv_params->dst_stride, w, h,
+            &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4,
+            subpel_y_q4, y_step_q4, conv_params, bd);
+      else
+        av1_highbd_convolve_2d(src, src_stride, conv_params->dst,
+                               conv_params->dst_stride, w, h, &filter_params_x,
+                               &filter_params_y, subpel_x_q4, subpel_y_q4,
+                               conv_params, bd);
+#endif  // CONFIG_JNT_COMP
+    }
   } else {
+    CONV_BUF_TYPE tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE];
+    int tmp_dst_stride = MAX_SB_SIZE;
 #if CONFIG_JNT_COMP
     if (scaled)
-      av1_highbd_convolve_2d_scale(
-          src, src_stride, conv_params->dst, conv_params->dst_stride, w, h,
-          &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4,
-          subpel_y_q4, y_step_q4, conv_params, bd);
+      av1_highbd_convolve_2d_scale(src, src_stride, tmp_dst, tmp_dst_stride, w,
+                                   h, &filter_params_x, &filter_params_y,
+                                   subpel_x_q4, x_step_q4, subpel_y_q4,
+                                   y_step_q4, conv_params, bd);
     else
-      av1_highbd_jnt_convolve_2d(src, src_stride, conv_params->dst,
-                                 conv_params->dst_stride, w, h,
+      av1_highbd_jnt_convolve_2d(src, src_stride, tmp_dst, tmp_dst_stride, w, h,
                                  &filter_params_x, &filter_params_y,
                                  subpel_x_q4, subpel_y_q4, conv_params, bd);
 #else
     if (scaled)
-      av1_highbd_convolve_2d_scale(
-          src, src_stride, conv_params->dst, conv_params->dst_stride, w, h,
-          &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4,
-          subpel_y_q4, y_step_q4, conv_params, bd);
+      av1_highbd_convolve_2d_scale(src, src_stride, tmp_dst, tmp_dst_stride, w,
+                                   h, &filter_params_x, &filter_params_y,
+                                   subpel_x_q4, x_step_q4, subpel_y_q4,
+                                   y_step_q4, conv_params, bd);
     else
-      av1_highbd_convolve_2d(src, src_stride, conv_params->dst,
-                             conv_params->dst_stride, w, h, &filter_params_x,
-                             &filter_params_y, subpel_x_q4, subpel_y_q4,
-                             conv_params, bd);
+      av1_highbd_convolve_2d(src, src_stride, tmp_dst, tmp_dst_stride, w, h,
+                             &filter_params_x, &filter_params_y, subpel_x_q4,
+                             subpel_y_q4, conv_params, bd);
 #endif  // CONFIG_JNT_COMP
+    // 0-bit rounding just to convert from int32 to uint16
+    av1_highbd_convolve_rounding(tmp_dst, tmp_dst_stride, dst, dst_stride, w, h,
+                                 0, bd);
   }
 }
 
diff --git a/av1/common/convolve.h b/av1/common/convolve.h
index b95a58f..7bbf220 100644
--- a/av1/common/convolve.h
+++ b/av1/common/convolve.h
@@ -60,27 +60,6 @@
                                   const int subpel_x_q4, const int subpel_y_q4,
                                   ConvolveParams *conv_params);
 
-static INLINE ConvolveParams get_conv_params(int ref, int do_average, int plane,
-                                             int bd) {
-  ConvolveParams conv_params;
-  conv_params.ref = ref;
-  conv_params.do_average = do_average;
-  conv_params.round = CONVOLVE_OPT_ROUND;
-  conv_params.plane = plane;
-  conv_params.do_post_rounding = 0;
-  conv_params.round_0 = ROUND0_BITS;
-  conv_params.round_1 = 0;
-  conv_params.is_compound = 0;
-  conv_params.dst = NULL;
-  conv_params.dst_stride = 0;
-  const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
-  if (bd < 12) assert(intbufrange <= 16);
-  if (intbufrange > 16) {
-    conv_params.round_0 += intbufrange - 16;
-  }
-  return conv_params;
-}
-
 static INLINE void av1_get_convolve_filter_params(InterpFilters interp_filters,
                                                   InterpFilterParams *params_x,
                                                   InterpFilterParams *params_y
@@ -107,6 +86,7 @@
 
 struct AV1Common;
 struct scale_factors;
+
 void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             InterpFilters interp_filters, const int subpel_x_q4,
@@ -114,6 +94,27 @@
                             int scaled, ConvolveParams *conv_params,
                             const struct scale_factors *sf);
 
+static INLINE ConvolveParams get_conv_params_round(int ref, int do_average,
+                                                   int plane, int bd) {
+  ConvolveParams conv_params;
+  conv_params.ref = ref;
+  conv_params.do_average = do_average;
+  conv_params.plane = plane;
+  conv_params.round = CONVOLVE_OPT_ROUND;
+  conv_params.round_0 = ROUND0_BITS;
+  conv_params.round_1 = 0;
+  conv_params.do_post_rounding = 0;
+  conv_params.is_compound = 0;
+  conv_params.dst = NULL;
+  conv_params.dst_stride = 0;
+  const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
+  if (bd < 12) assert(intbufrange <= 16);
+  if (intbufrange > 16) {
+    conv_params.round_0 += intbufrange - 16;
+  }
+  return conv_params;
+}
+
 static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average,
                                                       int plane, int32_t *dst,
                                                       int dst_stride,
@@ -125,7 +126,8 @@
   conv_params.is_compound = is_compound;
   conv_params.round_0 = ROUND0_BITS;
 #if CONFIG_LOWPRECISION_BLEND
-  conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS : 0;
+  conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS
+                                    : 2 * FILTER_BITS - conv_params.round_0;
 #else
   conv_params.round_1 = 0;
 #endif
@@ -145,6 +147,11 @@
   return conv_params;
 }
 
+static INLINE ConvolveParams get_conv_params(int ref, int do_average, int plane,
+                                             int bd) {
+  return get_conv_params_no_round(ref, do_average, plane, NULL, 0, 0, bd);
+}
+
 void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                    uint8_t *dst, int dst_stride, int w, int h,
                                    InterpFilters interp_filters,
diff --git a/av1/common/mv.h b/av1/common/mv.h
index 4c547fe..2f9011a 100644
--- a/av1/common/mv.h
+++ b/av1/common/mv.h
@@ -58,7 +58,7 @@
 #define WARP_PARAM_REDUCE_BITS 6
 
 // Precision bits reduction after horizontal shear
-#define HORSHEAR_REDUCE_PREC_BITS 5
+#define HORSHEAR_REDUCE_PREC_BITS 3
 #define VERSHEAR_REDUCE_PREC_BITS \
   (2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)
 
diff --git a/av1/common/scale.c b/av1/common/scale.c
index a334bae..b43de0e 100644
--- a/av1/common/scale.c
+++ b/av1/common/scale.c
@@ -185,7 +185,7 @@
   // subpel_y_q4 == 0
   sf->convolve[1][0][0] = av1_convolve_x_sr;
   // subpel_x_q4 != 0 && subpel_y_q4 != 0
-  sf->convolve[1][1][0] = av1_convolve_2d_sr;
+  sf->convolve[1][1][0] = av1_convolve_2d_sr_c;
 #if CONFIG_JNT_COMP
   // subpel_x_q4 == 0 && subpel_y_q4 == 0
   sf->convolve[0][0][1] = av1_jnt_convolve_2d_copy;
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 5daa2cf..71cd85a 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -422,19 +422,24 @@
                               ConvolveParams *conv_params, int16_t alpha,
                               int16_t beta, int16_t gamma, int16_t delta) {
   int32_t tmp[15 * 8];
-  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
-  const int reduce_bits_horiz =
+  const int use_conv_params =
+      (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
+  int reduce_bits_horiz =
       use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
+  if (!use_conv_params &&
+      bd + WARPEDPIXEL_FILTER_BITS + 2 - reduce_bits_horiz > 16)
+    reduce_bits_horiz += bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 14;
+  const int reduce_bits_vert =
+      use_conv_params ? conv_params->round_1
+                      : 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz;
   const int max_bits_horiz =
-      use_conv_params
-          ? bd + FILTER_BITS + 1 - conv_params->round_0
-          : bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
+      use_conv_params ? bd + FILTER_BITS + 1 - conv_params->round_0
+                      : bd + WARPEDPIXEL_FILTER_BITS + 1 - reduce_bits_horiz;
   const int offset_bits_horiz =
       use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
   const int offset_bits_vert =
-      use_conv_params
-          ? bd + 2 * FILTER_BITS - conv_params->round_0
-          : bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
+      use_conv_params ? bd + 2 * FILTER_BITS - conv_params->round_0
+                      : bd + 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz;
   if (use_conv_params) {
     conv_params->do_post_rounding = 1;
   }
@@ -534,7 +539,7 @@
           } else {
             uint16_t *p =
                 &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
-            sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
+            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
             assert(0 <= sum && sum < (1 << (bd + 2)));
             uint16_t px =
                 clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
@@ -719,9 +724,13 @@
                        int16_t gamma, int16_t delta) {
   int32_t tmp[15 * 8];
   const int bd = 8;
-  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
+  const int use_conv_params =
+      (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
   const int reduce_bits_horiz =
       use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
+  const int reduce_bits_vert =
+      use_conv_params ? conv_params->round_1
+                      : 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz;
   const int max_bits_horiz =
       use_conv_params
           ? bd + FILTER_BITS + 1 - conv_params->round_0
@@ -837,7 +846,7 @@
           } else {
             uint8_t *p =
                 &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
-            sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
+            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
             assert(0 <= sum && sum < (1 << (bd + 2)));
             uint8_t px = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
             if (conv_params->do_average)
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index 3692b60..2843a91 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -140,7 +140,7 @@
 void av1_convolve_rounding_avx2(const int32_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int w, int h,
                                 int bits) {
-  const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1)));
+  const __m256i rnd_num = _mm256_set1_epi32((int32_t)((1 << bits) >> 1));
   const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
 
   if (w > 64) {  // width = 128
@@ -283,7 +283,7 @@
                                        uint8_t *dst8, int dst_stride, int w,
                                        int h, int bits, int bd) {
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1)));
+  const __m256i rnd_num = _mm256_set1_epi32((int32_t)((1 << bits) >> 1));
   const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
 
   if (w > 64) {  // width = 128
diff --git a/av1/common/x86/highbd_warp_plane_sse4.c b/av1/common/x86/highbd_warp_plane_sse4.c
index 0cd438a..e89ad8b 100644
--- a/av1/common/x86/highbd_warp_plane_sse4.c
+++ b/av1/common/x86/highbd_warp_plane_sse4.c
@@ -22,15 +22,18 @@
                                    ConvolveParams *conv_params, int16_t alpha,
                                    int16_t beta, int16_t gamma, int16_t delta) {
   int comp_avg = conv_params->do_average;
-#if HORSHEAR_REDUCE_PREC_BITS >= 5
   __m128i tmp[15];
-#else
-#error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter"
-#endif
   int i, j, k;
-  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
-  const int reduce_bits_horiz =
+  const int use_conv_params =
+      (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
+  int reduce_bits_horiz =
       use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
+  if (!use_conv_params &&
+      bd + WARPEDPIXEL_FILTER_BITS + 2 - reduce_bits_horiz > 16)
+    reduce_bits_horiz += bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 14;
+  const int reduce_bits_vert =
+      use_conv_params ? conv_params->round_1
+                      : 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz;
   const int offset_bits_horiz =
       use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
   if (use_conv_params) {
@@ -91,10 +94,9 @@
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
-                     1)) +
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
               ref[iy * stride] *
-                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                  (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
         }
       } else if (ix4 >= width + 6) {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -104,10 +106,9 @@
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
-                     1)) +
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
               ref[iy * stride + (width - 1)] *
-                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                  (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
         }
       } else {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -361,13 +362,13 @@
         } else {
           // Round and pack into 8 bits
           const __m128i round_const =
-              _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
-                             ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
+              _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+                             ((1 << reduce_bits_vert) >> 1));
 
           const __m128i res_lo_round = _mm_srai_epi32(
-              _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
+              _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
           const __m128i res_hi_round = _mm_srai_epi32(
-              _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
+              _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
 
           __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
           // Clamp res_16bit to the range [0, 2^bd - 1]
diff --git a/av1/common/x86/highbd_warp_plane_ssse3.c b/av1/common/x86/highbd_warp_plane_ssse3.c
index dc727b6..e1d7f8e 100644
--- a/av1/common/x86/highbd_warp_plane_ssse3.c
+++ b/av1/common/x86/highbd_warp_plane_ssse3.c
@@ -22,21 +22,25 @@
                                   ConvolveParams *conv_params, int16_t alpha,
                                   int16_t beta, int16_t gamma, int16_t delta) {
   int comp_avg = conv_params->do_average;
-#if HORSHEAR_REDUCE_PREC_BITS >= 5
   __m128i tmp[15];
-#else
-#error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter"
-#endif
   int i, j, k;
-  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
-  const int reduce_bits_horiz =
+  const int use_conv_params =
+      (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
+  int reduce_bits_horiz =
       use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
+  if (!use_conv_params &&
+      bd + WARPEDPIXEL_FILTER_BITS + 2 - reduce_bits_horiz > 16)
+    reduce_bits_horiz += bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 14;
+  const int reduce_bits_vert =
+      use_conv_params ? conv_params->round_1
+                      : 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz;
   const int offset_bits_horiz =
       use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
   if (use_conv_params) {
     conv_params->do_post_rounding = 1;
   }
   assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
+  if (bd == 12 && reduce_bits_horiz < 5) printf("Error\n");
 
   /* Note: For this code to work, the left/right frame borders need to be
      extended by at least 13 pixels each. By the time we get here, other
@@ -85,10 +89,9 @@
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
-                     1)) +
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
               ref[iy * stride] *
-                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                  (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
         }
       } else if (ix4 >= width + 6) {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -98,10 +101,9 @@
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
-                     1)) +
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
               ref[iy * stride + (width - 1)] *
-                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                  (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
         }
       } else {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -320,13 +322,13 @@
         } else {
           // Round and pack into 8 bits
           const __m128i round_const =
-              _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
-                             ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
+              _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+                             ((1 << reduce_bits_vert) >> 1));
 
           const __m128i res_lo_round = _mm_srai_epi32(
-              _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
+              _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
           const __m128i res_hi_round = _mm_srai_epi32(
-              _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
+              _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
 
           __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
           // Clamp res_16bit to the range [0, 2^bd - 1]
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
index 75ed82b..d330cd3 100644
--- a/av1/common/x86/warp_plane_sse2.c
+++ b/av1/common/x86/warp_plane_sse2.c
@@ -24,7 +24,8 @@
   __m128i tmp[15];
   int i, j, k;
   const int bd = 8;
-  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
+  const int use_conv_params =
+      (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
   const int reduce_bits_horiz =
       use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
   const int offset_bits_horiz =
@@ -81,10 +82,9 @@
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
-                     1)) +
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
               ref[iy * stride] *
-                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                  (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
         }
       } else if (ix4 >= width + 6) {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -94,10 +94,9 @@
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
-                     1)) +
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
               ref[iy * stride + (width - 1)] *
-                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                  (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
         }
       } else {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
diff --git a/av1/common/x86/warp_plane_sse4.c b/av1/common/x86/warp_plane_sse4.c
index 2c97704..b421533 100644
--- a/av1/common/x86/warp_plane_sse4.c
+++ b/av1/common/x86/warp_plane_sse4.c
@@ -212,7 +212,8 @@
   __m128i tmp[15];
   int i, j, k;
   const int bd = 8;
-  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
+  const int use_conv_params =
+      (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
   const int reduce_bits_horiz =
       use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
   const int offset_bits_horiz =
@@ -275,10 +276,9 @@
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
-                     1)) +
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
               ref[iy * stride] *
-                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                  (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
         }
       } else if (ix4 >= width + 6) {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -288,10 +288,9 @@
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
-                     1)) +
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
               ref[iy * stride + (width - 1)] *
-                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                  (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
         }
       } else {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
diff --git a/av1/common/x86/warp_plane_ssse3.c b/av1/common/x86/warp_plane_ssse3.c
index b0501e9..f18dad1 100644
--- a/av1/common/x86/warp_plane_ssse3.c
+++ b/av1/common/x86/warp_plane_ssse3.c
@@ -211,7 +211,8 @@
   __m128i tmp[15];
   int i, j, k;
   const int bd = 8;
-  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
+  const int use_conv_params =
+      (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
   const int reduce_bits_horiz =
       use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
   const int offset_bits_horiz =
@@ -268,10 +269,9 @@
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
-                     1)) +
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
               ref[iy * stride] *
-                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                  (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
         }
       } else if (ix4 >= width + 6) {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -281,10 +281,9 @@
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
-                     1)) +
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
               ref[iy * stride + (width - 1)] *
-                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                  (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
         }
       } else {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
diff --git a/test/av1_convolve_optimz_test.cc b/test/av1_convolve_optimz_test.cc
index 288daeb..0900aa1 100644
--- a/test/av1_convolve_optimz_test.cc
+++ b/test/av1_convolve_optimz_test.cc
@@ -66,7 +66,7 @@
     subpel_ = GET_PARAM(4);
     int ref = GET_PARAM(5);
     const int plane = 0;
-    conv_params_ = get_conv_params(ref, ref, plane, 8);
+    conv_params_ = get_conv_params_round(ref, ref, plane, 8);
 
     alloc_ = new uint8_t[maxBlockSize * 4];
     src_ = alloc_ + (vertiOffset * maxWidth);
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index cb58289..397fc70 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -149,7 +149,7 @@
 
 TEST_P(Av1ConvolveTest, av1_convolve_vert) {
   const int y_step_q4 = 16;
-  ConvolveParams conv_params = get_conv_params(0, 0, 0, 8);
+  ConvolveParams conv_params = get_conv_params_round(0, 0, 0, 8);
 
   int in_stride, out_stride, ref_out_stride, avg_out_stride, ref_avg_out_stride;
   uint8_t *in = add_input(MAX_SB_SIZE, MAX_SB_SIZE, &in_stride);
@@ -202,7 +202,7 @@
 
 TEST_P(Av1ConvolveTest, av1_convolve_horiz) {
   const int x_step_q4 = 16;
-  ConvolveParams conv_params = get_conv_params(0, 0, 0, 8);
+  ConvolveParams conv_params = get_conv_params_round(0, 0, 0, 8);
 
   int in_stride, out_stride, ref_out_stride, avg_out_stride, ref_avg_out_stride;
   uint8_t *in = add_input(MAX_SB_SIZE, MAX_SB_SIZE, &in_stride);