Reduce precision bits between shears

Change-Id: I89e981c9396c7a1ba8051d65036a16692da94d0d
diff --git a/av1/common/mv.h b/av1/common/mv.h
index 0b7a151..a22a5c8 100644
--- a/av1/common/mv.h
+++ b/av1/common/mv.h
@@ -53,6 +53,11 @@
 // Precision of filter taps
 #define WARPEDPIXEL_FILTER_BITS 7
 
+// Precision bits reduction after horizontal shear
+#define HORSHEAR_REDUCE_PREC_BITS 5
+#define VERSHEAR_REDUCE_PREC_BITS \
+  (2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)
+
 #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
 
 /* clang-format off */
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 0ad142c..1aecd56 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -916,6 +916,7 @@
               else
                 sum += ref[iy * stride + ix + m - 3] * coeffs[m];
             }
+            sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
             tmp[(k + 7) * 8 + (l + 4)] = sum;
           }
         }
@@ -933,8 +934,7 @@
               sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
             }
             sum = clip_pixel_highbd(
-                ROUND_POWER_OF_TWO_SIGNED(sum, 2 * WARPEDPIXEL_FILTER_BITS),
-                bd);
+                ROUND_POWER_OF_TWO_SIGNED(sum, VERSHEAR_REDUCE_PREC_BITS), bd);
             if (ref_frm)
               *p = ROUND_POWER_OF_TWO_SIGNED(*p + sum, 1);
             else
@@ -1116,7 +1116,8 @@
           // (once border extension is taken into account)
           for (l = 0; l < 8; ++l) {
             tmp[(k + 7) * 8 + l] =
-                ref[iy * stride] * (1 << WARPEDPIXEL_FILTER_BITS);
+                ref[iy * stride] *
+                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
           }
         } else if (ix4 >= width + 6) {
           // In this case, the leftmost pixel sampled is in column
@@ -1125,7 +1126,8 @@
           // (once border extension is taken into account)
           for (l = 0; l < 8; ++l) {
             tmp[(k + 7) * 8 + l] =
-                ref[iy * stride + (width - 1)] * (1 << WARPEDPIXEL_FILTER_BITS);
+                ref[iy * stride + (width - 1)] *
+                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
           }
         } else {
           // If we get here, then
@@ -1148,6 +1150,7 @@
             for (m = 0; m < 8; ++m) {
               sum += ref[iy * stride + ix + m] * coeffs[m];
             }
+            sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
             tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
             sx += alpha;
           }
@@ -1169,8 +1172,7 @@
           for (m = 0; m < 8; ++m) {
             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
           }
-          sum =
-              clip_pixel(ROUND_POWER_OF_TWO(sum, 2 * WARPEDPIXEL_FILTER_BITS));
+          sum = clip_pixel(ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS));
           if (ref_frm)
             *p = ROUND_POWER_OF_TWO(*p + sum, 1);
           else
@@ -1199,7 +1201,6 @@
     const int32_t gamma = wm->gamma;
     const int32_t delta = wm->delta;
 
-    // printf("%d %d %d %d\n", mat[2], mat[3], mat[4], mat[5]);
     av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
                     p_width, p_height, p_stride, subsampling_x, subsampling_y,
                     ref_frm, alpha, beta, gamma, delta);
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
index 42c25c9..d7d2a1c 100644
--- a/av1/common/x86/warp_plane_sse2.c
+++ b/av1/common/x86/warp_plane_sse2.c
@@ -79,11 +79,13 @@
         // would be taken from the leftmost/rightmost column, then we can
         // skip the expensive horizontal filter.
         if (ix4 <= -7) {
-          tmp[k + 7] =
-              _mm_set1_epi16(ref[iy * stride] * (1 << WARPEDPIXEL_FILTER_BITS));
+          tmp[k + 7] = _mm_set1_epi16(
+              ref[iy * stride] *
+              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
         } else if (ix4 >= width + 6) {
-          tmp[k + 7] = _mm_set1_epi16(ref[iy * stride + (width - 1)] *
-                                      (1 << WARPEDPIXEL_FILTER_BITS));
+          tmp[k + 7] = _mm_set1_epi16(
+              ref[iy * stride + (width - 1)] *
+              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
         } else {
           int sx = sx4 + alpha * (-4) + beta * k +
                    // Include rounding and offset here
@@ -119,6 +121,9 @@
           // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
           __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
 
+          __m128i round_const =
+              _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+
           // Calculate filtered results
           __m128i src_0 = _mm_unpacklo_epi8(src, zero);
           __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
@@ -131,6 +136,8 @@
 
           __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                            _mm_add_epi32(res_2, res_6));
+          res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+                                    HORSHEAR_REDUCE_PREC_BITS);
 
           // Filter odd-index pixels
           __m128i tmp_1 = filter[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS];
@@ -159,6 +166,8 @@
 
           __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                           _mm_add_epi32(res_3, res_7));
+          res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+                                   HORSHEAR_REDUCE_PREC_BITS);
 
           // Combine results into one register.
           // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
@@ -240,12 +249,12 @@
 
         // Round and pack into 8 bits
         __m128i round_const =
-            _mm_set1_epi32((1 << (2 * WARPEDPIXEL_FILTER_BITS)) >> 1);
+            _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
 
         __m128i res_lo_round = _mm_srai_epi32(
-            _mm_add_epi32(res_lo, round_const), 2 * WARPEDPIXEL_FILTER_BITS);
+            _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
         __m128i res_hi_round = _mm_srai_epi32(
-            _mm_add_epi32(res_hi, round_const), 2 * WARPEDPIXEL_FILTER_BITS);
+            _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
 
         __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
         __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);