Change warp filter to use one fewer precision bit
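
The horizontal shear stage now adds an offset of
(1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) to each filter sum, so that
the intermediate values are non-negative after rounding and can be
clamped with an unsigned saturation (saturate_uint) instead of
saturate_int16(). The vertical shear stage subtracts the accumulated
offset, (1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)), before its own
rounding, so the final output is unchanged. In the SSSE3 path, the
non-negative intermediates also let the horizontal reduction use a
single logical shift (_mm_srli_epi16) in place of the previous
overflow-avoiding average trick.

Illustrative sketch (not the library code) of the offset arithmetic,
assuming bd = 8, WARPEDPIXEL_FILTER_BITS = 7,
HORSHEAR_REDUCE_PREC_BITS = 5 and VERSHEAR_REDUCE_PREC_BITS =
2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS; the filter
taps and pixel values below are arbitrary, chosen only so the taps
sum to 1 << WARPEDPIXEL_FILTER_BITS:

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  #define BD 8
  #define FILTER_BITS 7
  #define HORSHEAR_BITS 5
  #define VERSHEAR_BITS (2 * FILTER_BITS - HORSHEAR_BITS)
  #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

  int main(void) {
    // 8-tap filter with negative side lobes; taps sum to 1 << FILTER_BITS.
    const int16_t coeffs[8] = { -2, 8, -20, 90, 70, -24, 8, -2 };
    const uint8_t px[8] = { 0, 255, 3, 200, 17, 255, 0, 128 };

    // Horizontal pass: old style (plain signed sum) vs. new style
    // (start from the offset so the accumulated value stays non-negative).
    int32_t old_h = 0;
    int32_t new_h = 1 << (BD + FILTER_BITS - 1);
    for (int m = 0; m < 8; ++m) {
      old_h += px[m] * coeffs[m];
      new_h += px[m] * coeffs[m];
    }
    old_h = ROUND_POWER_OF_TWO(old_h, HORSHEAR_BITS);
    new_h = ROUND_POWER_OF_TWO(new_h, HORSHEAR_BITS);

    // The offset version is non-negative and fits in
    // BD + FILTER_BITS - HORSHEAR_BITS + 1 = 11 bits, so it can be
    // stored and shifted as an unsigned value.
    assert(new_h >= 0 &&
           new_h < (1 << (BD + FILTER_BITS - HORSHEAR_BITS + 1)));

    // Vertical pass over 8 identical tmp values (for illustration).
    // The per-pixel offset, scaled by the vertical filter gain
    // (1 << FILTER_BITS), equals 1 << (BD + VERSHEAR_BITS - 1) and is
    // subtracted up front, so it cancels exactly.
    int32_t old_v = 0;
    int32_t new_v = -(1 << (BD + VERSHEAR_BITS - 1));
    for (int m = 0; m < 8; ++m) {
      old_v += old_h * coeffs[m];
      new_v += new_h * coeffs[m];
    }
    old_v = ROUND_POWER_OF_TWO(old_v, VERSHEAR_BITS);
    new_v = ROUND_POWER_OF_TWO(new_v, VERSHEAR_BITS);

    printf("old=%d new=%d\n", old_v, new_v);  // identical results
    assert(old_v == new_v);
    return 0;
  }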

Change-Id: Idc7bb686f5751b0457c9f21daac0fa6f4865fd22
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index fb28b04..be3ad49 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -701,12 +701,8 @@
   8240,  8224,  8208,  8192,
 };
 
-static INLINE int16_t saturate_int16(int32_t v) {
-  if (v > 32767)
-    return 32767;
-  else if (v < -32768)
-    return -32768;
-  return v;
+static INLINE uint16_t saturate_uint(int32_t v, int bits) {
+  return (uint16_t)clamp(v, 0, (1 << bits) - 1);
 }
 
 #if CONFIG_WARPED_MOTION
@@ -1028,14 +1024,18 @@
         if (ix4 <= -7) {
           for (l = 0; l < 8; ++l) {
             tmp[(k + 7) * 8 + l] =
-                ref[iy * stride] *
-                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+                (1 << (bd + WARPEDPIXEL_FILTER_BITS -
+                       HORSHEAR_REDUCE_PREC_BITS - 1)) +
+                ref[iy * stride] * (1 << (WARPEDPIXEL_FILTER_BITS -
+                                          HORSHEAR_REDUCE_PREC_BITS));
           }
         } else if (ix4 >= width + 6) {
           for (l = 0; l < 8; ++l) {
-            tmp[(k + 7) * 8 + l] =
-                ref[iy * stride + (width - 1)] *
-                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+            tmp[(k + 7) * 8 + l] = (1 << (bd + WARPEDPIXEL_FILTER_BITS -
+                                          HORSHEAR_REDUCE_PREC_BITS - 1)) +
+                                   ref[iy * stride + (width - 1)] *
+                                       (1 << (WARPEDPIXEL_FILTER_BITS -
+                                              HORSHEAR_REDUCE_PREC_BITS));
           }
         } else {
           int sx = sx4 + beta * (k + 4);
@@ -1045,14 +1045,16 @@
             const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                              WARPEDPIXEL_PREC_SHIFTS;
             const int16_t *coeffs = warped_filter[offs];
-            int32_t sum = 0;
+            int32_t sum = 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1);
             // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
             for (m = 0; m < 8; ++m) {
               sum += ref[iy * stride + ix + m] * coeffs[m];
             }
             sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
 #if HORSHEAR_REDUCE_PREC_BITS >= 5
-            tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+            tmp[(k + 7) * 8 + (l + 4)] =
+                saturate_uint(sum, bd + WARPEDPIXEL_FILTER_BITS -
+                                       HORSHEAR_REDUCE_PREC_BITS + 1);
 #else
             tmp[(k + 7) * 8 + (l + 4)] = sum;
 #endif
@@ -1070,7 +1072,7 @@
           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           const int16_t *coeffs = warped_filter[offs];
-          int32_t sum = 0;
+          int32_t sum = -(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1));
           // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
           for (m = 0; m < 8; ++m) {
             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
@@ -1232,6 +1234,7 @@
                        int16_t delta) {
   int16_t tmp[15 * 8];
   int i, j, k, l, m;
+  const int bd = 8;
 
   /* Note: For this code to work, the left/right frame borders need to be
      extended by at least 13 pixels each. By the time we get here, other
@@ -1288,8 +1291,10 @@
           // (once border extension is taken into account)
           for (l = 0; l < 8; ++l) {
             tmp[(k + 7) * 8 + l] =
-                ref[iy * stride] *
-                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+                (1 << (bd + WARPEDPIXEL_FILTER_BITS -
+                       HORSHEAR_REDUCE_PREC_BITS - 1)) +
+                ref[iy * stride] * (1 << (WARPEDPIXEL_FILTER_BITS -
+                                          HORSHEAR_REDUCE_PREC_BITS));
           }
         } else if (ix4 >= width + 6) {
           // In this case, the leftmost pixel sampled is in column
@@ -1297,9 +1302,11 @@
           // will sample only from the rightmost column
           // (once border extension is taken into account)
           for (l = 0; l < 8; ++l) {
-            tmp[(k + 7) * 8 + l] =
-                ref[iy * stride + (width - 1)] *
-                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+            tmp[(k + 7) * 8 + l] = (1 << (bd + WARPEDPIXEL_FILTER_BITS -
+                                          HORSHEAR_REDUCE_PREC_BITS - 1)) +
+                                   ref[iy * stride + (width - 1)] *
+                                       (1 << (WARPEDPIXEL_FILTER_BITS -
+                                              HORSHEAR_REDUCE_PREC_BITS));
           }
         } else {
           // If we get here, then
@@ -1317,13 +1324,15 @@
             const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                              WARPEDPIXEL_PREC_SHIFTS;
             const int16_t *coeffs = warped_filter[offs];
-            int32_t sum = 0;
+            int32_t sum = 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1);
             // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
             for (m = 0; m < 8; ++m) {
               sum += ref[iy * stride + ix + m] * coeffs[m];
             }
             sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
-            tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+            tmp[(k + 7) * 8 + (l + 4)] =
+                saturate_uint(sum, bd + WARPEDPIXEL_FILTER_BITS -
+                                       HORSHEAR_REDUCE_PREC_BITS + 1);
             sx += alpha;
           }
         }
@@ -1339,7 +1348,7 @@
           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           const int16_t *coeffs = warped_filter[offs];
-          int32_t sum = 0;
+          int32_t sum = -(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1));
           // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
           for (m = 0; m < 8; ++m) {
             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
diff --git a/av1/common/x86/highbd_warp_plane_ssse3.c b/av1/common/x86/highbd_warp_plane_ssse3.c
index 51f67f7..eac7caf 100644
--- a/av1/common/x86/highbd_warp_plane_ssse3.c
+++ b/av1/common/x86/highbd_warp_plane_ssse3.c
@@ -89,8 +89,10 @@
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                     1)) +
               ref[iy * stride] *
-              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
         }
       } else if (ix4 >= width + 6) {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -100,8 +102,10 @@
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                     1)) +
               ref[iy * stride + (width - 1)] *
-              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
         }
       } else {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -151,7 +155,8 @@
           const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
 
           const __m128i round_const =
-              _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+              _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+                             ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
 
           // Calculate filtered results
           const __m128i res_0 = _mm_madd_epi16(src, coeff_0);
@@ -299,7 +304,8 @@
 
         // Round and pack into 8 bits
         const __m128i round_const =
-            _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+            _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+                           ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
 
         const __m128i res_lo_round = _mm_srai_epi32(
             _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
index 09e72da..674a77f 100644
--- a/av1/common/x86/warp_plane_sse2.c
+++ b/av1/common/x86/warp_plane_sse2.c
@@ -23,6 +23,7 @@
                           int16_t delta) {
   __m128i tmp[15];
   int i, j, k;
+  const int bd = 8;
 
   /* Note: For this code to work, the left/right frame borders need to be
      extended by at least 13 pixels each. By the time we get here, other
@@ -84,8 +85,10 @@
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                     1)) +
               ref[iy * stride] *
-              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
         }
       } else if (ix4 >= width + 6) {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -95,8 +98,10 @@
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                     1)) +
               ref[iy * stride + (width - 1)] *
-              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
         }
       } else {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -145,7 +150,8 @@
           const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
 
           const __m128i round_const =
-              _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+              _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+                             ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
 
           // Calculate filtered results
           const __m128i src_0 = _mm_unpacklo_epi8(src, zero);
@@ -294,7 +300,8 @@
 
         // Round and pack into 8 bits
         const __m128i round_const =
-            _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+            _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+                           ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
 
         const __m128i res_lo_round = _mm_srai_epi32(
             _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
diff --git a/av1/common/x86/warp_plane_ssse3.c b/av1/common/x86/warp_plane_ssse3.c
index 37f7c4c..39a1f71 100644
--- a/av1/common/x86/warp_plane_ssse3.c
+++ b/av1/common/x86/warp_plane_ssse3.c
@@ -210,6 +210,7 @@
                            int16_t delta) {
   __m128i tmp[15];
   int i, j, k;
+  const int bd = 8;
 
   /* Note: For this code to work, the left/right frame borders need to be
      extended by at least 13 pixels each. By the time we get here, other
@@ -271,8 +272,10 @@
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                     1)) +
               ref[iy * stride] *
-              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
         }
       } else if (ix4 >= width + 6) {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -282,8 +285,10 @@
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                     1)) +
               ref[iy * stride + (width - 1)] *
-              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
         }
       } else {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -365,7 +370,8 @@
           const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
 
           const __m128i round_const =
-              _mm_set1_epi16((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+              _mm_set1_epi16((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+                             ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
 
           // Note: res_02 + res_46 and res_13 + res_57 are always in the range
           // [-6120, 32640]. This gives us enough room to add the rounding
@@ -374,12 +380,8 @@
               _mm_add_epi16(_mm_add_epi16(res_02, res_46), round_const);
           const __m128i res_b = _mm_add_epi16(res_13, res_57);
 
-          // Calculate (res_a + res_b) >> 1 while avoiding overflow
-          const __m128i t1 = _mm_and_si128(res_a, res_b);
-          const __m128i t2 = _mm_srai_epi16(_mm_xor_si128(res_a, res_b), 1);
-
-          const __m128i res = _mm_srai_epi16(_mm_add_epi16(t1, t2),
-                                             HORSHEAR_REDUCE_PREC_BITS - 1);
+          const __m128i res = _mm_srli_epi16(_mm_add_epi16(res_a, res_b),
+                                             HORSHEAR_REDUCE_PREC_BITS);
           tmp[k + 7] = res;
         }
       }
@@ -471,7 +473,8 @@
 
         // Round and pack into 8 bits
         const __m128i round_const =
-            _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+            _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+                           ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
 
         const __m128i res_lo_round = _mm_srai_epi32(
             _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);