Revert "warp_affine_c: Refactor highbd and lowbd versions."
This reverts commit 8cd0e7efac95844556d98d4634755b47b203a3ed.
Reason for revert:
This change breaks av1_warp_affine_c when CONFIG_HIGHBITDEPTH is enabled.
In particular, running ./test_libaom --gtest_filter=*Warp* on a build configured with --enable-warped-motion --enable-highbitdepth produces several test failures, followed by a segmentation fault once it reaches SSE2/AV1WarpFilterTest.CheckOutput/4.
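For reference, the failure reproduces with the configure-based build (the exact invocation below is assumed from the flags named above):

  ./configure --enable-warped-motion --enable-highbitdepth
  make
  ./test_libaom --gtest_filter=*Warp*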
The tricky part is that whether the lowbd version of the function is used depends on a mix of two conditions:
(1) a compile-time check for CONFIG_HIGHBITDEPTH, and
(2) a run-time check that bit-depth == 8.
So, it is tricky to refactor the two versions into a single helper (see the sketch below).
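For context, the dispatch the helper needed to preserve has roughly the shape sketched below. This is a minimal, self-contained illustration with made-up names (warp_dispatch, lowbd_warp, highbd_warp), not the verbatim libaom call site. The reverted helper instead chose its pointer types with the compile-time #if alone, as the diff below shows, so a CONFIG_HIGHBITDEPTH build cast av1_warp_affine_c's uint8_t buffers to uint16_t even when bd == 8, which is what the Warp tests catch.

  #include <stdint.h>
  #include <stdio.h>

  #define CONFIG_HIGHBITDEPTH 1 /* stand-in for the real build flag */

  static void lowbd_warp(const uint8_t *ref) {
    (void)ref;
    puts("8-bit warp filter");
  }

  #if CONFIG_HIGHBITDEPTH
  static void highbd_warp(const uint16_t *ref, int bd) {
    (void)ref;
    (void)bd;
    puts("high-bitdepth warp filter");
  }
  #endif

  /* A correct dispatch must combine BOTH conditions: even with
     CONFIG_HIGHBITDEPTH compiled in, bd == 8 must still take the
     8-bit path and read the buffer through uint8_t pointers. */
  static void warp_dispatch(const void *ref, int bd) {
  #if CONFIG_HIGHBITDEPTH
    if (bd > 8) {
      highbd_warp((const uint16_t *)ref, bd);
      return;
    }
  #endif
    lowbd_warp((const uint8_t *)ref);
  }

  int main(void) {
    uint8_t buf[8] = { 0 };
    warp_dispatch(buf, 8); /* must print "8-bit warp filter" */
    return 0;
  }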
BUG=aomedia:442
Change-Id: I610c537fb65bde4f357185a13081639f906351de
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index f1cbbe2..c74609d 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -774,209 +774,6 @@
return 1;
}
-/* The warp filter for ROTZOOM and AFFINE models works as follows:
- * Split the input into 8x8 blocks
- * For each block, project the point (4, 4) within the block, to get the
- overall block position. Split into integer and fractional coordinates,
- maintaining full WARPEDMODEL precision
- * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
- variable horizontal offset. This means that, while the rows of the
- intermediate buffer align with the rows of the *reference* image, the
- columns align with the columns of the *destination* image.
- * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
- destination is too small we crop the output at this stage). Each pixel has
- a variable vertical offset, so that the resulting rows are aligned with
- the rows of the destination image.
-
- To accomplish these alignments, we factor the warp matrix as a
- product of two shear / asymmetric zoom matrices:
- / a b \ = / 1 0 \ * / 1+alpha beta \
- \ c d / \ gamma 1+delta / \ 0 1 /
- where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
- The second shear (with alpha and beta) is applied by the horizontal filter,
- then the first shear (with gamma and delta) is applied by the vertical
- filter.
-
- The only limitation is that, to fit this in a fixed 8-tap filter size,
- the fractional pixel offsets must be at most +-1. Since the horizontal filter
- generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
- within the block, the parameters must satisfy
- 4 * |alpha| + 7 * |beta| <= 1 and 4 * |gamma| + 7 * |delta| <= 1
- for this filter to be applicable.
-
- Note: warp_affine() assumes that the caller has done all of the relevant
- checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
- are set appropriately (if using a ROTZOOM model), and that alpha, beta,
- gamma, delta are all in range.
-
- TODO(david.barker): Maybe support scaled references?
-*/
-// Note also: The "worst case" in terms of modulus of the data stored into 'tmp'
-// (ie, the result of 'sum' in the horizontal filter) occurs when:
-// coeffs = { -2, 8, -22, 87, 72, -21, 8, -2}, and
-// ref = { 0, 255, 0, 255, 255, 0, 255, 0}
-// Before rounding, this gives sum = 716625. After rounding,
-// HORSHEAR_REDUCE_PREC_BITS = 4 => sum = 44789 > 2^15
-// HORSHEAR_REDUCE_PREC_BITS = 5 => sum = 22395 < 2^15
-//
-// So, as long as HORSHEAR_REDUCE_PREC_BITS >= 5, we can safely use a 16-bit
-// intermediate array.
-static void warp_affine_c_helper(int32_t *mat, void *ref_void, int width,
- int height, int stride, void *pred_void,
- int p_col, int p_row, int p_width,
- int p_height, int p_stride, int subsampling_x,
- int subsampling_y, int bd, int ref_frm,
- int16_t alpha, int16_t beta, int16_t gamma,
- int16_t delta) {
-#if CONFIG_HIGHBITDEPTH
- uint16_t *ref = (uint16_t *)ref_void;
- uint16_t *pred = (uint16_t *)pred_void;
-#else
- uint8_t *ref = (uint8_t *)ref_void;
- uint8_t *pred = (uint8_t *)pred_void;
- (void)bd;
-#endif // CONFIG_HIGHBITDEPTH
-
-#if !CONFIG_HIGHBITDEPTH || (HORSHEAR_REDUCE_PREC_BITS >= 5)
- int16_t tmp[15 * 8];
-#else
- int32_t tmp[15 * 8];
-#endif // !CONFIG_HIGHBITDEPTH || (HORSHEAR_REDUCE_PREC_BITS >= 5)
-
- int i, j, k, l, m;
-
- /* Note: For this code to work, the left/right frame borders need to be
- extended by at least 13 pixels each. By the time we get here, other
- code will have set up this border, but we allow an explicit check
- for debugging purposes.
- */
- /*for (i = 0; i < height; ++i) {
- for (j = 0; j < 13; ++j) {
- assert(ref[i * stride - 13 + j] == ref[i * stride]);
- assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
- }
- }*/
-
- for (i = p_row; i < p_row + p_height; i += 8) {
- for (j = p_col; j < p_col + p_width; j += 8) {
- int32_t x4, y4, ix4, sx4, iy4, sy4;
- if (subsampling_x)
- x4 = ROUND_POWER_OF_TWO_SIGNED(
- mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
- (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
- 1);
- else
- x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
-
- if (subsampling_y)
- y4 = ROUND_POWER_OF_TWO_SIGNED(
- mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
- (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
- 1);
- else
- y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
-
- ix4 = x4 >> WARPEDMODEL_PREC_BITS;
- sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
- iy4 = y4 >> WARPEDMODEL_PREC_BITS;
- sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
- // Horizontal filter
- for (k = -7; k < 8; ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
-
- if (ix4 <= -7) {
- // In this case, the rightmost pixel sampled is in column
- // ix4 + 3 + 7 - 3 = ix4 + 7 <= 0, ie. the entire block
- // will sample only from the leftmost column
- // (once border extension is taken into account)
- for (l = 0; l < 8; ++l) {
- tmp[(k + 7) * 8 + l] =
- ref[iy * stride] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
- }
- } else if (ix4 >= width + 6) {
- // In this case, the leftmost pixel sampled is in column
- // ix4 - 4 + 0 - 3 = ix4 - 7 >= width - 1, ie. the entire block
- // will sample only from the rightmost column
- // (once border extension is taken into account)
- for (l = 0; l < 8; ++l) {
- tmp[(k + 7) * 8 + l] =
- ref[iy * stride + (width - 1)] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
- }
- } else {
- // If we get here, then
- // the leftmost pixel sampled is
- // ix4 - 4 + 0 - 3 = ix4 - 7 >= -13
- // and the rightmost pixel sampled is at most
- // ix4 + 3 + 7 - 3 = ix4 + 7 <= width + 12
- // So, assuming that border extension has been done, we
- // don't need to explicitly clamp values.
- int sx = sx4 + alpha * (-4) + beta * k;
-
- for (l = -4; l < 4; ++l) {
- int ix = ix4 + l - 3;
- // At this point, sx = sx4 + alpha * l + beta * k
- const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
- WARPEDPIXEL_PREC_SHIFTS;
- const int16_t *coeffs = warped_filter[offs];
- int32_t sum = 0;
- // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
- for (m = 0; m < 8; ++m) {
- sum += ref[iy * stride + ix + m] * coeffs[m];
- }
- sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
-#if !CONFIG_HIGHBITDEPTH || (HORSHEAR_REDUCE_PREC_BITS >= 5)
- tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
-#else
- tmp[(k + 7) * 8 + (l + 4)] = sum;
-#endif // !CONFIG_HIGHBITDEPTH || (HORSHEAR_REDUCE_PREC_BITS >= 5)
- sx += alpha;
- }
- }
- }
-
- // Vertical filter
- for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
- int sy = sy4 + gamma * (-4) + delta * k;
- for (l = -4; l < 4; ++l) {
-#if CONFIG_HIGHBITDEPTH
- uint16_t *p =
-#else
- uint8_t *p =
-#endif // CONFIG_HIGHBITDEPTH
- &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
- // At this point, sy = sy4 + gamma * l + delta * k
- const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
- WARPEDPIXEL_PREC_SHIFTS;
- const int16_t *coeffs = warped_filter[offs];
- int32_t sum = 0;
- // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
- for (m = 0; m < 8; ++m) {
- sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
- }
-#if CONFIG_HIGHBITDEPTH
- sum = clip_pixel_highbd(
- ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS), bd);
-#else
- sum = clip_pixel(ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS));
-#endif // CONFIG_HIGHBITDEPTH
- if (ref_frm)
- *p = ROUND_POWER_OF_TWO(*p + sum, 1);
- else
- *p = sum;
- sy += gamma;
- }
- }
- }
- }
-}
-
#if CONFIG_HIGHBITDEPTH
static INLINE void highbd_get_subcolumn(int taps, uint16_t *ref, int32_t *col,
int stride, int x, int y_start) {
@@ -1136,6 +933,19 @@
}
}
+// Note: For an explanation of the warp algorithm, see the comment
+// above warp_plane()
+//
+// Note also: The "worst case" in terms of modulus of the data stored into 'tmp'
+// (ie, the result of 'sum' in the horizontal filter) occurs when:
+// coeffs = { -2, 8, -22, 87, 72, -21, 8, -2}, and
+// ref = { 0, 255, 0, 255, 255, 0, 255, 0}
+// Before rounding, this gives sum = 716625. After rounding,
+// HORSHEAR_REDUCE_PREC_BITS = 4 => sum = 44789 > 2^15
+// HORSHEAR_REDUCE_PREC_BITS = 5 => sum = 22395 < 2^15
+//
+// So, as long as HORSHEAR_REDUCE_PREC_BITS >= 5, we can safely use a 16-bit
+// intermediate array.
void av1_highbd_warp_affine_c(int32_t *mat, uint16_t *ref, int width,
int height, int stride, uint16_t *pred, int p_col,
int p_row, int p_width, int p_height,
@@ -1143,9 +953,118 @@
int subsampling_y, int bd, int ref_frm,
int16_t alpha, int16_t beta, int16_t gamma,
int16_t delta) {
- warp_affine_c_helper(mat, ref, width, height, stride, pred, p_col, p_row,
- p_width, p_height, p_stride, subsampling_x,
- subsampling_y, bd, ref_frm, alpha, beta, gamma, delta);
+#if HORSHEAR_REDUCE_PREC_BITS >= 5
+ int16_t tmp[15 * 8];
+#else
+ int32_t tmp[15 * 8];
+#endif
+ int i, j, k, l, m;
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = p_row; i < p_row + p_height; i += 8) {
+ for (j = p_col; j < p_col + p_width; j += 8) {
+ int32_t x4, y4, ix4, sx4, iy4, sy4;
+ if (subsampling_x)
+ x4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
+
+ if (subsampling_y)
+ y4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
+
+ ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Horizontal filter
+ for (k = -7; k < 8; ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ if (ix4 <= -7) {
+ for (l = 0; l < 8; ++l) {
+ tmp[(k + 7) * 8 + l] =
+ ref[iy * stride] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+ }
+ } else if (ix4 >= width + 6) {
+ for (l = 0; l < 8; ++l) {
+ tmp[(k + 7) * 8 + l] =
+ ref[iy * stride + (width - 1)] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+ }
+ } else {
+ int sx = sx4 + alpha * (-4) + beta * k;
+
+ for (l = -4; l < 4; ++l) {
+ int ix = ix4 + l - 3;
+ const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ const int16_t *coeffs = warped_filter[offs];
+ int32_t sum = 0;
+ // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ for (m = 0; m < 8; ++m) {
+ sum += ref[iy * stride + ix + m] * coeffs[m];
+ }
+ sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
+#if HORSHEAR_REDUCE_PREC_BITS >= 5
+ tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+#else
+ tmp[(k + 7) * 8 + (l + 4)] = sum;
+#endif
+ sx += alpha;
+ }
+ }
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+ int sy = sy4 + gamma * (-4) + delta * k;
+ for (l = -4; l < 4; ++l) {
+ uint16_t *p =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ const int16_t *coeffs = warped_filter[offs];
+ int32_t sum = 0;
+ // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ for (m = 0; m < 8; ++m) {
+ sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
+ }
+ sum = clip_pixel_highbd(
+ ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS), bd);
+ if (ref_frm)
+ *p = ROUND_POWER_OF_TWO(*p + sum, 1);
+ else
+ *p = sum;
+ sy += gamma;
+ }
+ }
+ }
+ }
}
static void highbd_warp_plane(WarpedMotionParams *wm, uint8_t *ref8, int width,
@@ -1240,15 +1159,169 @@
}
}
+/* The warp filter for ROTZOOM and AFFINE models works as follows:
+ * Split the input into 8x8 blocks
+ * For each block, project the point (4, 4) within the block, to get the
+ overall block position. Split into integer and fractional coordinates,
+ maintaining full WARPEDMODEL precision
+ * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
+ variable horizontal offset. This means that, while the rows of the
+ intermediate buffer align with the rows of the *reference* image, the
+ columns align with the columns of the *destination* image.
+ * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
+ destination is too small we crop the output at this stage). Each pixel has
+ a variable vertical offset, so that the resulting rows are aligned with
+ the rows of the destination image.
+
+ To accomplish these alignments, we factor the warp matrix as a
+ product of two shear / asymmetric zoom matrices:
+ / a b \ = / 1 0 \ * / 1+alpha beta \
+ \ c d / \ gamma 1+delta / \ 0 1 /
+ where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
+ The second shear (with alpha and beta) is applied by the horizontal filter,
+ then the first shear (with gamma and delta) is applied by the vertical
+ filter.
+
+ The only limitation is that, to fit this in a fixed 8-tap filter size,
+ the fractional pixel offsets must be at most +-1. Since the horizontal filter
+ generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
+ within the block, the parameters must satisfy
+ 4 * |alpha| + 7 * |beta| <= 1 and 4 * |gamma| + 7 * |delta| <= 1
+ for this filter to be applicable.
+
+ Note: warp_affine() assumes that the caller has done all of the relevant
+ checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
+ are set appropriately (if using a ROTZOOM model), and that alpha, beta,
+ gamma, delta are all in range.
+
+ TODO(david.barker): Maybe support scaled references?
+*/
void av1_warp_affine_c(int32_t *mat, uint8_t *ref, int width, int height,
int stride, uint8_t *pred, int p_col, int p_row,
int p_width, int p_height, int p_stride,
int subsampling_x, int subsampling_y, int ref_frm,
int16_t alpha, int16_t beta, int16_t gamma,
int16_t delta) {
- warp_affine_c_helper(mat, ref, width, height, stride, pred, p_col, p_row,
- p_width, p_height, p_stride, subsampling_x,
- subsampling_y, 8, ref_frm, alpha, beta, gamma, delta);
+ int16_t tmp[15 * 8];
+ int i, j, k, l, m;
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = p_row; i < p_row + p_height; i += 8) {
+ for (j = p_col; j < p_col + p_width; j += 8) {
+ int32_t x4, y4, ix4, sx4, iy4, sy4;
+ if (subsampling_x)
+ x4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
+
+ if (subsampling_y)
+ y4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
+
+ ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Horizontal filter
+ for (k = -7; k < 8; ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ if (ix4 <= -7) {
+ // In this case, the rightmost pixel sampled is in column
+ // ix4 + 3 + 7 - 3 = ix4 + 7 <= 0, ie. the entire block
+ // will sample only from the leftmost column
+ // (once border extension is taken into account)
+ for (l = 0; l < 8; ++l) {
+ tmp[(k + 7) * 8 + l] =
+ ref[iy * stride] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+ }
+ } else if (ix4 >= width + 6) {
+ // In this case, the leftmost pixel sampled is in column
+ // ix4 - 4 + 0 - 3 = ix4 - 7 >= width - 1, ie. the entire block
+ // will sample only from the rightmost column
+ // (once border extension is taken into account)
+ for (l = 0; l < 8; ++l) {
+ tmp[(k + 7) * 8 + l] =
+ ref[iy * stride + (width - 1)] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+ }
+ } else {
+ // If we get here, then
+ // the leftmost pixel sampled is
+ // ix4 - 4 + 0 - 3 = ix4 - 7 >= -13
+ // and the rightmost pixel sampled is at most
+ // ix4 + 3 + 7 - 3 = ix4 + 7 <= width + 12
+ // So, assuming that border extension has been done, we
+ // don't need to explicitly clamp values.
+ int sx = sx4 + alpha * (-4) + beta * k;
+
+ for (l = -4; l < 4; ++l) {
+ int ix = ix4 + l - 3;
+ // At this point, sx = sx4 + alpha * l + beta * k
+ const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ const int16_t *coeffs = warped_filter[offs];
+ int32_t sum = 0;
+ // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ for (m = 0; m < 8; ++m) {
+ sum += ref[iy * stride + ix + m] * coeffs[m];
+ }
+ sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
+ tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+ sx += alpha;
+ }
+ }
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+ int sy = sy4 + gamma * (-4) + delta * k;
+ for (l = -4; l < 4; ++l) {
+ uint8_t *p =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ // At this point, sy = sy4 + gamma * l + delta * k
+ const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ const int16_t *coeffs = warped_filter[offs];
+ int32_t sum = 0;
+ // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ for (m = 0; m < 8; ++m) {
+ sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
+ }
+ sum = clip_pixel(ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS));
+ if (ref_frm)
+ *p = ROUND_POWER_OF_TWO(*p + sum, 1);
+ else
+ *p = sum;
+ sy += gamma;
+ }
+ }
+ }
+ }
}
static void warp_plane(WarpedMotionParams *wm, uint8_t *ref, int width,