Revert "warp_affine_c: Refactor highbd and lowbd versions."
This reverts commit 8cd0e7efac95844556d98d4634755b47b203a3ed.
Reason for revert:
This change breaks av1_warp_affine_c when CONFIG_HIGHBITDEPTH is enabled.
In particular, running ./test_libaom --gtest_filter=*Warp* on a build configured with --enable-warped-motion --enable-highbitdepth produces several test failures, followed by a segmentation fault once it reaches SSE2/AV1WarpFilterTest.CheckOutput/4.
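For reference, the failure reproduces with the configure-based build (the exact invocation below is assumed from the flags named above):

  ./configure --enable-warped-motion --enable-highbitdepth
  make
  ./test_libaom --gtest_filter=*Warp*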
The tricky part is that whether the lowbd version of the function is used depends on a mix of two conditions:
(1) a compile-time check for CONFIG_HIGHBITDEPTH, and
(2) a run-time check that bit-depth == 8.
So, it is tricky to refactor the two versions into a single helper (see the sketch below).
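For context, the dispatch the helper needed to preserve has roughly the shape sketched below. This is a minimal, self-contained illustration with made-up names (warp_dispatch, lowbd_warp, highbd_warp), not the verbatim libaom call site. The reverted helper instead chose its pointer types with the compile-time #if alone, as the diff below shows, so a CONFIG_HIGHBITDEPTH build cast av1_warp_affine_c's uint8_t buffers to uint16_t even when bd == 8, which is what the Warp tests catch.

  #include <stdint.h>
  #include <stdio.h>

  #define CONFIG_HIGHBITDEPTH 1 /* stand-in for the real build flag */

  static void lowbd_warp(const uint8_t *ref) {
    (void)ref;
    puts("8-bit warp filter");
  }

  #if CONFIG_HIGHBITDEPTH
  static void highbd_warp(const uint16_t *ref, int bd) {
    (void)ref;
    (void)bd;
    puts("high-bitdepth warp filter");
  }
  #endif

  /* A correct dispatch must combine BOTH conditions: even with
     CONFIG_HIGHBITDEPTH compiled in, bd == 8 must still take the
     8-bit path and read the buffer through uint8_t pointers. */
  static void warp_dispatch(const void *ref, int bd) {
  #if CONFIG_HIGHBITDEPTH
    if (bd > 8) {
      highbd_warp((const uint16_t *)ref, bd);
      return;
    }
  #endif
    lowbd_warp((const uint8_t *)ref);
  }

  int main(void) {
    uint8_t buf[8] = { 0 };
    warp_dispatch(buf, 8); /* must print "8-bit warp filter" */
    return 0;
  }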
BUG=aomedia:442
Change-Id: I610c537fb65bde4f357185a13081639f906351de
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index f1cbbe2..c74609d 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -774,209 +774,6 @@
return 1;
}
-/* The warp filter for ROTZOOM and AFFINE models works as follows:
- * Split the input into 8x8 blocks
- * For each block, project the point (4, 4) within the block, to get the
- overall block position. Split into integer and fractional coordinates,
- maintaining full WARPEDMODEL precision
- * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
- variable horizontal offset. This means that, while the rows of the
- intermediate buffer align with the rows of the *reference* image, the
- columns align with the columns of the *destination* image.
- * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
- destination is too small we crop the output at this stage). Each pixel has
- a variable vertical offset, so that the resulting rows are aligned with
- the rows of the destination image.
-
- To accomplish these alignments, we factor the warp matrix as a
- product of two shear / asymmetric zoom matrices:
- / a b \ = / 1 0 \ * / 1+alpha beta \
- \ c d / \ gamma 1+delta / \ 0 1 /
- where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
- The second shear (with alpha and beta) is applied by the horizontal filter,
- then the first shear (with gamma and delta) is applied by the vertical
- filter.
-
- The only limitation is that, to fit this in a fixed 8-tap filter size,
- the fractional pixel offsets must be at most +-1. Since the horizontal filter
- generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
- within the block, the parameters must satisfy
- 4 * |alpha| + 7 * |beta| <= 1 and 4 * |gamma| + 7 * |delta| <= 1
- for this filter to be applicable.
-
- Note: warp_affine() assumes that the caller has done all of the relevant
- checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
- are set appropriately (if using a ROTZOOM model), and that alpha, beta,
- gamma, delta are all in range.
-
- TODO(david.barker): Maybe support scaled references?
-*/
-// Note also: The "worst case" in terms of modulus of the data stored into 'tmp'
-// (ie, the result of 'sum' in the horizontal filter) occurs when:
-// coeffs = { -2, 8, -22, 87, 72, -21, 8, -2}, and
-// ref = { 0, 255, 0, 255, 255, 0, 255, 0}
-// Before rounding, this gives sum = 716625. After rounding,
-// HORSHEAR_REDUCE_PREC_BITS = 4 => sum = 44789 > 2^15
-// HORSHEAR_REDUCE_PREC_BITS = 5 => sum = 22395 < 2^15
-//
-// So, as long as HORSHEAR_REDUCE_PREC_BITS >= 5, we can safely use a 16-bit
-// intermediate array.
-static void warp_affine_c_helper(int32_t *mat, void *ref_void, int width,
- int height, int stride, void *pred_void,
- int p_col, int p_row, int p_width,
- int p_height, int p_stride, int subsampling_x,
- int subsampling_y, int bd, int ref_frm,
- int16_t alpha, int16_t beta, int16_t gamma,
- int16_t delta) {
-#if CONFIG_HIGHBITDEPTH
- uint16_t *ref = (uint16_t *)ref_void;
- uint16_t *pred = (uint16_t *)pred_void;
-#else
- uint8_t *ref = (uint8_t *)ref_void;
- uint8_t *pred = (uint8_t *)pred_void;
- (void)bd;
-#endif // CONFIG_HIGHBITDEPTH
-
-#if !CONFIG_HIGHBITDEPTH || (HORSHEAR_REDUCE_PREC_BITS >= 5)
- int16_t tmp[15 * 8];
-#else
- int32_t tmp[15 * 8];
-#endif // !CONFIG_HIGHBITDEPTH || (HORSHEAR_REDUCE_PREC_BITS >= 5)
-
- int i, j, k, l, m;
-
- /* Note: For this code to work, the left/right frame borders need to be
- extended by at least 13 pixels each. By the time we get here, other
- code will have set up this border, but we allow an explicit check
- for debugging purposes.
- */
- /*for (i = 0; i < height; ++i) {
- for (j = 0; j < 13; ++j) {
- assert(ref[i * stride - 13 + j] == ref[i * stride]);
- assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
- }
- }*/
-
- for (i = p_row; i < p_row + p_height; i += 8) {
- for (j = p_col; j < p_col + p_width; j += 8) {
- int32_t x4, y4, ix4, sx4, iy4, sy4;
- if (subsampling_x)
- x4 = ROUND_POWER_OF_TWO_SIGNED(
- mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
- (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
- 1);
- else
- x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
-
- if (subsampling_y)
- y4 = ROUND_POWER_OF_TWO_SIGNED(
- mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
- (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
- 1);
- else
- y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
-
- ix4 = x4 >> WARPEDMODEL_PREC_BITS;
- sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
- iy4 = y4 >> WARPEDMODEL_PREC_BITS;
- sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
- // Horizontal filter
- for (k = -7; k < 8; ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
-
- if (ix4 <= -7) {
- // In this case, the rightmost pixel sampled is in column
- // ix4 + 3 + 7 - 3 = ix4 + 7 <= 0, ie. the entire block
- // will sample only from the leftmost column
- // (once border extension is taken into account)
- for (l = 0; l < 8; ++l) {
- tmp[(k + 7) * 8 + l] =
- ref[iy * stride] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
- }
- } else if (ix4 >= width + 6) {
- // In this case, the leftmost pixel sampled is in column
- // ix4 - 4 + 0 - 3 = ix4 - 7 >= width - 1, ie. the entire block
- // will sample only from the rightmost column
- // (once border extension is taken into account)
- for (l = 0; l < 8; ++l) {
- tmp[(k + 7) * 8 + l] =
- ref[iy * stride + (width - 1)] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
- }
- } else {
- // If we get here, then
- // the leftmost pixel sampled is
- // ix4 - 4 + 0 - 3 = ix4 - 7 >= -13
- // and the rightmost pixel sampled is at most
- // ix4 + 3 + 7 - 3 = ix4 + 7 <= width + 12
- // So, assuming that border extension has been done, we
- // don't need to explicitly clamp values.
- int sx = sx4 + alpha * (-4) + beta * k;
-
- for (l = -4; l < 4; ++l) {
- int ix = ix4 + l - 3;
- // At this point, sx = sx4 + alpha * l + beta * k
- const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
- WARPEDPIXEL_PREC_SHIFTS;
- const int16_t *coeffs = warped_filter[offs];
- int32_t sum = 0;
- // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
- for (m = 0; m < 8; ++m) {
- sum += ref[iy * stride + ix + m] * coeffs[m];
- }
- sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
-#if !CONFIG_HIGHBITDEPTH || (HORSHEAR_REDUCE_PREC_BITS >= 5)
- tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
-#else
- tmp[(k + 7) * 8 + (l + 4)] = sum;
-#endif // !CONFIG_HIGHBITDEPTH || (HORSHEAR_REDUCE_PREC_BITS >= 5)
- sx += alpha;
- }
- }
- }
-
- // Vertical filter
- for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
- int sy = sy4 + gamma * (-4) + delta * k;
- for (l = -4; l < 4; ++l) {
-#if CONFIG_HIGHBITDEPTH
- uint16_t *p =
-#else
- uint8_t *p =
-#endif // CONFIG_HIGHBITDEPTH
- &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
- // At this point, sy = sy4 + gamma * l + delta * k
- const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
- WARPEDPIXEL_PREC_SHIFTS;
- const int16_t *coeffs = warped_filter[offs];
- int32_t sum = 0;
- // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
- for (m = 0; m < 8; ++m) {
- sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
- }
-#if CONFIG_HIGHBITDEPTH
- sum = clip_pixel_highbd(
- ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS), bd);
-#else
- sum = clip_pixel(ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS));
-#endif // CONFIG_HIGHBITDEPTH
- if (ref_frm)
- *p = ROUND_POWER_OF_TWO(*p + sum, 1);
- else
- *p = sum;
- sy += gamma;
- }
- }
- }
- }
-}
-
#if CONFIG_HIGHBITDEPTH
static INLINE void highbd_get_subcolumn(int taps, uint16_t *ref, int32_t *col,
int stride, int x, int y_start) {
@@ -1136,6 +933,19 @@
}
}
+// Note: For an explanation of the warp algorithm, see the comment
+// above warp_plane()
+//
+// Note also: The "worst case" in terms of modulus of the data stored into 'tmp'
+// (ie, the result of 'sum' in the horizontal filter) occurs when:
+// coeffs = { -2, 8, -22, 87, 72, -21, 8, -2}, and
+// ref = { 0, 255, 0, 255, 255, 0, 255, 0}
+// Before rounding, this gives sum = 716625. After rounding,
+// HORSHEAR_REDUCE_PREC_BITS = 4 => sum = 44789 > 2^15
+// HORSHEAR_REDUCE_PREC_BITS = 5 => sum = 22395 < 2^15
+//
+// So, as long as HORSHEAR_REDUCE_PREC_BITS >= 5, we can safely use a 16-bit
+// intermediate array.
void av1_highbd_warp_affine_c(int32_t *mat, uint16_t *ref, int width,
int height, int stride, uint16_t *pred, int p_col,
int p_row, int p_width, int p_height,
@@ -1143,9 +953,118 @@
int subsampling_y, int bd, int ref_frm,
int16_t alpha, int16_t beta, int16_t gamma,
int16_t delta) {
- warp_affine_c_helper(mat, ref, width, height, stride, pred, p_col, p_row,
- p_width, p_height, p_stride, subsampling_x,
- subsampling_y, bd, ref_frm, alpha, beta, gamma, delta);
+#if HORSHEAR_REDUCE_PREC_BITS >= 5
+ int16_t tmp[15 * 8];
+#else
+ int32_t tmp[15 * 8];
+#endif
+ int i, j, k, l, m;
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = p_row; i < p_row + p_height; i += 8) {
+ for (j = p_col; j < p_col + p_width; j += 8) {
+ int32_t x4, y4, ix4, sx4, iy4, sy4;
+ if (subsampling_x)
+ x4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
+
+ if (subsampling_y)
+ y4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
+
+ ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Horizontal filter
+ for (k = -7; k < 8; ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ if (ix4 <= -7) {
+ for (l = 0; l < 8; ++l) {
+ tmp[(k + 7) * 8 + l] =
+ ref[iy * stride] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+ }
+ } else if (ix4 >= width + 6) {
+ for (l = 0; l < 8; ++l) {
+ tmp[(k + 7) * 8 + l] =
+ ref[iy * stride + (width - 1)] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+ }
+ } else {
+ int sx = sx4 + alpha * (-4) + beta * k;
+
+ for (l = -4; l < 4; ++l) {
+ int ix = ix4 + l - 3;
+ const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ const int16_t *coeffs = warped_filter[offs];
+ int32_t sum = 0;
+ // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ for (m = 0; m < 8; ++m) {
+ sum += ref[iy * stride + ix + m] * coeffs[m];
+ }
+ sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
+#if HORSHEAR_REDUCE_PREC_BITS >= 5
+ tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+#else
+ tmp[(k + 7) * 8 + (l + 4)] = sum;
+#endif
+ sx += alpha;
+ }
+ }
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+ int sy = sy4 + gamma * (-4) + delta * k;
+ for (l = -4; l < 4; ++l) {
+ uint16_t *p =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ const int16_t *coeffs = warped_filter[offs];
+ int32_t sum = 0;
+ // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ for (m = 0; m < 8; ++m) {
+ sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
+ }
+ sum = clip_pixel_highbd(
+ ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS), bd);
+ if (ref_frm)
+ *p = ROUND_POWER_OF_TWO(*p + sum, 1);
+ else
+ *p = sum;
+ sy += gamma;
+ }
+ }
+ }
+ }
}
static void highbd_warp_plane(WarpedMotionParams *wm, uint8_t *ref8, int width,
@@ -1240,15 +1159,169 @@
}
}
+/* The warp filter for ROTZOOM and AFFINE models works as follows:
+ * Split the input into 8x8 blocks
+ * For each block, project the point (4, 4) within the block, to get the
+ overall block position. Split into integer and fractional coordinates,
+ maintaining full WARPEDMODEL precision
+ * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
+ variable horizontal offset. This means that, while the rows of the
+ intermediate buffer align with the rows of the *reference* image, the
+ columns align with the columns of the *destination* image.
+ * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
+ destination is too small we crop the output at this stage). Each pixel has
+ a variable vertical offset, so that the resulting rows are aligned with
+ the rows of the destination image.
+
+ To accomplish these alignments, we factor the warp matrix as a
+ product of two shear / asymmetric zoom matrices:
+ / a b \ = / 1 0 \ * / 1+alpha beta \
+ \ c d / \ gamma 1+delta / \ 0 1 /
+ where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
+ The second shear (with alpha and beta) is applied by the horizontal filter,
+ then the first shear (with gamma and delta) is applied by the vertical
+ filter.
+
+ The only limitation is that, to fit this in a fixed 8-tap filter size,
+ the fractional pixel offsets must be at most +-1. Since the horizontal filter
+ generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
+ within the block, the parameters must satisfy
+ 4 * |alpha| + 7 * |beta| <= 1 and 4 * |gamma| + 7 * |delta| <= 1
+ for this filter to be applicable.
+
+ Note: warp_affine() assumes that the caller has done all of the relevant
+ checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
+ are set appropriately (if using a ROTZOOM model), and that alpha, beta,
+ gamma, delta are all in range.
+
+ TODO(david.barker): Maybe support scaled references?
+*/
void av1_warp_affine_c(int32_t *mat, uint8_t *ref, int width, int height,
int stride, uint8_t *pred, int p_col, int p_row,
int p_width, int p_height, int p_stride,
int subsampling_x, int subsampling_y, int ref_frm,
int16_t alpha, int16_t beta, int16_t gamma,
int16_t delta) {
- warp_affine_c_helper(mat, ref, width, height, stride, pred, p_col, p_row,
- p_width, p_height, p_stride, subsampling_x,
- subsampling_y, 8, ref_frm, alpha, beta, gamma, delta);
+ int16_t tmp[15 * 8];
+ int i, j, k, l, m;
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = p_row; i < p_row + p_height; i += 8) {
+ for (j = p_col; j < p_col + p_width; j += 8) {
+ int32_t x4, y4, ix4, sx4, iy4, sy4;
+ if (subsampling_x)
+ x4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
+
+ if (subsampling_y)
+ y4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
+
+ ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Horizontal filter
+ for (k = -7; k < 8; ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ if (ix4 <= -7) {
+ // In this case, the rightmost pixel sampled is in column
+ // ix4 + 3 + 7 - 3 = ix4 + 7 <= 0, ie. the entire block
+ // will sample only from the leftmost column
+ // (once border extension is taken into account)
+ for (l = 0; l < 8; ++l) {
+ tmp[(k + 7) * 8 + l] =
+ ref[iy * stride] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+ }
+ } else if (ix4 >= width + 6) {
+ // In this case, the leftmost pixel sampled is in column
+ // ix4 - 4 + 0 - 3 = ix4 - 7 >= width - 1, ie. the entire block
+ // will sample only from the rightmost column
+ // (once border extension is taken into account)
+ for (l = 0; l < 8; ++l) {
+ tmp[(k + 7) * 8 + l] =
+ ref[iy * stride + (width - 1)] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+ }
+ } else {
+ // If we get here, then
+ // the leftmost pixel sampled is
+ // ix4 - 4 + 0 - 3 = ix4 - 7 >= -13
+ // and the rightmost pixel sampled is at most
+ // ix4 + 3 + 7 - 3 = ix4 + 7 <= width + 12
+ // So, assuming that border extension has been done, we
+ // don't need to explicitly clamp values.
+ int sx = sx4 + alpha * (-4) + beta * k;
+
+ for (l = -4; l < 4; ++l) {
+ int ix = ix4 + l - 3;
+ // At this point, sx = sx4 + alpha * l + beta * k
+ const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ const int16_t *coeffs = warped_filter[offs];
+ int32_t sum = 0;
+ // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ for (m = 0; m < 8; ++m) {
+ sum += ref[iy * stride + ix + m] * coeffs[m];
+ }
+ sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
+ tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+ sx += alpha;
+ }
+ }
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+ int sy = sy4 + gamma * (-4) + delta * k;
+ for (l = -4; l < 4; ++l) {
+ uint8_t *p =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ // At this point, sy = sy4 + gamma * l + delta * k
+ const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ const int16_t *coeffs = warped_filter[offs];
+ int32_t sum = 0;
+ // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ for (m = 0; m < 8; ++m) {
+ sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
+ }
+ sum = clip_pixel(ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS));
+ if (ref_frm)
+ *p = ROUND_POWER_OF_TWO(*p + sum, 1);
+ else
+ *p = sum;
+ sy += gamma;
+ }
+ }
+ }
+ }
}
static void warp_plane(WarpedMotionParams *wm, uint8_t *ref, int width,