Reduce precision bits between shears
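
Round off HORSHEAR_REDUCE_PREC_BITS (= 5) bits of precision from the
intermediate values immediately after the horizontal shear, and round
off only the remaining VERSHEAR_REDUCE_PREC_BITS (= 9) bits after the
vertical shear, instead of applying the full 2 * WARPEDPIXEL_FILTER_BITS
of rounding at the end. The total rounding applied to the final pixel
is unchanged, since by construction:

  HORSHEAR_REDUCE_PREC_BITS + VERSHEAR_REDUCE_PREC_BITS
      == 5 + (2 * 7 - 5)
      == 2 * WARPEDPIXEL_FILTER_BITS

Dropping bits between the two passes keeps the intermediate tmp[]
values small enough to fit in 16 bits (see saturate_int16() in the C
path and the 16-bit tmp registers in the SSE2 path). The
border-extension cases, which previously scaled the replicated edge
pixel by 1 << WARPEDPIXEL_FILTER_BITS, now scale it by
1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS) to match
the reduced-precision output of the horizontal filter.
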
Change-Id: I89e981c9396c7a1ba8051d65036a16692da94d0d
diff --git a/av1/common/mv.h b/av1/common/mv.h
index 0b7a151..a22a5c8 100644
--- a/av1/common/mv.h
+++ b/av1/common/mv.h
@@ -53,6 +53,11 @@
// Precision of filter taps
#define WARPEDPIXEL_FILTER_BITS 7
+// Bits of precision to drop after the horizontal and vertical shears
+#define HORSHEAR_REDUCE_PREC_BITS 5
+#define VERSHEAR_REDUCE_PREC_BITS \
+ (2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)
+
#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
/* clang-format off */
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 0ad142c..1aecd56 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -916,6 +916,7 @@
else
sum += ref[iy * stride + ix + m - 3] * coeffs[m];
}
+ sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
tmp[(k + 7) * 8 + (l + 4)] = sum;
}
}
@@ -933,8 +934,7 @@
sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
}
sum = clip_pixel_highbd(
- ROUND_POWER_OF_TWO_SIGNED(sum, 2 * WARPEDPIXEL_FILTER_BITS),
- bd);
+ ROUND_POWER_OF_TWO_SIGNED(sum, VERSHEAR_REDUCE_PREC_BITS), bd);
if (ref_frm)
*p = ROUND_POWER_OF_TWO_SIGNED(*p + sum, 1);
else
@@ -1116,7 +1116,8 @@
// (once border extension is taken into account)
for (l = 0; l < 8; ++l) {
tmp[(k + 7) * 8 + l] =
- ref[iy * stride] * (1 << WARPEDPIXEL_FILTER_BITS);
+ ref[iy * stride] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
}
} else if (ix4 >= width + 6) {
// In this case, the leftmost pixel sampled is in column
@@ -1125,7 +1126,8 @@
// (once border extension is taken into account)
for (l = 0; l < 8; ++l) {
tmp[(k + 7) * 8 + l] =
- ref[iy * stride + (width - 1)] * (1 << WARPEDPIXEL_FILTER_BITS);
+ ref[iy * stride + (width - 1)] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
}
} else {
// If we get here, then
@@ -1148,6 +1150,7 @@
for (m = 0; m < 8; ++m) {
sum += ref[iy * stride + ix + m] * coeffs[m];
}
+ sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
sx += alpha;
}
@@ -1169,8 +1172,7 @@
for (m = 0; m < 8; ++m) {
sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
}
- sum =
- clip_pixel(ROUND_POWER_OF_TWO(sum, 2 * WARPEDPIXEL_FILTER_BITS));
+ sum = clip_pixel(ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS));
if (ref_frm)
*p = ROUND_POWER_OF_TWO(*p + sum, 1);
else
@@ -1199,7 +1201,6 @@
const int32_t gamma = wm->gamma;
const int32_t delta = wm->delta;
- // printf("%d %d %d %d\n", mat[2], mat[3], mat[4], mat[5]);
av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
p_width, p_height, p_stride, subsampling_x, subsampling_y,
ref_frm, alpha, beta, gamma, delta);
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
index 42c25c9..d7d2a1c 100644
--- a/av1/common/x86/warp_plane_sse2.c
+++ b/av1/common/x86/warp_plane_sse2.c
@@ -79,11 +79,13 @@
// would be taken from the leftmost/rightmost column, then we can
// skip the expensive horizontal filter.
if (ix4 <= -7) {
- tmp[k + 7] =
- _mm_set1_epi16(ref[iy * stride] * (1 << WARPEDPIXEL_FILTER_BITS));
+ tmp[k + 7] = _mm_set1_epi16(
+ ref[iy * stride] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
} else if (ix4 >= width + 6) {
- tmp[k + 7] = _mm_set1_epi16(ref[iy * stride + (width - 1)] *
- (1 << WARPEDPIXEL_FILTER_BITS));
+ tmp[k + 7] = _mm_set1_epi16(
+ ref[iy * stride + (width - 1)] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
} else {
int sx = sx4 + alpha * (-4) + beta * k +
// Include rounding and offset here
@@ -119,6 +121,9 @@
// coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
__m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+ __m128i round_const =
+ _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+
// Calculate filtered results
__m128i src_0 = _mm_unpacklo_epi8(src, zero);
__m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
@@ -131,6 +136,8 @@
__m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
_mm_add_epi32(res_2, res_6));
+ res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+ HORSHEAR_REDUCE_PREC_BITS);
// Filter odd-index pixels
__m128i tmp_1 = filter[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS];
@@ -159,6 +166,8 @@
__m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
_mm_add_epi32(res_3, res_7));
+ res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+ HORSHEAR_REDUCE_PREC_BITS);
// Combine results into one register.
// We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
@@ -240,12 +249,12 @@
// Round and pack into 8 bits
__m128i round_const =
- _mm_set1_epi32((1 << (2 * WARPEDPIXEL_FILTER_BITS)) >> 1);
+ _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
__m128i res_lo_round = _mm_srai_epi32(
- _mm_add_epi32(res_lo, round_const), 2 * WARPEDPIXEL_FILTER_BITS);
+ _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
__m128i res_hi_round = _mm_srai_epi32(
- _mm_add_epi32(res_hi, round_const), 2 * WARPEDPIXEL_FILTER_BITS);
+ _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
__m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
__m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);