Stop using the VP9 convolve scheme in the AV1 encoder. Discontinue all VP9-style convolve rounding operations in the non-normative parts of the encoder. The function av1_convolve_2d_sr_c is forced in place of the SIMD versions of the same function, because of an incompatibility when round_1 > 0. In the -DCONFIG_LOWPRECISION_BLEND=2 -DCONFIG_HIGHPRECISION_INTBUF=1 setting, the results on 15 frames of lowres (cpu-used=1) are -0.019% (a slight improvement). Change-Id: I72154bd896357c352c944fb2cd3b25bafafba46a
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 2114451..a56eb94 100755 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -453,19 +453,16 @@ add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; if (aom_config("CONFIG_JNT_COMP") eq "yes") { - if (aom_config("CONFIG_JNT_COMP") eq "yes") { - specialize qw/av1_warp_affine sse4_1/; - } + specialize qw/av1_warp_affine sse4_1/; } else { specialize qw/av1_warp_affine sse2 ssse3/; } add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; + if (aom_config("CONFIG_JNT_COMP") eq "yes") { - if (aom_config("CONFIG_JNT_COMP") eq "yes") { - specialize qw/av1_highbd_warp_affine sse4_1/; - } + specialize qw/av1_highbd_warp_affine sse4_1/; } else { specialize qw/av1_highbd_warp_affine ssse3/; }
diff --git a/av1/common/convolve.c b/av1/common/convolve.c index 95458c1..8bc6f6d 100644 --- a/av1/common/convolve.c +++ b/av1/common/convolve.c
@@ -1159,9 +1159,6 @@ const int subpel_y_q4, int y_step_q4, int scaled, ConvolveParams *conv_params, int bd) { - (void)dst; - (void)dst_stride; - InterpFilterParams filter_params_x, filter_params_y; #if CONFIG_SHORT_FILTER av1_get_convolve_filter_params(interp_filters, &filter_params_x, @@ -1172,71 +1169,101 @@ #endif const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - if (filter_params_y.taps < filter_params_x.taps) { - uint16_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * - (MAX_SB_SIZE + MAX_FILTER_TAP - 1)]; - int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1; - CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE]; - int tr_dst_stride = MAX_SB_SIZE; - int fo_vert = filter_params_y.taps / 2 - 1; - int fo_horiz = filter_params_x.taps / 2 - 1; + if (conv_params->dst) { + if (filter_params_y.taps < filter_params_x.taps) { + uint16_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * + (MAX_SB_SIZE + MAX_FILTER_TAP - 1)]; + int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1; + CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE]; + int tr_dst_stride = MAX_SB_SIZE; + int fo_vert = filter_params_y.taps / 2 - 1; + int fo_horiz = filter_params_x.taps / 2 - 1; - transpose_uint16( - tr_src, tr_src_stride, src - fo_vert * src_stride - fo_horiz, - src_stride, w + filter_params_x.taps - 1, h + filter_params_y.taps - 1); - transpose_int32(tr_dst, tr_dst_stride, conv_params->dst, - conv_params->dst_stride, w, h); + transpose_uint16(tr_src, tr_src_stride, + src - fo_vert * src_stride - fo_horiz, src_stride, + w + filter_params_x.taps - 1, + h + filter_params_y.taps - 1); + transpose_int32(tr_dst, tr_dst_stride, conv_params->dst, + conv_params->dst_stride, w, h); -// horizontal and vertical parameters are swapped because of the transpose + // horizontal and vertical parameters are swapped because of the transpose #if CONFIG_JNT_COMP - if (scaled) - av1_highbd_convolve_2d_scale( - tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst, - tr_dst_stride, h, w, &filter_params_y, 
&filter_params_x, subpel_y_q4, - y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd); - else - av1_highbd_jnt_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert, - tr_src_stride, tr_dst, tr_dst_stride, h, w, - &filter_params_y, &filter_params_x, - subpel_y_q4, subpel_x_q4, conv_params, bd); + if (scaled) + av1_highbd_convolve_2d_scale( + tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst, + tr_dst_stride, h, w, &filter_params_y, &filter_params_x, + subpel_y_q4, y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd); + else + av1_highbd_jnt_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert, + tr_src_stride, tr_dst, tr_dst_stride, h, w, + &filter_params_y, &filter_params_x, + subpel_y_q4, subpel_x_q4, conv_params, bd); #else - if (scaled) - av1_highbd_convolve_2d_scale( - tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst, - tr_dst_stride, h, w, &filter_params_y, &filter_params_x, subpel_y_q4, - y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd); - else - av1_highbd_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert, - tr_src_stride, tr_dst, tr_dst_stride, h, w, - &filter_params_y, &filter_params_x, subpel_y_q4, - subpel_x_q4, conv_params, bd); + if (scaled) + av1_highbd_convolve_2d_scale( + tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst, + tr_dst_stride, h, w, &filter_params_y, &filter_params_x, + subpel_y_q4, y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd); + else + av1_highbd_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert, + tr_src_stride, tr_dst, tr_dst_stride, h, w, + &filter_params_y, &filter_params_x, subpel_y_q4, + subpel_x_q4, conv_params, bd); #endif // CONFIG_JNT_COMP - transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst, - tr_dst_stride, h, w); + transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst, + tr_dst_stride, h, w); + } else { +#if CONFIG_JNT_COMP + if (scaled) + av1_highbd_convolve_2d_scale( + src, src_stride, conv_params->dst, 
conv_params->dst_stride, w, h, + &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4, + subpel_y_q4, y_step_q4, conv_params, bd); + else + av1_highbd_jnt_convolve_2d(src, src_stride, conv_params->dst, + conv_params->dst_stride, w, h, + &filter_params_x, &filter_params_y, + subpel_x_q4, subpel_y_q4, conv_params, bd); +#else + if (scaled) + av1_highbd_convolve_2d_scale( + src, src_stride, conv_params->dst, conv_params->dst_stride, w, h, + &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4, + subpel_y_q4, y_step_q4, conv_params, bd); + else + av1_highbd_convolve_2d(src, src_stride, conv_params->dst, + conv_params->dst_stride, w, h, &filter_params_x, + &filter_params_y, subpel_x_q4, subpel_y_q4, + conv_params, bd); +#endif // CONFIG_JNT_COMP + } } else { + CONV_BUF_TYPE tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]; + int tmp_dst_stride = MAX_SB_SIZE; #if CONFIG_JNT_COMP if (scaled) - av1_highbd_convolve_2d_scale( - src, src_stride, conv_params->dst, conv_params->dst_stride, w, h, - &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4, - subpel_y_q4, y_step_q4, conv_params, bd); + av1_highbd_convolve_2d_scale(src, src_stride, tmp_dst, tmp_dst_stride, w, + h, &filter_params_x, &filter_params_y, + subpel_x_q4, x_step_q4, subpel_y_q4, + y_step_q4, conv_params, bd); else - av1_highbd_jnt_convolve_2d(src, src_stride, conv_params->dst, - conv_params->dst_stride, w, h, + av1_highbd_jnt_convolve_2d(src, src_stride, tmp_dst, tmp_dst_stride, w, h, &filter_params_x, &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd); #else if (scaled) - av1_highbd_convolve_2d_scale( - src, src_stride, conv_params->dst, conv_params->dst_stride, w, h, - &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4, - subpel_y_q4, y_step_q4, conv_params, bd); + av1_highbd_convolve_2d_scale(src, src_stride, tmp_dst, tmp_dst_stride, w, + h, &filter_params_x, &filter_params_y, + subpel_x_q4, x_step_q4, subpel_y_q4, + y_step_q4, conv_params, bd); else - av1_highbd_convolve_2d(src, 
src_stride, conv_params->dst, - conv_params->dst_stride, w, h, &filter_params_x, - &filter_params_y, subpel_x_q4, subpel_y_q4, - conv_params, bd); + av1_highbd_convolve_2d(src, src_stride, tmp_dst, tmp_dst_stride, w, h, + &filter_params_x, &filter_params_y, subpel_x_q4, + subpel_y_q4, conv_params, bd); #endif // CONFIG_JNT_COMP + // 0-bit rounding just to convert from int32 to uint16 + av1_highbd_convolve_rounding(tmp_dst, tmp_dst_stride, dst, dst_stride, w, h, + 0, bd); } }
diff --git a/av1/common/convolve.h b/av1/common/convolve.h index b95a58f..7bbf220 100644 --- a/av1/common/convolve.h +++ b/av1/common/convolve.h
@@ -60,27 +60,6 @@ const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -static INLINE ConvolveParams get_conv_params(int ref, int do_average, int plane, - int bd) { - ConvolveParams conv_params; - conv_params.ref = ref; - conv_params.do_average = do_average; - conv_params.round = CONVOLVE_OPT_ROUND; - conv_params.plane = plane; - conv_params.do_post_rounding = 0; - conv_params.round_0 = ROUND0_BITS; - conv_params.round_1 = 0; - conv_params.is_compound = 0; - conv_params.dst = NULL; - conv_params.dst_stride = 0; - const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2; - if (bd < 12) assert(intbufrange <= 16); - if (intbufrange > 16) { - conv_params.round_0 += intbufrange - 16; - } - return conv_params; -} - static INLINE void av1_get_convolve_filter_params(InterpFilters interp_filters, InterpFilterParams *params_x, InterpFilterParams *params_y @@ -107,6 +86,7 @@ struct AV1Common; struct scale_factors; + void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilters interp_filters, const int subpel_x_q4, @@ -114,6 +94,27 @@ int scaled, ConvolveParams *conv_params, const struct scale_factors *sf); +static INLINE ConvolveParams get_conv_params_round(int ref, int do_average, + int plane, int bd) { + ConvolveParams conv_params; + conv_params.ref = ref; + conv_params.do_average = do_average; + conv_params.plane = plane; + conv_params.round = CONVOLVE_OPT_ROUND; + conv_params.round_0 = ROUND0_BITS; + conv_params.round_1 = 0; + conv_params.do_post_rounding = 0; + conv_params.is_compound = 0; + conv_params.dst = NULL; + conv_params.dst_stride = 0; + const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2; + if (bd < 12) assert(intbufrange <= 16); + if (intbufrange > 16) { + conv_params.round_0 += intbufrange - 16; + } + return conv_params; +} + static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average, int plane, int32_t *dst, int dst_stride, @@ 
-125,7 +126,8 @@ conv_params.is_compound = is_compound; conv_params.round_0 = ROUND0_BITS; #if CONFIG_LOWPRECISION_BLEND - conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS : 0; + conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS + : 2 * FILTER_BITS - conv_params.round_0; #else conv_params.round_1 = 0; #endif @@ -145,6 +147,11 @@ return conv_params; } +static INLINE ConvolveParams get_conv_params(int ref, int do_average, int plane, + int bd) { + return get_conv_params_no_round(ref, do_average, plane, NULL, 0, 0, bd); +} + void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilters interp_filters,
diff --git a/av1/common/mv.h b/av1/common/mv.h index 4c547fe..2f9011a 100644 --- a/av1/common/mv.h +++ b/av1/common/mv.h
@@ -58,7 +58,7 @@ #define WARP_PARAM_REDUCE_BITS 6 // Precision bits reduction after horizontal shear -#define HORSHEAR_REDUCE_PREC_BITS 5 +#define HORSHEAR_REDUCE_PREC_BITS 3 #define VERSHEAR_REDUCE_PREC_BITS \ (2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)
diff --git a/av1/common/scale.c b/av1/common/scale.c index a334bae..b43de0e 100644 --- a/av1/common/scale.c +++ b/av1/common/scale.c
@@ -185,7 +185,7 @@ // subpel_y_q4 == 0 sf->convolve[1][0][0] = av1_convolve_x_sr; // subpel_x_q4 != 0 && subpel_y_q4 != 0 - sf->convolve[1][1][0] = av1_convolve_2d_sr; + sf->convolve[1][1][0] = av1_convolve_2d_sr_c; #if CONFIG_JNT_COMP // subpel_x_q4 == 0 && subpel_y_q4 == 0 sf->convolve[0][0][1] = av1_jnt_convolve_2d_copy;
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c index 5daa2cf36..71cd85a 100644 --- a/av1/common/warped_motion.c +++ b/av1/common/warped_motion.c
@@ -422,19 +422,24 @@ ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { int32_t tmp[15 * 8]; - const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; - const int reduce_bits_horiz = + const int use_conv_params = + (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst); + int reduce_bits_horiz = use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; + if (!use_conv_params && + bd + WARPEDPIXEL_FILTER_BITS + 2 - reduce_bits_horiz > 16) + reduce_bits_horiz += bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 14; + const int reduce_bits_vert = + use_conv_params ? conv_params->round_1 + : 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz; const int max_bits_horiz = - use_conv_params - ? bd + FILTER_BITS + 1 - conv_params->round_0 - : bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS; + use_conv_params ? bd + FILTER_BITS + 1 - conv_params->round_0 + : bd + WARPEDPIXEL_FILTER_BITS + 1 - reduce_bits_horiz; const int offset_bits_horiz = use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1; const int offset_bits_vert = - use_conv_params - ? bd + 2 * FILTER_BITS - conv_params->round_0 - : bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS; + use_conv_params ? 
bd + 2 * FILTER_BITS - conv_params->round_0 + : bd + 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz; if (use_conv_params) { conv_params->do_post_rounding = 1; } @@ -534,7 +539,7 @@ } else { uint16_t *p = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; - sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS); + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); assert(0 <= sum && sum < (1 << (bd + 2))); uint16_t px = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd); @@ -719,9 +724,13 @@ int16_t gamma, int16_t delta) { int32_t tmp[15 * 8]; const int bd = 8; - const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; + const int use_conv_params = + (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst); const int reduce_bits_horiz = use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; + const int reduce_bits_vert = + use_conv_params ? conv_params->round_1 + : 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz; const int max_bits_horiz = use_conv_params ? bd + FILTER_BITS + 1 - conv_params->round_0 @@ -837,7 +846,7 @@ } else { uint8_t *p = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; - sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS); + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); assert(0 <= sum && sum < (1 << (bd + 2))); uint8_t px = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd)); if (conv_params->do_average)
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c index 3692b60..2843a91 100644 --- a/av1/common/x86/convolve_avx2.c +++ b/av1/common/x86/convolve_avx2.c
@@ -140,7 +140,7 @@ void av1_convolve_rounding_avx2(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits) { - const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1))); + const __m256i rnd_num = _mm256_set1_epi32((int32_t)((1 << bits) >> 1)); const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num); if (w > 64) { // width = 128 @@ -283,7 +283,7 @@ uint8_t *dst8, int dst_stride, int w, int h, int bits, int bd) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1))); + const __m256i rnd_num = _mm256_set1_epi32((int32_t)((1 << bits) >> 1)); const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num); if (w > 64) { // width = 128
diff --git a/av1/common/x86/highbd_warp_plane_sse4.c b/av1/common/x86/highbd_warp_plane_sse4.c index 0cd438a..e89ad8b 100644 --- a/av1/common/x86/highbd_warp_plane_sse4.c +++ b/av1/common/x86/highbd_warp_plane_sse4.c
@@ -22,15 +22,18 @@ ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { int comp_avg = conv_params->do_average; -#if HORSHEAR_REDUCE_PREC_BITS >= 5 __m128i tmp[15]; -#else -#error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter" -#endif int i, j, k; - const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; - const int reduce_bits_horiz = + const int use_conv_params = + (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst); + int reduce_bits_horiz = use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; + if (!use_conv_params && + bd + WARPEDPIXEL_FILTER_BITS + 2 - reduce_bits_horiz > 16) + reduce_bits_horiz += bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 14; + const int reduce_bits_vert = + use_conv_params ? conv_params->round_1 + : 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz; const int offset_bits_horiz = use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1; if (use_conv_params) { @@ -91,10 +94,9 @@ else if (iy > height - 1) iy = height - 1; tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - - 1)) + + (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz))); } } else if (ix4 >= width + 6) { for (k = -7; k < AOMMIN(8, p_height - i); ++k) { @@ -104,10 +106,9 @@ else if (iy > height - 1) iy = height - 1; tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - - 1)) + + (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride + (width - 1)] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz))); } } else { for (k = -7; k < AOMMIN(8, p_height - i); ++k) { @@ -361,13 +362,13 @@ } else { // Round and pack into 8 bits 
const __m128i round_const = - _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) + - ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1)); + _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); + _mm_add_epi32(res_lo, round_const), reduce_bits_vert); const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); + _mm_add_epi32(res_hi, round_const), reduce_bits_vert); __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); // Clamp res_16bit to the range [0, 2^bd - 1]
diff --git a/av1/common/x86/highbd_warp_plane_ssse3.c b/av1/common/x86/highbd_warp_plane_ssse3.c index dc727b6..e1d7f8e 100644 --- a/av1/common/x86/highbd_warp_plane_ssse3.c +++ b/av1/common/x86/highbd_warp_plane_ssse3.c
@@ -22,21 +22,25 @@ ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { int comp_avg = conv_params->do_average; -#if HORSHEAR_REDUCE_PREC_BITS >= 5 __m128i tmp[15]; -#else -#error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter" -#endif int i, j, k; - const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; - const int reduce_bits_horiz = + const int use_conv_params = + (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst); + int reduce_bits_horiz = use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; + if (!use_conv_params && + bd + WARPEDPIXEL_FILTER_BITS + 2 - reduce_bits_horiz > 16) + reduce_bits_horiz += bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 14; + const int reduce_bits_vert = + use_conv_params ? conv_params->round_1 + : 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz; const int offset_bits_horiz = use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1; if (use_conv_params) { conv_params->do_post_rounding = 1; } assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS); + if (bd == 12 && reduce_bits_horiz < 5) printf("Error\n"); /* Note: For this code to work, the left/right frame borders need to be extended by at least 13 pixels each. 
By the time we get here, other @@ -85,10 +89,9 @@ else if (iy > height - 1) iy = height - 1; tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - - 1)) + + (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz))); } } else if (ix4 >= width + 6) { for (k = -7; k < AOMMIN(8, p_height - i); ++k) { @@ -98,10 +101,9 @@ else if (iy > height - 1) iy = height - 1; tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - - 1)) + + (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride + (width - 1)] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz))); } } else { for (k = -7; k < AOMMIN(8, p_height - i); ++k) { @@ -320,13 +322,13 @@ } else { // Round and pack into 8 bits const __m128i round_const = - _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) + - ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1)); + _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); + _mm_add_epi32(res_lo, round_const), reduce_bits_vert); const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); + _mm_add_epi32(res_hi, round_const), reduce_bits_vert); __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); // Clamp res_16bit to the range [0, 2^bd - 1]
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c index 75ed82b..d330cd3 100644 --- a/av1/common/x86/warp_plane_sse2.c +++ b/av1/common/x86/warp_plane_sse2.c
@@ -24,7 +24,8 @@ __m128i tmp[15]; int i, j, k; const int bd = 8; - const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; + const int use_conv_params = + (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst); const int reduce_bits_horiz = use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; const int offset_bits_horiz = @@ -81,10 +82,9 @@ else if (iy > height - 1) iy = height - 1; tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - - 1)) + + (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz))); } } else if (ix4 >= width + 6) { for (k = -7; k < AOMMIN(8, p_height - i); ++k) { @@ -94,10 +94,9 @@ else if (iy > height - 1) iy = height - 1; tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - - 1)) + + (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride + (width - 1)] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz))); } } else { for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
diff --git a/av1/common/x86/warp_plane_sse4.c b/av1/common/x86/warp_plane_sse4.c index 2c97704..b421533 100644 --- a/av1/common/x86/warp_plane_sse4.c +++ b/av1/common/x86/warp_plane_sse4.c
@@ -212,7 +212,8 @@ __m128i tmp[15]; int i, j, k; const int bd = 8; - const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; + const int use_conv_params = + (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst); const int reduce_bits_horiz = use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; const int offset_bits_horiz = @@ -275,10 +276,9 @@ else if (iy > height - 1) iy = height - 1; tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - - 1)) + + (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz))); } } else if (ix4 >= width + 6) { for (k = -7; k < AOMMIN(8, p_height - i); ++k) { @@ -288,10 +288,9 @@ else if (iy > height - 1) iy = height - 1; tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - - 1)) + + (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride + (width - 1)] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz))); } } else { for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
diff --git a/av1/common/x86/warp_plane_ssse3.c b/av1/common/x86/warp_plane_ssse3.c index b0501e9..f18dad1 100644 --- a/av1/common/x86/warp_plane_ssse3.c +++ b/av1/common/x86/warp_plane_ssse3.c
@@ -211,7 +211,8 @@ __m128i tmp[15]; int i, j, k; const int bd = 8; - const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; + const int use_conv_params = + (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst); const int reduce_bits_horiz = use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; const int offset_bits_horiz = @@ -268,10 +269,9 @@ else if (iy > height - 1) iy = height - 1; tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - - 1)) + + (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz))); } } else if (ix4 >= width + 6) { for (k = -7; k < AOMMIN(8, p_height - i); ++k) { @@ -281,10 +281,9 @@ else if (iy > height - 1) iy = height - 1; tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - - 1)) + + (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride + (width - 1)] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz))); } } else { for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
diff --git a/test/av1_convolve_optimz_test.cc b/test/av1_convolve_optimz_test.cc index 288daeb..0900aa1 100644 --- a/test/av1_convolve_optimz_test.cc +++ b/test/av1_convolve_optimz_test.cc
@@ -66,7 +66,7 @@ subpel_ = GET_PARAM(4); int ref = GET_PARAM(5); const int plane = 0; - conv_params_ = get_conv_params(ref, ref, plane, 8); + conv_params_ = get_conv_params_round(ref, ref, plane, 8); alloc_ = new uint8_t[maxBlockSize * 4]; src_ = alloc_ + (vertiOffset * maxWidth);
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc index cb58289..397fc70 100644 --- a/test/av1_convolve_test.cc +++ b/test/av1_convolve_test.cc
@@ -149,7 +149,7 @@ TEST_P(Av1ConvolveTest, av1_convolve_vert) { const int y_step_q4 = 16; - ConvolveParams conv_params = get_conv_params(0, 0, 0, 8); + ConvolveParams conv_params = get_conv_params_round(0, 0, 0, 8); int in_stride, out_stride, ref_out_stride, avg_out_stride, ref_avg_out_stride; uint8_t *in = add_input(MAX_SB_SIZE, MAX_SB_SIZE, &in_stride); @@ -202,7 +202,7 @@ TEST_P(Av1ConvolveTest, av1_convolve_horiz) { const int x_step_q4 = 16; - ConvolveParams conv_params = get_conv_params(0, 0, 0, 8); + ConvolveParams conv_params = get_conv_params_round(0, 0, 0, 8); int in_stride, out_stride, ref_out_stride, avg_out_stride, ref_avg_out_stride; uint8_t *in = add_input(MAX_SB_SIZE, MAX_SB_SIZE, &in_stride);