Some fixes and clean-ups on convolve functions. Make av1_convolve_x_sr_sse2/avx2 support various bit-shift options. Add asserts to the convolve functions. Change-Id: Ib6d1ada6c00a20e6e498af2672bd0bb76040d7d0
diff --git a/av1/common/convolve.c b/av1/common/convolve.c index 9c44041..a2fb693 100644 --- a/av1/common/convolve.c +++ b/av1/common/convolve.c
@@ -447,6 +447,8 @@ (void)dst0; (void)dst_stride0; + assert(bits >= 0); + // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( *filter_params_y, subpel_y_q4 & SUBPEL_MASK); @@ -481,6 +483,8 @@ (void)dst0; (void)dst_stride0; + assert(bits >= 0); + // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( *filter_params_x, subpel_x_q4 & SUBPEL_MASK); @@ -590,6 +594,10 @@ (void)subpel_x_q4; (void)conv_params; + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( *filter_params_y, subpel_y_q4 & SUBPEL_MASK); @@ -617,6 +625,10 @@ (void)subpel_y_q4; (void)conv_params; + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
diff --git a/av1/common/x86/convolve_2d_avx2.c b/av1/common/x86/convolve_2d_avx2.c index 396e80f..9c1a32b 100644 --- a/av1/common/x86/convolve_2d_avx2.c +++ b/av1/common/x86/convolve_2d_avx2.c
@@ -43,6 +43,8 @@ __m256i filt[4], s[8], coeffs_x[4], coeffs_y[4]; + assert(conv_params->round_0 > 0); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); @@ -176,6 +178,8 @@ __m256i filt[4], coeffs_h[4], coeffs_v[4]; + assert(conv_params->round_0 > 0); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
diff --git a/av1/common/x86/convolve_2d_sse2.c b/av1/common/x86/convolve_2d_sse2.c index f2c3561..96a6042 100644 --- a/av1/common/x86/convolve_2d_sse2.c +++ b/av1/common/x86/convolve_2d_sse2.c
@@ -41,6 +41,8 @@ const __m128i zero = _mm_setzero_si128(); + assert(conv_params->round_0 > 0); + /* Horizontal filter */ { const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( @@ -226,6 +228,8 @@ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + assert(conv_params->round_0 > 0); + /* Horizontal filter */ { const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c index 2843a91..c4d7447 100644 --- a/av1/common/x86/convolve_avx2.c +++ b/av1/common/x86/convolve_avx2.c
@@ -359,6 +359,8 @@ const __m256i avg_mask = _mm256_set1_epi32(conv_params->do_average ? -1 : 0); __m256i coeffs[4], s[8]; + assert((FILTER_BITS - conv_params->round_0) >= 0); + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs); (void)conv_params; @@ -514,6 +516,10 @@ _mm256_set1_epi16((1 << right_shift_bits) >> 1); __m256i coeffs[4], s[8]; + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs); (void)filter_params_x; @@ -665,6 +671,9 @@ __m256i filt[4], coeffs[4]; + assert(bits >= 0); + assert(conv_params->round_0 > 0); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); @@ -720,6 +729,7 @@ int i, j; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *const src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_0; __m256i filt[4], coeffs[4]; @@ -730,14 +740,20 @@ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs); - const __m256i round_const = - _mm256_set1_epi16(((1 << (conv_params->round_0 - 1)) >> 1) + - ((1 << (FILTER_BITS - 1)) >> 1)); - const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS - 1); + const __m256i round_0_const = + _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); + const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); + const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(bits); (void)filter_params_y; (void)subpel_y_q4; + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + assert(conv_params->round_0 > 0); + for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 16) { // 0 1 2 3 4 5 
6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18 @@ -748,7 +764,9 @@ __m256i res_16b = convolve_x(data, coeffs, filt); - // Combine V round and 2F-H-V round into a single rounding + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift);
diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c index f8081d2..ab35226 100644 --- a/av1/common/x86/convolve_sse2.c +++ b/av1/common/x86/convolve_sse2.c
@@ -105,6 +105,8 @@ (void)dst0; (void)dst_stride0; + assert(bits >= 0); + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs); if (w == 4) { @@ -252,6 +254,8 @@ (void)dst0; (void)dst_stride0; + assert(bits >= 0); + prepare_coeffs(filter_params_x, subpel_x_q4, coeffs); if (w == 4) { @@ -335,6 +339,10 @@ (void)subpel_x_q4; (void)conv_params; + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs); if (w <= 4) { @@ -484,14 +492,21 @@ ConvolveParams *conv_params) { const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - fo_horiz; - const __m128i round_const = _mm_set1_epi32( - ((1 << conv_params->round_0) >> 1) + (1 << (FILTER_BITS - 1))); - const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS); + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i round_0_const = + _mm_set1_epi32((1 << conv_params->round_0) >> 1); + const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1); + const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift = _mm_cvtsi32_si128(bits); __m128i coeffs[4]; (void)filter_params_y; (void)subpel_y_q4; + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + prepare_coeffs(filter_params_x, subpel_x_q4, coeffs); if (w <= 4) { @@ -507,8 +522,10 @@ s[3] = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7)); const __m128i res_lo = convolve_lo_x(s, coeffs); - const __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift); + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift); const __m128i res16 = 
_mm_packs_epi32(res_lo_round, res_lo_round); const __m128i res = _mm_packus_epi16(res16, res16); @@ -549,10 +566,14 @@ // Rearrange pixels back into the order 0 ... 7 const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - const __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); - const __m128i res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift); + res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), + round_shift); + __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_0_const), round_0_shift); + res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), + round_shift); const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); const __m128i res = _mm_packus_epi16(res16, res16);