Optimize av1_jnt_convolve_2d_copy function. With shift, convolve copy no longer needs 32-bit multiplication of two 8-bit numbers; thus we can implement it with sse2 instead of sse4. Change-Id: I63e8ba414383a24f820bad4a6c607f222ec40ec2
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index abbcbf6..ea4fd32 100755 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -602,7 +602,7 @@ if (aom_config("CONFIG_COMPOUND_ROUND") ne "yes") { add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; - specialize qw/av1_jnt_convolve_2d_copy sse4_1/; + specialize qw/av1_jnt_convolve_2d_copy sse2/; } }
diff --git a/av1/common/convolve.c b/av1/common/convolve.c index 56c6d14..17df00f 100644 --- a/av1/common/convolve.c +++ b/av1/common/convolve.c
@@ -684,7 +684,7 @@ for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { - CONV_BUF_TYPE res = (1 << bits) * src[y * src_stride + x]; + CONV_BUF_TYPE res = src[y * src_stride + x] << bits; if (conv_params->do_average) dst[y * dst_stride + x] += res; else @@ -776,12 +776,14 @@ CONV_BUF_TYPE res = (1 << bits) * src[y * src_stride + x]; if (conv_params->use_jnt_comp_avg) { if (conv_params->do_average) { - dst[y * dst_stride + x] += res * conv_params->bck_offset; + dst[y * dst_stride + x] += + (src[y * src_stride + x] * conv_params->bck_offset) << bits; dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x], DIST_PRECISION_BITS - 1); } else { - dst[y * dst_stride + x] = res * conv_params->fwd_offset; + dst[y * dst_stride + x] = + (src[y * src_stride + x] * conv_params->fwd_offset) << bits; } } else { if (conv_params->do_average)
diff --git a/av1/common/x86/convolve_2d_sse2.c b/av1/common/x86/convolve_2d_sse2.c index 19f01be..13275b6 100644 --- a/av1/common/x86/convolve_2d_sse2.c +++ b/av1/common/x86/convolve_2d_sse2.c
@@ -385,16 +385,17 @@ InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const int do_average = conv_params->do_average; const __m128i zero = _mm_setzero_si128(); const __m128i left_shift = _mm_cvtsi32_si128(bits); int i, j; - (void)filter_params_x; - (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; if (!(w % 16)) { for (i = 0; i < h; ++i) { @@ -489,4 +490,212 @@ } } } -#endif + +#if CONFIG_JNT_COMP +void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, + CONV_BUF_TYPE *dst, int dst_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int do_average = conv_params->do_average; + const __m128i zero = _mm_setzero_si128(); + const __m128i left_shift = _mm_cvtsi32_si128(bits); + int i, j; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + const int jnt_round_const = 1 << (DIST_PRECISION_BITS - 2); + const __m128i jnt_r = _mm_set1_epi32(jnt_round_const); + + if (!(w % 16)) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]); + const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero); + const __m128i d16_1 = _mm_unpackhi_epi8(d8, zero); + __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero); + __m128i d32_1 = _mm_unpackhi_epi16(d16_0, zero); + __m128i d32_2 = _mm_unpacklo_epi16(d16_1, zero); + __m128i d32_3 = 
_mm_unpackhi_epi16(d16_1, zero); + + __m128i *const p = (__m128i *)&dst[j]; + + if (conv_params->use_jnt_comp_avg) { + if (do_average) { + __m128i mul = _mm_mullo_epi16(d32_0, wt1); + __m128i weighted_res = _mm_sll_epi32(mul, left_shift); + __m128i sum = _mm_add_epi32(_mm_loadu_si128(p + 0), weighted_res); + d32_0 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r), + DIST_PRECISION_BITS - 1); + + mul = _mm_mullo_epi16(d32_1, wt1); + weighted_res = _mm_sll_epi32(mul, left_shift); + sum = _mm_add_epi32(_mm_loadu_si128(p + 1), weighted_res); + d32_1 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r), + DIST_PRECISION_BITS - 1); + + mul = _mm_mullo_epi16(d32_2, wt1); + weighted_res = _mm_sll_epi32(mul, left_shift); + sum = _mm_add_epi32(_mm_loadu_si128(p + 2), weighted_res); + d32_2 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r), + DIST_PRECISION_BITS - 1); + + mul = _mm_mullo_epi16(d32_3, wt1); + weighted_res = _mm_sll_epi32(mul, left_shift); + sum = _mm_add_epi32(_mm_loadu_si128(p + 3), weighted_res); + d32_3 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r), + DIST_PRECISION_BITS - 1); + } else { + d32_0 = _mm_sll_epi32(_mm_mullo_epi16(d32_0, wt0), left_shift); + d32_1 = _mm_sll_epi32(_mm_mullo_epi16(d32_1, wt0), left_shift); + d32_2 = _mm_sll_epi32(_mm_mullo_epi16(d32_2, wt0), left_shift); + d32_3 = _mm_sll_epi32(_mm_mullo_epi16(d32_3, wt0), left_shift); + } + } else { + if (do_average) { + d32_0 = _mm_add_epi32(_mm_loadu_si128(p + 0), + _mm_sll_epi32(d32_0, left_shift)); + d32_1 = _mm_add_epi32(_mm_loadu_si128(p + 1), + _mm_sll_epi32(d32_1, left_shift)); + d32_2 = _mm_add_epi32(_mm_loadu_si128(p + 2), + _mm_sll_epi32(d32_2, left_shift)); + d32_3 = _mm_add_epi32(_mm_loadu_si128(p + 3), + _mm_sll_epi32(d32_3, left_shift)); + } else { + d32_0 = _mm_sll_epi32(d32_0, left_shift); + d32_1 = _mm_sll_epi32(d32_1, left_shift); + d32_2 = _mm_sll_epi32(d32_2, left_shift); + d32_3 = _mm_sll_epi32(d32_3, left_shift); + } + } + + _mm_storeu_si128(p + 0, d32_0); + _mm_storeu_si128(p + 1, d32_1); + 
_mm_storeu_si128(p + 2, d32_2); + _mm_storeu_si128(p + 3, d32_3); + } + src += src_stride; + dst += dst_stride; + } + } else if (!(w % 8)) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]); + const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero); + __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero); + __m128i d32_1 = _mm_unpackhi_epi16(d16_0, zero); + + __m128i *const p = (__m128i *)&dst[j]; + if (conv_params->use_jnt_comp_avg) { + if (do_average) { + __m128i mul = _mm_mullo_epi16(d32_0, wt1); + __m128i weighted_res = _mm_sll_epi32(mul, left_shift); + __m128i sum = _mm_add_epi32(_mm_loadu_si128(p + 0), weighted_res); + d32_0 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r), + DIST_PRECISION_BITS - 1); + + mul = _mm_mullo_epi16(d32_1, wt1); + weighted_res = _mm_sll_epi32(mul, left_shift); + sum = _mm_add_epi32(_mm_loadu_si128(p + 1), weighted_res); + d32_1 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r), + DIST_PRECISION_BITS - 1); + } else { + d32_0 = _mm_sll_epi32(_mm_mullo_epi16(d32_0, wt0), left_shift); + d32_1 = _mm_sll_epi32(_mm_mullo_epi16(d32_1, wt0), left_shift); + } + } else { + if (do_average) { + d32_0 = _mm_add_epi32(_mm_loadu_si128(p + 0), + _mm_sll_epi32(d32_0, left_shift)); + d32_1 = _mm_add_epi32(_mm_loadu_si128(p + 1), + _mm_sll_epi32(d32_1, left_shift)); + } else { + d32_0 = _mm_sll_epi32(d32_0, left_shift); + d32_1 = _mm_sll_epi32(d32_1, left_shift); + } + } + + _mm_storeu_si128(p + 0, d32_0); + _mm_storeu_si128(p + 1, d32_1); + } + src += src_stride; + dst += dst_stride; + } + } else if (!(w % 4)) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 4) { + const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]); + const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero); + __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero); + + __m128i *const p = (__m128i *)&dst[j]; + if (conv_params->use_jnt_comp_avg) { + if (do_average) { + __m128i mul = _mm_mullo_epi16(d32_0, wt1); + __m128i weighted_res = 
_mm_sll_epi32(mul, left_shift); + __m128i sum = _mm_add_epi32(_mm_loadu_si128(p + 0), weighted_res); + d32_0 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r), + DIST_PRECISION_BITS - 1); + } else { + d32_0 = _mm_sll_epi32(_mm_mullo_epi16(d32_0, wt0), left_shift); + } + } else { + if (do_average) { + d32_0 = _mm_add_epi32(_mm_loadu_si128(p + 0), + _mm_sll_epi32(d32_0, left_shift)); + } else { + d32_0 = _mm_sll_epi32(d32_0, left_shift); + } + } + + _mm_storeu_si128(p, d32_0); + } + src += src_stride; + dst += dst_stride; + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 2) { + const __m128i d8 = _mm_cvtsi32_si128(*(const int *)&src[j]); + const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero); + __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero); + + __m128i *const p = (__m128i *)&dst[j]; + if (conv_params->use_jnt_comp_avg) { + if (do_average) { + __m128i mul = _mm_mullo_epi16(d32_0, wt1); + __m128i weighted_res = _mm_sll_epi32(mul, left_shift); + __m128i sum = _mm_add_epi32(_mm_loadl_epi64(p), weighted_res); + d32_0 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r), + DIST_PRECISION_BITS - 1); + } else { + d32_0 = _mm_sll_epi32(_mm_mullo_epi16(d32_0, wt0), left_shift); + } + } else { + if (do_average) { + d32_0 = _mm_add_epi32(_mm_loadl_epi64(p), + _mm_sll_epi32(d32_0, left_shift)); + } else { + d32_0 = _mm_sll_epi32(d32_0, left_shift); + } + } + + _mm_storel_epi64(p, d32_0); + } + src += src_stride; + dst += dst_stride; + } + } +} +#endif // CONFIG_JNT_COMP +#endif // CONFIG_COMPOUND_ROUND
diff --git a/av1/common/x86/convolve_2d_sse4.c b/av1/common/x86/convolve_2d_sse4.c index 893036b..71c32e7 100644 --- a/av1/common/x86/convolve_2d_sse4.c +++ b/av1/common/x86/convolve_2d_sse4.c
@@ -450,194 +450,5 @@ } } } - -void av1_jnt_convolve_2d_copy_sse4_1(const uint8_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, - const int subpel_y_q4, - ConvolveParams *conv_params) { - const int bits = - FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; - const int do_average = conv_params->do_average; - const __m128i zero = _mm_setzero_si128(); - const __m128i left_shift = _mm_cvtsi32_si128(bits); - int i, j; - (void)filter_params_x; - (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; - - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi32(w0); - const __m128i wt1 = _mm_set1_epi32(w1); - const int jnt_round_const = 1 << (DIST_PRECISION_BITS - 2); - const __m128i jnt_r = _mm_set1_epi32(jnt_round_const); - - if (!(w % 16)) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]); - const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero); - const __m128i d16_1 = _mm_unpackhi_epi8(d8, zero); - __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero); - __m128i d32_1 = _mm_unpackhi_epi16(d16_0, zero); - __m128i d32_2 = _mm_unpacklo_epi16(d16_1, zero); - __m128i d32_3 = _mm_unpackhi_epi16(d16_1, zero); - - d32_0 = _mm_sll_epi32(d32_0, left_shift); - d32_1 = _mm_sll_epi32(d32_1, left_shift); - d32_2 = _mm_sll_epi32(d32_2, left_shift); - d32_3 = _mm_sll_epi32(d32_3, left_shift); - - __m128i *const p = (__m128i *)&dst[j]; - - if (conv_params->use_jnt_comp_avg) { - if (do_average) { - __m128i weighted_res = _mm_mullo_epi32(d32_0, wt1); - __m128i sum = _mm_add_epi32(_mm_loadu_si128(p + 0), weighted_res); - d32_0 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r), - DIST_PRECISION_BITS - 1); - - weighted_res = _mm_mullo_epi32(d32_1, wt1); - sum = _mm_add_epi32(_mm_loadu_si128(p + 1), weighted_res); - d32_1 = 
_mm_srai_epi32(_mm_add_epi32(sum, jnt_r), - DIST_PRECISION_BITS - 1); - - weighted_res = _mm_mullo_epi32(d32_2, wt1); - sum = _mm_add_epi32(_mm_loadu_si128(p + 2), weighted_res); - d32_2 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r), - DIST_PRECISION_BITS - 1); - - weighted_res = _mm_mullo_epi32(d32_3, wt1); - sum = _mm_add_epi32(_mm_loadu_si128(p + 3), weighted_res); - d32_3 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r), - DIST_PRECISION_BITS - 1); - } else { - d32_0 = _mm_mullo_epi32(d32_0, wt0); - d32_1 = _mm_mullo_epi32(d32_1, wt0); - d32_2 = _mm_mullo_epi32(d32_2, wt0); - d32_3 = _mm_mullo_epi32(d32_3, wt0); - } - } else { - if (do_average) { - d32_0 = _mm_add_epi32(_mm_loadu_si128(p + 0), d32_0); - d32_1 = _mm_add_epi32(_mm_loadu_si128(p + 1), d32_1); - d32_2 = _mm_add_epi32(_mm_loadu_si128(p + 2), d32_2); - d32_3 = _mm_add_epi32(_mm_loadu_si128(p + 3), d32_3); - } - } - - _mm_storeu_si128(p + 0, d32_0); - _mm_storeu_si128(p + 1, d32_1); - _mm_storeu_si128(p + 2, d32_2); - _mm_storeu_si128(p + 3, d32_3); - } - src += src_stride; - dst += dst_stride; - } - } else if (!(w % 8)) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]); - const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero); - __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero); - __m128i d32_1 = _mm_unpackhi_epi16(d16_0, zero); - - d32_0 = _mm_sll_epi32(d32_0, left_shift); - d32_1 = _mm_sll_epi32(d32_1, left_shift); - - __m128i *const p = (__m128i *)&dst[j]; - if (conv_params->use_jnt_comp_avg) { - if (do_average) { - __m128i weighted_res = _mm_mullo_epi32(d32_0, wt1); - __m128i sum = _mm_add_epi32(_mm_loadu_si128(p + 0), weighted_res); - d32_0 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r), - DIST_PRECISION_BITS - 1); - - weighted_res = _mm_mullo_epi32(d32_1, wt1); - sum = _mm_add_epi32(_mm_loadu_si128(p + 1), weighted_res); - d32_1 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r), - DIST_PRECISION_BITS - 1); - } else { - d32_0 = _mm_mullo_epi32(d32_0, 
wt0); - d32_1 = _mm_mullo_epi32(d32_1, wt0); - } - } else { - if (do_average) { - d32_0 = _mm_add_epi32(_mm_loadu_si128(p + 0), d32_0); - d32_1 = _mm_add_epi32(_mm_loadu_si128(p + 1), d32_1); - } - } - - _mm_storeu_si128(p + 0, d32_0); - _mm_storeu_si128(p + 1, d32_1); - } - src += src_stride; - dst += dst_stride; - } - } else if (!(w % 4)) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 4) { - const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]); - const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero); - __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero); - - d32_0 = _mm_sll_epi32(d32_0, left_shift); - - __m128i *const p = (__m128i *)&dst[j]; - if (conv_params->use_jnt_comp_avg) { - if (do_average) { - __m128i weighted_res = _mm_mullo_epi32(d32_0, wt1); - __m128i sum = _mm_add_epi32(_mm_loadu_si128(p + 0), weighted_res); - d32_0 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r), - DIST_PRECISION_BITS - 1); - } else { - d32_0 = _mm_mullo_epi32(d32_0, wt0); - } - } else { - if (do_average) { - d32_0 = _mm_add_epi32(_mm_loadu_si128(p + 0), d32_0); - } - } - - _mm_storeu_si128(p, d32_0); - } - src += src_stride; - dst += dst_stride; - } - } else { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 2) { - const __m128i d8 = _mm_cvtsi32_si128(*(const int *)&src[j]); - const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero); - __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero); - - d32_0 = _mm_sll_epi32(d32_0, left_shift); - __m128i *const p = (__m128i *)&dst[j]; - if (conv_params->use_jnt_comp_avg) { - if (do_average) { - __m128i weighted_res = _mm_mullo_epi32(d32_0, wt1); - __m128i sum = _mm_add_epi32(_mm_loadl_epi64(p), weighted_res); - d32_0 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r), - DIST_PRECISION_BITS - 1); - } else { - d32_0 = _mm_mullo_epi32(d32_0, wt0); - } - } else { - if (do_average) { - d32_0 = _mm_add_epi32(_mm_loadl_epi64(p), d32_0); - } - } - - _mm_storel_epi64(p, d32_0); - } - src += src_stride; - dst += dst_stride; - } - } -} #endif // CONFIG_COMPOUND_ROUND 
#endif // CONFIG_JNT_COMP