AVX2 optimization of motion compensation function AVX2 implementation of av1_highbd_jnt_convolve_x_c has been added. ASSERT for (FILTER_BITS - conv_params->round_1) >=0 has been included in function av1_highbd_jnt_convolve_x_c. RunSpeedTest for AV1HighbdJntConvolve2DTest has been added. It is disabled by default. av1_highbd_jnt_convolve_x_avx2 - ~13.8 times faster than C implementation. Change-Id: I12c15b5a9894f2b95bf959123578916b6303f7b6
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 9f46879..4f13269 100755 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -544,6 +544,7 @@ specialize qw/av1_highbd_jnt_convolve_2d sse4_1 avx2/; add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; + specialize qw/av1_highbd_jnt_convolve_x avx2/; add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
diff --git a/av1/common/convolve.c b/av1/common/convolve.c index a8fac2a..c2992b5 100644 --- a/av1/common/convolve.c +++ b/av1/common/convolve.c
@@ -1407,6 +1407,7 @@ (void)dst_stride0; (void)bd; + assert(bits >= 0); // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
diff --git a/av1/common/x86/highbd_jnt_convolve_avx2.c b/av1/common/x86/highbd_jnt_convolve_avx2.c index 54b557f..847e2d2 100644 --- a/av1/common/x86/highbd_jnt_convolve_avx2.c +++ b/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License @@ -275,3 +275,113 @@ } } #endif + +#if CONFIG_JNT_COMP +void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, + uint16_t *dst0, int dst_stride0, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, + const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_1; + (void)filter_params_y; + (void)subpel_y_q4; + (void)dst0; + (void)dst_stride0; + (void)bd; + + int i, j; + __m256i s[4], coeffs_x[4]; + + const int do_average = conv_params->do_average; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m128i wt0_128 = _mm256_castsi256_si128(wt0); + const __m128i wt1_128 = _mm256_castsi256_si128(wt1); + + const __m256i round_const_x = + _mm256_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + assert(bits >= 0); + prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + res_even = _mm256_sll_epi32(res_even, round_shift_bits); + res_odd = _mm256_sll_epi32(res_odd, round_shift_bits); + + __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd); + + if (w - j > 4) { + __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd); + + const __m256i res_ax = _mm256_permute2x128_si256(res1, res2, 0x20); + const __m256i res_bx = _mm256_permute2x128_si256(res1, res2, 0x31); + + if (conv_params->use_jnt_comp_avg) { + mult_add_store_aligned_256(&dst[i * dst_stride + j], &res_ax, &wt0, + &wt1, do_average); + mult_add_store_aligned_256(&dst[i * dst_stride + j + dst_stride], + &res_bx, &wt0, &wt1, do_average); + } else { + add_store_aligned_256(&dst[i * dst_stride + j], &res_ax, do_average); + add_store_aligned_256(&dst[i * dst_stride + j + dst_stride], &res_bx, + do_average); + } + } else { + const __m128i res_ax = _mm256_castsi256_si128(res1); + const __m128i res_bx = _mm256_extracti128_si256(res1, 1); + + if (conv_params->use_jnt_comp_avg) { + mult_add_store(&dst[i * dst_stride + j], &res_ax, &wt0_128, &wt1_128, + do_average); + mult_add_store(&dst[i * dst_stride + j + dst_stride], &res_bx, + &wt0_128, &wt1_128, do_average); + } else { + add_store(&dst[i * dst_stride + j], &res_ax, do_average); + add_store(&dst[i * dst_stride + j + dst_stride], &res_bx, do_average); + } + } + } + } +} + +#endif
diff --git a/test/av1_convolve_2d_test.cc b/test/av1_convolve_2d_test.cc index 90512b0..735c1ae 100644 --- a/test/av1_convolve_2d_test.cc +++ b/test/av1_convolve_2d_test.cc
@@ -230,6 +230,10 @@ RunCheckOutput(GET_PARAM(1)); } +TEST_P(AV1HighbdJntConvolve2DTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(1)); +} + INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdJntConvolve2DTest, libaom_test::AV1HighbdConvolve2D::BuildParams( av1_highbd_jnt_convolve_2d_sse4_1, 1, 1, 1)); @@ -252,6 +256,9 @@ INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdJntConvolve2DTest, libaom_test::AV1HighbdConvolve2D::BuildParams( av1_highbd_jnt_convolve_2d_avx2, 1, 1, 1)); +INSTANTIATE_TEST_CASE_P(AVX2_X, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_jnt_convolve_x_avx2, 1, 0, 1)); #endif #endif // CONFIG_JNT_COMP #endif
diff --git a/test/av1_convolve_2d_test_util.cc b/test/av1_convolve_2d_test_util.cc index b166630..81df68d 100644 --- a/test/av1_convolve_2d_test_util.cc +++ b/test/av1_convolve_2d_test_util.cc
@@ -573,6 +573,59 @@ void AV1HighbdJntConvolve2DTest::TearDown() { libaom_test::ClearSystemState(); } +void AV1HighbdJntConvolve2DTest::RunSpeedTest( + highbd_convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int bd = GET_PARAM(0); + // const int has_subx = GET_PARAM(2); + // const int has_suby = GET_PARAM(3); + int hfilter, vfilter, subx, suby; + uint16_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, CONV_BUF_TYPE, output[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) + input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + for (int i = 0; i < MAX_SB_SQUARE; ++i) output[i] = rnd_.Rand31(); + hfilter = EIGHTTAP_REGULAR; + vfilter = EIGHTTAP_REGULAR; + int do_average = 0; + for (int block_idx = BLOCK_4X4; block_idx < BLOCK_SIZES_ALL; ++block_idx) { + const int out_w = block_size_wide[block_idx]; + const int out_h = block_size_high[block_idx]; + + InterpFilterParams filter_params_x = + av1_get_interp_filter_params((InterpFilter)hfilter); + InterpFilterParams filter_params_y = + av1_get_interp_filter_params((InterpFilter)vfilter); + + ConvolveParams conv_params = + get_conv_params_no_round(0, do_average, 0, output, MAX_SB_SIZE, 1, bd); + + // Test special case where jnt_comp_avg is not used + conv_params.use_jnt_comp_avg = 0; + + subx = 0; + suby = 0; + // Choose random locations within the source block + const int offset_r = 3; + const int offset_c = 3; + + const int num_loops = 1000000000 / (out_w + out_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < num_loops; ++i) + test_impl(input + offset_r * w + offset_c, w, NULL, 0, out_w, out_h, + &filter_params_x, &filter_params_y, subx, suby, &conv_params, + bd); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("convolve %3dx%-3d: %7.2f ns\n", out_w, out_h, + 1000.0 * elapsed_time / num_loops); + } +} + void AV1HighbdJntConvolve2DTest::RunCheckOutput( highbd_convolve_2d_func test_impl) { const int w = kMaxSize, h = kMaxSize;
diff --git a/test/av1_convolve_2d_test_util.h b/test/av1_convolve_2d_test_util.h index 7f4c919..9b43938 100644 --- a/test/av1_convolve_2d_test_util.h +++ b/test/av1_convolve_2d_test_util.h
@@ -134,6 +134,7 @@ protected: void RunCheckOutput(highbd_convolve_2d_func test_impl); + void RunSpeedTest(highbd_convolve_2d_func test_impl); libaom_test::ACMRandom rnd_; };