AVX2: Add optimization for sad_16xhx4d Around 5~10% improvement on functional level. Performance from this series of commits: RTC: | SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T | |---------|------------|----------|----------|---------|-------| | 5 | rtc | +0.000% | +0.000% | +0.000% | -0.7% | | 5 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.4% | | 5 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.8% | | 5 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.2% | |---------|------------|----------|----------|---------|-------| | 6 | rtc | +0.000% | +0.000% | +0.000% | -0.8% | | 6 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.5% | | 6 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.9% | | 6 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.3% | |---------|------------|----------|----------|---------|-------| | 7 | rtc | +0.000% | +0.000% | +0.000% | -0.5% | | 7 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.5% | | 7 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.6% | | 7 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.2% | |---------|------------|----------|----------|---------|-------| | 8 | rtc | +0.000% | +0.000% | +0.000% | -0.5% | | 8 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.3% | | 8 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.5% | | 8 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.2% | |---------|------------|----------|----------|---------|-------| | 9 | rtc | +0.000% | +0.000% | +0.000% | -0.6% | | 9 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.3% | | 9 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.5% | | 9 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.5% | |---------|------------|----------|----------|---------|-------| | 10 | rtc | +0.000% | +0.000% | +0.000% | -0.5% | | 10 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.3% | | 10 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.5% | | 10 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.5% | VOD: | SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T | |---------|---------|----------|----------|---------|-------| | 1 | hdres2 | +0.000% | +0.000% | +0.000% | -0.5% | | 1 | lowres2 | +0.000% | +0.000% | +0.000% | -0.1% | | 1 | midres2 | +0.000% | +0.000% | +0.000% | -0.2% | |---------|---------|----------|----------|---------|-------| | 2 | hdres2 | +0.000% | +0.000% | +0.000% | -0.4% | | 2 | lowres2 | +0.000% | +0.000% | +0.000% | -0.1% | | 2 | midres2 | +0.000% | +0.000% | +0.000% | -0.2% | |---------|---------|----------|----------|---------|-------| | 3 | hdres2 | +0.000% | +0.000% | +0.000% | -0.8% | | 3 | lowres2 | +0.000% | +0.000% | +0.000% | -0.3% | | 3 | midres2 | +0.000% | +0.000% | +0.000% | -0.5% | |---------|---------|----------|----------|---------|-------| | 4 | hdres2 | +0.000% | +0.000% | +0.000% | -1.1% | | 4 | lowres2 | +0.000% | +0.000% | +0.000% | -0.4% | | 4 | midres2 | +0.000% | +0.000% | +0.000% | -0.7% | |---------|---------|----------|----------|---------|-------| | 5 | hdres2 | +0.000% | +0.000% | +0.000% | -1.5% | | 5 | lowres2 | +0.000% | +0.000% | +0.000% | -0.6% | | 5 | midres2 | +0.000% | +0.000% | +0.000% | -0.9% | |---------|---------|----------|----------|---------|-------| | 6 | hdres2 | +0.000% | +0.000% | +0.000% | -1.6% | | 6 | lowres2 | +0.000% | +0.000% | +0.000% | -0.7% | | 6 | midres2 | +0.000% | +0.000% | +0.000% | -1.1% | BUG=aomedia:3358 Change-Id: I63dd9113e07314e0238268b1f04a8d4b2e3397c1
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index 427a3dc..4c22060 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1013,9 +1013,9 @@ specialize qw/aom_sad32x64x4d avx2 neon sse2/; specialize qw/aom_sad32x32x4d avx2 neon sse2/; specialize qw/aom_sad32x16x4d avx2 neon sse2/; - specialize qw/aom_sad16x32x4d neon sse2/; - specialize qw/aom_sad16x16x4d neon sse2/; - specialize qw/aom_sad16x8x4d neon sse2/; + specialize qw/aom_sad16x32x4d avx2 neon sse2/; + specialize qw/aom_sad16x16x4d avx2 neon sse2/; + specialize qw/aom_sad16x8x4d avx2 neon sse2/; specialize qw/aom_sad8x16x4d neon sse2/; specialize qw/aom_sad8x8x4d neon sse2/; @@ -1026,8 +1026,8 @@ specialize qw/aom_sad64x16x4d avx2 neon sse2/; specialize qw/aom_sad32x8x4d avx2 neon sse2/; - specialize qw/aom_sad16x64x4d neon sse2/; - specialize qw/aom_sad16x4x4d neon sse2/; + specialize qw/aom_sad16x64x4d avx2 neon sse2/; + specialize qw/aom_sad16x4x4d avx2 neon sse2/; specialize qw/aom_sad8x32x4d neon sse2/; specialize qw/aom_sad4x16x4d neon sse2/; @@ -1042,10 +1042,10 @@ specialize qw/aom_sad_skip_32x16x4d avx2 sse2 neon/; specialize qw/aom_sad_skip_32x8x4d avx2 sse2 neon/; - specialize qw/aom_sad_skip_16x64x4d sse2 neon/; - specialize qw/aom_sad_skip_16x32x4d sse2 neon/; - specialize qw/aom_sad_skip_16x16x4d sse2 neon/; - specialize qw/aom_sad_skip_16x8x4d sse2 neon/; + specialize qw/aom_sad_skip_16x64x4d avx2 sse2 neon/; + specialize qw/aom_sad_skip_16x32x4d avx2 sse2 neon/; + specialize qw/aom_sad_skip_16x16x4d avx2 sse2 neon/; + specialize qw/aom_sad_skip_16x8x4d avx2 sse2 neon/; specialize qw/aom_sad_skip_8x32x4d sse2 neon/; specialize qw/aom_sad_skip_8x16x4d sse2 neon/; specialize qw/aom_sad_skip_8x8x4d sse2 neon/;
diff --git a/aom_dsp/x86/sad4d_avx2.c b/aom_dsp/x86/sad4d_avx2.c index 7629cf4..adfbd43 100644 --- a/aom_dsp/x86/sad4d_avx2.c +++ b/aom_dsp/x86/sad4d_avx2.c
@@ -225,12 +225,72 @@ _mm256_setzero_si256()); } +static AOM_FORCE_INLINE void aom_sad16xNx4d_avx2(int N, const uint8_t *src, + int src_stride, + const uint8_t *const ref[4], + int ref_stride, + uint32_t res[4]) { + __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + const uint8_t *ref0, *ref1, *ref2, *ref3; + assert(N % 2 == 0); + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + sum_ref0 = _mm256_setzero_si256(); + sum_ref2 = _mm256_setzero_si256(); + sum_ref1 = _mm256_setzero_si256(); + sum_ref3 = _mm256_setzero_si256(); + + for (int i = 0; i < N; i += 2) { + // load src and all refs + src_reg = yy_loadu2_128(src + src_stride, src); + ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0); + ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1); + ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2); + ref3_reg = yy_loadu2_128(ref3 + ref_stride, ref3); + + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + + src += 2 * src_stride; + ref0 += 2 * ref_stride; + ref1 += 2 * ref_stride; + ref2 += 2 * ref_stride; + ref3 += 2 * ref_stride; + } + + aggregate_and_store_sum(res, sum_ref0, sum_ref1, sum_ref2, sum_ref3); +} + #define SAD16XNX3_AVX2(n) \ void aom_sad16x##n##x3d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref[4], int ref_stride, \ uint32_t res[4]) { \ aom_sad16xNx3d_avx2(n, src, src_stride, ref, ref_stride, res); \ } +#define SAD16XNX4_AVX2(n) \ + void aom_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + aom_sad16xNx4d_avx2(n, src, src_stride, ref, ref_stride, res); \ + } + +SAD16XNX4_AVX2(32) +SAD16XNX4_AVX2(16) +SAD16XNX4_AVX2(8) SAD16XNX3_AVX2(32) SAD16XNX3_AVX2(16) @@ -239,4 +299,29 @@ #if !CONFIG_REALTIME_ONLY SAD16XNX3_AVX2(64) SAD16XNX3_AVX2(4) + +SAD16XNX4_AVX2(64) +SAD16XNX4_AVX2(4) + +#endif // !CONFIG_REALTIME_ONLY + +#define SAD_SKIP_16XN_AVX2(n) \ + void aom_sad_skip_16x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + aom_sad16xNx4d_avx2(((n) >> 1), src, 2 * src_stride, ref, 2 * ref_stride, \ + res); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +SAD_SKIP_16XN_AVX2(32) +SAD_SKIP_16XN_AVX2(16) +SAD_SKIP_16XN_AVX2(8) + +#if !CONFIG_REALTIME_ONLY +SAD_SKIP_16XN_AVX2(64) +SAD_SKIP_16XN_AVX2(4) #endif // !CONFIG_REALTIME_ONLY
diff --git a/test/sad_test.cc b/test/sad_test.cc index 9dae336..98c8f51 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc
@@ -337,6 +337,30 @@ } } + virtual void SADForSpeedTest(unsigned int *results, + const uint8_t *const *references) { + (void)results; + (void)references; + } + + void SpeedSAD() { + int test_count = 20000000; + unsigned int exp_sad[4]; + const uint8_t *references[] = { GetReference(0), GetReference(1), + GetReference(2), GetReference(3) }; + aom_usec_timer timer; + aom_usec_timer_start(&timer); + while (test_count > 0) { + SADForSpeedTest(exp_sad, references); + test_count -= 1; + } + aom_usec_timer_mark(&timer); + const int64_t time = aom_usec_timer_elapsed(&timer) / 1000; + std::cout << "BLOCK_" << width_ << "X" << height_ + << ", bit_depth:" << bit_depth_ << ",Time: " << time << "ms" + << std::endl; + } + int width_, height_, mask_, bd_; aom_bit_depth_t bit_depth_; static uint8_t *source_data_; @@ -376,9 +400,14 @@ source_data_, source_stride_, references, reference_stride_, results)); } + void SADForSpeedTest(unsigned int *results, + const uint8_t *const *references) { + GET_PARAM(2) + (source_data_, source_stride_, references, reference_stride_, results); + } + void CheckSADs() { unsigned int reference_sad, exp_sad[4]; - SADs(exp_sad); for (int block = 0; block < 4; ++block) { reference_sad = ReferenceSAD(block); @@ -386,15 +415,6 @@ EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block; } } - - void SpeedSAD() { - int test_count = 2000000; - unsigned int exp_sad[4]; - while (test_count > 0) { - SADs(exp_sad); - test_count -= 1; - } - } }; class SADx3Test : public ::testing::WithParamInterface<SadMxNx4Param>, @@ -407,8 +427,14 @@ const uint8_t *references[] = { GetReference(0), GetReference(1), GetReference(2), GetReference(3) }; - API_REGISTER_STATE_CHECK(GET_PARAM(2)( - source_data_, source_stride_, references, reference_stride_, results)); + GET_PARAM(2) + (source_data_, source_stride_, references, reference_stride_, results); + } + + void SADForSpeedTest(unsigned int *results, + const uint8_t *const *references) { + GET_PARAM(2) + (source_data_, source_stride_, references, reference_stride_, results); } void CheckSADs() { @@ -421,15 +447,6 @@ EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block; } } - - void SpeedSAD() { - int test_count = 2000000; - unsigned int exp_sad[4]; - while (test_count > 0) { - SADs(exp_sad); - test_count -= 1; - } - } }; class SADSkipx4Test : public ::testing::WithParamInterface<SadMxNx4Param>, @@ -457,13 +474,10 @@ } } - void SpeedSAD() { - int test_count = 2000000; - unsigned int exp_sad[4]; - while (test_count > 0) { - SADs(exp_sad); - test_count -= 1; - } + void SADForSpeedTest(unsigned int *results, + const uint8_t *const *references) { + GET_PARAM(2) + (source_data_, source_stride_, references, reference_stride_, results); } }; @@ -494,13 +508,11 @@ } } - void SpeedSAD() { - int test_count = 200000; - unsigned int exp_sad[4]; - while (test_count > 0) { - SADs(exp_sad); - test_count -= 1; - } + void SADForSpeedTest(unsigned int *results, + const uint8_t *const *references) { + GET_PARAM(2) + (source_data_, source_stride_, references, reference_stride_, second_pred_, + results); } }; #endif // !CONFIG_REALTIME_ONLY @@ -527,12 +539,11 @@ ASSERT_EQ(reference_sad, exp_sad); } - void SpeedSAD() { - int test_count = 20000000; - while (test_count > 0) { - SAD(0); - test_count -= 1; - } + void SADForSpeedTest(unsigned int *results, + const uint8_t *const *references) { + GET_PARAM(2) + (source_data_, source_stride_, references[0], reference_stride_); + (void)results; } }; @@ -558,12 +569,11 @@ ASSERT_EQ(reference_sad, exp_sad); } - void SpeedSAD() { - int test_count = 20000000; - while (test_count > 0) { - SAD(0); - test_count -= 1; - } + void SADForSpeedTest(unsigned int *results, + const uint8_t *const *references) { + GET_PARAM(2) + (source_data_, source_stride_, references[0], reference_stride_); + (void)results; } }; @@ -648,12 +658,12 @@ ASSERT_EQ(reference_sad, exp_sad); } - void SpeedSAD() { - int test_count = 20000000; - while (test_count > 0) { - SAD(0); - test_count -= 1; - } + void SADForSpeedTest(unsigned int *results, + const uint8_t *const *references) { + GET_PARAM(2) + (source_data_, source_stride_, references[0], reference_stride_, width_, + height_); + (void)results; } }; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DistWtdSADTest); @@ -2928,6 +2938,9 @@ make_tuple(32, 64, &aom_sad_skip_32x64x4d_avx2, -1), make_tuple(32, 32, &aom_sad_skip_32x32x4d_avx2, -1), make_tuple(32, 16, &aom_sad_skip_32x16x4d_avx2, -1), + make_tuple(16, 32, &aom_sad_skip_16x32x4d_avx2, -1), + make_tuple(16, 16, &aom_sad_skip_16x16x4d_avx2, -1), + make_tuple(16, 8, &aom_sad_skip_16x8x4d_avx2, -1), #if CONFIG_AV1_HIGHBITDEPTH make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 8), @@ -2984,6 +2997,8 @@ #if !CONFIG_REALTIME_ONLY make_tuple(64, 16, &aom_sad_skip_64x16x4d_avx2, -1), make_tuple(32, 8, &aom_sad_skip_32x8x4d_avx2, -1), + + make_tuple(16, 64, &aom_sad_skip_16x64x4d_avx2, -1), #endif }; @@ -2991,6 +3006,9 @@ ::testing::ValuesIn(skip_x4d_avx2_tests)); const SadMxNx4Param x4d_avx2_tests[] = { + make_tuple(16, 32, &aom_sad16x32x4d_avx2, -1), + make_tuple(16, 16, &aom_sad16x16x4d_avx2, -1), + make_tuple(16, 8, &aom_sad16x8x4d_avx2, -1), make_tuple(32, 64, &aom_sad32x64x4d_avx2, -1), make_tuple(32, 32, &aom_sad32x32x4d_avx2, -1), make_tuple(32, 16, &aom_sad32x16x4d_avx2, -1), @@ -3001,6 +3019,8 @@ make_tuple(128, 64, &aom_sad128x64x4d_avx2, -1), #if !CONFIG_REALTIME_ONLY + make_tuple(16, 64, &aom_sad16x64x4d_avx2, -1), + make_tuple(16, 4, &aom_sad16x4x4d_avx2, -1), make_tuple(32, 8, &aom_sad32x8x4d_avx2, -1), make_tuple(64, 16, &aom_sad64x16x4d_avx2, -1), #endif