AVX2: Add optimization for sad_16xhx4d
Around 5~10% improvement on functional level.
Performance from this series of commits:
RTC:
| SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T |
|---------|------------|----------|----------|---------|-------|
| 5 | rtc | +0.000% | +0.000% | +0.000% | -0.7% |
| 5 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.4% |
| 5 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.8% |
| 5 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.2% |
|---------|------------|----------|----------|---------|-------|
| 6 | rtc | +0.000% | +0.000% | +0.000% | -0.8% |
| 6 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.5% |
| 6 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.9% |
| 6 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.3% |
|---------|------------|----------|----------|---------|-------|
| 7 | rtc | +0.000% | +0.000% | +0.000% | -0.5% |
| 7 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.5% |
| 7 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.6% |
| 7 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.2% |
|---------|------------|----------|----------|---------|-------|
| 8 | rtc | +0.000% | +0.000% | +0.000% | -0.5% |
| 8 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.3% |
| 8 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.5% |
| 8 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.2% |
|---------|------------|----------|----------|---------|-------|
| 9 | rtc | +0.000% | +0.000% | +0.000% | -0.6% |
| 9 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.3% |
| 9 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.5% |
| 9 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.5% |
|---------|------------|----------|----------|---------|-------|
| 10 | rtc | +0.000% | +0.000% | +0.000% | -0.5% |
| 10 | rtc_1080p | +0.000% | +0.000% | +0.000% | -0.3% |
| 10 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.5% |
| 10 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.5% |
VOD:
| SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T |
|---------|---------|----------|----------|---------|-------|
| 1 | hdres2 | +0.000% | +0.000% | +0.000% | -0.5% |
| 1 | lowres2 | +0.000% | +0.000% | +0.000% | -0.1% |
| 1 | midres2 | +0.000% | +0.000% | +0.000% | -0.2% |
|---------|---------|----------|----------|---------|-------|
| 2 | hdres2 | +0.000% | +0.000% | +0.000% | -0.4% |
| 2 | lowres2 | +0.000% | +0.000% | +0.000% | -0.1% |
| 2 | midres2 | +0.000% | +0.000% | +0.000% | -0.2% |
|---------|---------|----------|----------|---------|-------|
| 3 | hdres2 | +0.000% | +0.000% | +0.000% | -0.8% |
| 3 | lowres2 | +0.000% | +0.000% | +0.000% | -0.3% |
| 3 | midres2 | +0.000% | +0.000% | +0.000% | -0.5% |
|---------|---------|----------|----------|---------|-------|
| 4 | hdres2 | +0.000% | +0.000% | +0.000% | -1.1% |
| 4 | lowres2 | +0.000% | +0.000% | +0.000% | -0.4% |
| 4 | midres2 | +0.000% | +0.000% | +0.000% | -0.7% |
|---------|---------|----------|----------|---------|-------|
| 5 | hdres2 | +0.000% | +0.000% | +0.000% | -1.5% |
| 5 | lowres2 | +0.000% | +0.000% | +0.000% | -0.6% |
| 5 | midres2 | +0.000% | +0.000% | +0.000% | -0.9% |
|---------|---------|----------|----------|---------|-------|
| 6 | hdres2 | +0.000% | +0.000% | +0.000% | -1.6% |
| 6 | lowres2 | +0.000% | +0.000% | +0.000% | -0.7% |
| 6 | midres2 | +0.000% | +0.000% | +0.000% | -1.1% |
BUG=aomedia:3358
Change-Id: I63dd9113e07314e0238268b1f04a8d4b2e3397c1
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 427a3dc..4c22060 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1013,9 +1013,9 @@
specialize qw/aom_sad32x64x4d avx2 neon sse2/;
specialize qw/aom_sad32x32x4d avx2 neon sse2/;
specialize qw/aom_sad32x16x4d avx2 neon sse2/;
- specialize qw/aom_sad16x32x4d neon sse2/;
- specialize qw/aom_sad16x16x4d neon sse2/;
- specialize qw/aom_sad16x8x4d neon sse2/;
+ specialize qw/aom_sad16x32x4d avx2 neon sse2/;
+ specialize qw/aom_sad16x16x4d avx2 neon sse2/;
+ specialize qw/aom_sad16x8x4d avx2 neon sse2/;
specialize qw/aom_sad8x16x4d neon sse2/;
specialize qw/aom_sad8x8x4d neon sse2/;
@@ -1026,8 +1026,8 @@
specialize qw/aom_sad64x16x4d avx2 neon sse2/;
specialize qw/aom_sad32x8x4d avx2 neon sse2/;
- specialize qw/aom_sad16x64x4d neon sse2/;
- specialize qw/aom_sad16x4x4d neon sse2/;
+ specialize qw/aom_sad16x64x4d avx2 neon sse2/;
+ specialize qw/aom_sad16x4x4d avx2 neon sse2/;
specialize qw/aom_sad8x32x4d neon sse2/;
specialize qw/aom_sad4x16x4d neon sse2/;
@@ -1042,10 +1042,10 @@
specialize qw/aom_sad_skip_32x16x4d avx2 sse2 neon/;
specialize qw/aom_sad_skip_32x8x4d avx2 sse2 neon/;
- specialize qw/aom_sad_skip_16x64x4d sse2 neon/;
- specialize qw/aom_sad_skip_16x32x4d sse2 neon/;
- specialize qw/aom_sad_skip_16x16x4d sse2 neon/;
- specialize qw/aom_sad_skip_16x8x4d sse2 neon/;
+ specialize qw/aom_sad_skip_16x64x4d avx2 sse2 neon/;
+ specialize qw/aom_sad_skip_16x32x4d avx2 sse2 neon/;
+ specialize qw/aom_sad_skip_16x16x4d avx2 sse2 neon/;
+ specialize qw/aom_sad_skip_16x8x4d avx2 sse2 neon/;
specialize qw/aom_sad_skip_8x32x4d sse2 neon/;
specialize qw/aom_sad_skip_8x16x4d sse2 neon/;
specialize qw/aom_sad_skip_8x8x4d sse2 neon/;
diff --git a/aom_dsp/x86/sad4d_avx2.c b/aom_dsp/x86/sad4d_avx2.c
index 7629cf4..adfbd43 100644
--- a/aom_dsp/x86/sad4d_avx2.c
+++ b/aom_dsp/x86/sad4d_avx2.c
@@ -225,12 +225,72 @@
_mm256_setzero_si256());
}
+static AOM_FORCE_INLINE void aom_sad16xNx4d_avx2(int N, const uint8_t *src,
+ int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ uint32_t res[4]) {
+ __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ assert(N % 2 == 0);
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ sum_ref0 = _mm256_setzero_si256();
+ sum_ref2 = _mm256_setzero_si256();
+ sum_ref1 = _mm256_setzero_si256();
+ sum_ref3 = _mm256_setzero_si256();
+
+ for (int i = 0; i < N; i += 2) {
+ // load src and all refs
+ src_reg = yy_loadu2_128(src + src_stride, src);
+ ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0);
+ ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1);
+ ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2);
+ ref3_reg = yy_loadu2_128(ref3 + ref_stride, ref3);
+
+ // sum of the absolute differences between every ref-i to src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+ ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+ sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+
+ src += 2 * src_stride;
+ ref0 += 2 * ref_stride;
+ ref1 += 2 * ref_stride;
+ ref2 += 2 * ref_stride;
+ ref3 += 2 * ref_stride;
+ }
+
+ aggregate_and_store_sum(res, sum_ref0, sum_ref1, sum_ref2, sum_ref3);
+}
+
#define SAD16XNX3_AVX2(n) \
void aom_sad16x##n##x3d_avx2(const uint8_t *src, int src_stride, \
const uint8_t *const ref[4], int ref_stride, \
uint32_t res[4]) { \
aom_sad16xNx3d_avx2(n, src, src_stride, ref, ref_stride, res); \
}
+#define SAD16XNX4_AVX2(n) \
+ void aom_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sad16xNx4d_avx2(n, src, src_stride, ref, ref_stride, res); \
+ }
+
+SAD16XNX4_AVX2(32)
+SAD16XNX4_AVX2(16)
+SAD16XNX4_AVX2(8)
SAD16XNX3_AVX2(32)
SAD16XNX3_AVX2(16)
@@ -239,4 +299,29 @@
#if !CONFIG_REALTIME_ONLY
SAD16XNX3_AVX2(64)
SAD16XNX3_AVX2(4)
+
+SAD16XNX4_AVX2(64)
+SAD16XNX4_AVX2(4)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+#define SAD_SKIP_16XN_AVX2(n) \
+ void aom_sad_skip_16x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ aom_sad16xNx4d_avx2(((n) >> 1), src, 2 * src_stride, ref, 2 * ref_stride, \
+ res); \
+ res[0] <<= 1; \
+ res[1] <<= 1; \
+ res[2] <<= 1; \
+ res[3] <<= 1; \
+ }
+
+SAD_SKIP_16XN_AVX2(32)
+SAD_SKIP_16XN_AVX2(16)
+SAD_SKIP_16XN_AVX2(8)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_16XN_AVX2(64)
+SAD_SKIP_16XN_AVX2(4)
#endif // !CONFIG_REALTIME_ONLY
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 9dae336..98c8f51 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -337,6 +337,30 @@
}
}
+ virtual void SADForSpeedTest(unsigned int *results,
+ const uint8_t *const *references) {
+ (void)results;
+ (void)references;
+ }
+
+ void SpeedSAD() {
+ int test_count = 20000000;
+ unsigned int exp_sad[4];
+ const uint8_t *references[] = { GetReference(0), GetReference(1),
+ GetReference(2), GetReference(3) };
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ while (test_count > 0) {
+ SADForSpeedTest(exp_sad, references);
+ test_count -= 1;
+ }
+ aom_usec_timer_mark(&timer);
+ const int64_t time = aom_usec_timer_elapsed(&timer) / 1000;
+ std::cout << "BLOCK_" << width_ << "X" << height_
+ << ", bit_depth:" << bit_depth_ << ",Time: " << time << "ms"
+ << std::endl;
+ }
+
int width_, height_, mask_, bd_;
aom_bit_depth_t bit_depth_;
static uint8_t *source_data_;
@@ -376,9 +400,14 @@
source_data_, source_stride_, references, reference_stride_, results));
}
+ void SADForSpeedTest(unsigned int *results,
+ const uint8_t *const *references) {
+ GET_PARAM(2)
+ (source_data_, source_stride_, references, reference_stride_, results);
+ }
+
void CheckSADs() {
unsigned int reference_sad, exp_sad[4];
-
SADs(exp_sad);
for (int block = 0; block < 4; ++block) {
reference_sad = ReferenceSAD(block);
@@ -386,15 +415,6 @@
EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
}
}
-
- void SpeedSAD() {
- int test_count = 2000000;
- unsigned int exp_sad[4];
- while (test_count > 0) {
- SADs(exp_sad);
- test_count -= 1;
- }
- }
};
class SADx3Test : public ::testing::WithParamInterface<SadMxNx4Param>,
@@ -407,8 +427,14 @@
const uint8_t *references[] = { GetReference(0), GetReference(1),
GetReference(2), GetReference(3) };
- API_REGISTER_STATE_CHECK(GET_PARAM(2)(
- source_data_, source_stride_, references, reference_stride_, results));
+ GET_PARAM(2)
+ (source_data_, source_stride_, references, reference_stride_, results);
+ }
+
+ void SADForSpeedTest(unsigned int *results,
+ const uint8_t *const *references) {
+ GET_PARAM(2)
+ (source_data_, source_stride_, references, reference_stride_, results);
}
void CheckSADs() {
@@ -421,15 +447,6 @@
EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
}
}
-
- void SpeedSAD() {
- int test_count = 2000000;
- unsigned int exp_sad[4];
- while (test_count > 0) {
- SADs(exp_sad);
- test_count -= 1;
- }
- }
};
class SADSkipx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
@@ -457,13 +474,10 @@
}
}
- void SpeedSAD() {
- int test_count = 2000000;
- unsigned int exp_sad[4];
- while (test_count > 0) {
- SADs(exp_sad);
- test_count -= 1;
- }
+ void SADForSpeedTest(unsigned int *results,
+ const uint8_t *const *references) {
+ GET_PARAM(2)
+ (source_data_, source_stride_, references, reference_stride_, results);
}
};
@@ -494,13 +508,11 @@
}
}
- void SpeedSAD() {
- int test_count = 200000;
- unsigned int exp_sad[4];
- while (test_count > 0) {
- SADs(exp_sad);
- test_count -= 1;
- }
+ void SADForSpeedTest(unsigned int *results,
+ const uint8_t *const *references) {
+ GET_PARAM(2)
+ (source_data_, source_stride_, references, reference_stride_, second_pred_,
+ results);
}
};
#endif // !CONFIG_REALTIME_ONLY
@@ -527,12 +539,11 @@
ASSERT_EQ(reference_sad, exp_sad);
}
- void SpeedSAD() {
- int test_count = 20000000;
- while (test_count > 0) {
- SAD(0);
- test_count -= 1;
- }
+ void SADForSpeedTest(unsigned int *results,
+ const uint8_t *const *references) {
+ GET_PARAM(2)
+ (source_data_, source_stride_, references[0], reference_stride_);
+ (void)results;
}
};
@@ -558,12 +569,11 @@
ASSERT_EQ(reference_sad, exp_sad);
}
- void SpeedSAD() {
- int test_count = 20000000;
- while (test_count > 0) {
- SAD(0);
- test_count -= 1;
- }
+ void SADForSpeedTest(unsigned int *results,
+ const uint8_t *const *references) {
+ GET_PARAM(2)
+ (source_data_, source_stride_, references[0], reference_stride_);
+ (void)results;
}
};
@@ -648,12 +658,12 @@
ASSERT_EQ(reference_sad, exp_sad);
}
- void SpeedSAD() {
- int test_count = 20000000;
- while (test_count > 0) {
- SAD(0);
- test_count -= 1;
- }
+ void SADForSpeedTest(unsigned int *results,
+ const uint8_t *const *references) {
+ GET_PARAM(2)
+ (source_data_, source_stride_, references[0], reference_stride_, width_,
+ height_);
+ (void)results;
}
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DistWtdSADTest);
@@ -2928,6 +2938,9 @@
make_tuple(32, 64, &aom_sad_skip_32x64x4d_avx2, -1),
make_tuple(32, 32, &aom_sad_skip_32x32x4d_avx2, -1),
make_tuple(32, 16, &aom_sad_skip_32x16x4d_avx2, -1),
+ make_tuple(16, 32, &aom_sad_skip_16x32x4d_avx2, -1),
+ make_tuple(16, 16, &aom_sad_skip_16x16x4d_avx2, -1),
+ make_tuple(16, 8, &aom_sad_skip_16x8x4d_avx2, -1),
#if CONFIG_AV1_HIGHBITDEPTH
make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 8),
@@ -2984,6 +2997,8 @@
#if !CONFIG_REALTIME_ONLY
make_tuple(64, 16, &aom_sad_skip_64x16x4d_avx2, -1),
make_tuple(32, 8, &aom_sad_skip_32x8x4d_avx2, -1),
+
+ make_tuple(16, 64, &aom_sad_skip_16x64x4d_avx2, -1),
#endif
};
@@ -2991,6 +3006,9 @@
::testing::ValuesIn(skip_x4d_avx2_tests));
const SadMxNx4Param x4d_avx2_tests[] = {
+ make_tuple(16, 32, &aom_sad16x32x4d_avx2, -1),
+ make_tuple(16, 16, &aom_sad16x16x4d_avx2, -1),
+ make_tuple(16, 8, &aom_sad16x8x4d_avx2, -1),
make_tuple(32, 64, &aom_sad32x64x4d_avx2, -1),
make_tuple(32, 32, &aom_sad32x32x4d_avx2, -1),
make_tuple(32, 16, &aom_sad32x16x4d_avx2, -1),
@@ -3001,6 +3019,8 @@
make_tuple(128, 64, &aom_sad128x64x4d_avx2, -1),
#if !CONFIG_REALTIME_ONLY
+ make_tuple(16, 64, &aom_sad16x64x4d_avx2, -1),
+ make_tuple(16, 4, &aom_sad16x4x4d_avx2, -1),
make_tuple(32, 8, &aom_sad32x8x4d_avx2, -1),
make_tuple(64, 16, &aom_sad64x16x4d_avx2, -1),
#endif