Implement aom_variance{128,64,32}_sse2/avx2

Add SSE2 and AVX2 versions of aom_variance for the block sizes that
include a 128 dimension: 128x128, 128x64, 64x128, 128x32, 32x128.

The speed test in the unit tests shows the SSE2 versions are 5x~6x
faster than the C versions, and the AVX2 versions are 9x~11x faster.
For encoder time, encoding 20 frames of foreman_cif shows a 2% speedup.

Change-Id: Ibf91514f6c876d47d56081fab54614f7c3609666
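
All of these kernels compute variance as SSE minus the squared pixel sum
divided by the block area; since every dimension here is a power of two,
that division is done as a right shift (the last macro argument: 14 for
128x128, 13 for 128x64 and 64x128, 12 for 128x32 and 32x128). A minimal
scalar sketch of the same computation, for reference only (variance_ref is
a hypothetical name, not part of the patch):

    #include <stdint.h>

    /* Reference sketch: accumulate sum and SSE over the block, then
     * variance = sse - sum^2 / (w * h). With w * h a power of two, the
     * division could equally be a shift by log2(w * h), which is what the
     * AOM_VAR_AVX2 / AOM_VAR_16_SSE2 macros below do. */
    static unsigned int variance_ref(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     int w, int h, unsigned int *sse) {
      int64_t sum = 0;
      uint64_t sse64 = 0;
      for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; ++j) {
          const int diff = src[i * src_stride + j] - ref[i * ref_stride + j];
          sum += diff;
          sse64 += diff * diff;
        }
      }
      *sse = (unsigned int)sse64;
      return (unsigned int)(sse64 - (uint64_t)((sum * sum) / (w * h)));
    }
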
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index e821aba..f9ab56b 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -981,7 +981,11 @@
add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
add_proto qw/uint32_t/, "aom_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param";
}
-
+ specialize qw/aom_variance128x128 sse2 avx2/;
+ specialize qw/aom_variance128x64 sse2 avx2/;
+ specialize qw/aom_variance64x128 sse2 avx2/;
+ specialize qw/aom_variance128x32 sse2 avx2/;
+ specialize qw/aom_variance32x128 sse2 avx2/;
specialize qw/aom_variance64x64 sse2 avx2 neon msa/;
specialize qw/aom_variance64x32 sse2 avx2 neon msa/;
specialize qw/aom_variance32x64 sse2 neon msa/;
diff --git a/aom_dsp/x86/variance_avx2.c b/aom_dsp/x86/variance_avx2.c
index a041bba..0d94e6c 100644
--- a/aom_dsp/x86/variance_avx2.c
+++ b/aom_dsp/x86/variance_avx2.c
@@ -47,9 +47,7 @@
unsigned int *sse) {
int sum;
unsigned int variance;
- variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
- aom_get16x16var_avx2, 16);
-
+ aom_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
variance = *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
_mm256_zeroupper();
return variance;
@@ -64,57 +62,28 @@
return *sse;
}
-unsigned int aom_variance32x16_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- unsigned int variance;
- variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
- aom_get32x32var_avx2, 32);
+#define AOM_VAR_AVX2(bw, bh, w, bits) \
+ unsigned int aom_variance##bw##x##bh##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ unsigned int variance; \
+ variance_avx2(src, src_stride, ref, ref_stride, bw, bh, sse, &sum, \
+ aom_get##w##x##w##var_avx2, w); \
+ variance = *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
+ _mm256_zeroupper(); \
+ return variance; \
+ }
- variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
- _mm256_zeroupper();
- return variance;
-}
-
-unsigned int aom_variance32x32_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- unsigned int variance;
- variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
- aom_get32x32var_avx2, 32);
-
- variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
- _mm256_zeroupper();
- return variance;
-}
-
-unsigned int aom_variance64x64_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- unsigned int variance;
- variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
- aom_get32x32var_avx2, 32);
-
- variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
- _mm256_zeroupper();
- return variance;
-}
-
-unsigned int aom_variance64x32_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- unsigned int variance;
- variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
- aom_get32x32var_avx2, 32);
-
- variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
- _mm256_zeroupper();
- return variance;
-}
+AOM_VAR_AVX2(32, 16, 16, 9);
+AOM_VAR_AVX2(32, 32, 32, 10);
+AOM_VAR_AVX2(64, 64, 32, 12);
+AOM_VAR_AVX2(64, 32, 32, 11);
+AOM_VAR_AVX2(128, 128, 32, 14);
+AOM_VAR_AVX2(128, 64, 32, 13);
+AOM_VAR_AVX2(64, 128, 32, 13);
+AOM_VAR_AVX2(128, 32, 32, 12);
+AOM_VAR_AVX2(32, 128, 32, 12);
unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
int x_offset, int y_offset,
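
For reference, expanding one of the new invocations above,
AOM_VAR_AVX2(128, 128, 32, 14), gives roughly the following: the 128x128
block is accumulated with the existing 32x32 AVX2 getter, and the shift is
14 = log2(128 * 128).

    unsigned int aom_variance128x128_avx2(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride,
                                          unsigned int *sse) {
      int sum;
      unsigned int variance;
      /* Accumulate SSE and sum over the 128x128 block in 32x32 tiles. */
      variance_avx2(src, src_stride, ref, ref_stride, 128, 128, sse, &sum,
                    aom_get32x32var_avx2, 32);
      variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 14);
      _mm256_zeroupper();
      return variance;
    }
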
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index af4c8ea..cd0605a 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -249,71 +249,29 @@
return *sse - ((uint32_t)((int64_t)sum * sum) >> 8);
}
-unsigned int aom_variance32x32_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
- aom_get16x16var_sse2, 16);
- assert(sum <= 255 * 32 * 32);
- assert(sum >= -255 * 32 * 32);
- return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
-}
+#define AOM_VAR_16_SSE2(bw, bh, bits) \
+ unsigned int aom_variance##bw##x##bh##_sse2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ variance_sse2(src, src_stride, ref, ref_stride, bw, bh, sse, &sum, \
+ aom_get16x16var_sse2, 16); \
+ assert(sum <= 255 * bw * bh); \
+ assert(sum >= -255 * bw * bh); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
+ }
-unsigned int aom_variance32x16_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
- aom_get16x16var_sse2, 16);
- assert(sum <= 255 * 32 * 16);
- assert(sum >= -255 * 32 * 16);
- return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
-}
-
-unsigned int aom_variance16x32_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum,
- aom_get16x16var_sse2, 16);
- assert(sum <= 255 * 32 * 16);
- assert(sum >= -255 * 32 * 16);
- return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
-}
-
-unsigned int aom_variance64x64_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
- aom_get16x16var_sse2, 16);
- assert(sum <= 255 * 64 * 64);
- assert(sum >= -255 * 64 * 64);
- return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
-}
-
-unsigned int aom_variance64x32_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
- aom_get16x16var_sse2, 16);
- assert(sum <= 255 * 64 * 32);
- assert(sum >= -255 * 64 * 32);
- return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
-}
-
-unsigned int aom_variance32x64_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum,
- aom_get16x16var_sse2, 16);
- assert(sum <= 255 * 64 * 32);
- assert(sum >= -255 * 64 * 32);
- return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
-}
+AOM_VAR_16_SSE2(32, 32, 10);
+AOM_VAR_16_SSE2(32, 16, 9);
+AOM_VAR_16_SSE2(16, 32, 9);
+AOM_VAR_16_SSE2(64, 64, 12);
+AOM_VAR_16_SSE2(64, 32, 11);
+AOM_VAR_16_SSE2(32, 64, 11);
+AOM_VAR_16_SSE2(128, 128, 14);
+AOM_VAR_16_SSE2(128, 64, 13);
+AOM_VAR_16_SSE2(64, 128, 13);
+AOM_VAR_16_SSE2(128, 32, 12);
+AOM_VAR_16_SSE2(32, 128, 12);
unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
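
The SSE2 macro covers each block with the existing 16x16 getter through the
file's variance_sse2 tiling helper, which this patch does not change. A
sketch of what that helper does (the exact signature shown here is an
assumption for illustration, not taken from the patch):

    /* Assumed shape of the existing tiling helper: cover the w x h block
     * with block_size x block_size tiles and accumulate each tile's SSE
     * and sum. */
    static void variance_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride, int w,
                              int h, unsigned int *sse, int *sum,
                              void (*var_fn)(const uint8_t *, int,
                                             const uint8_t *, int,
                                             unsigned int *, int *),
                              int block_size) {
      *sse = 0;
      *sum = 0;
      for (int i = 0; i < h; i += block_size) {
        for (int j = 0; j < w; j += block_size) {
          unsigned int tile_sse;
          int tile_sum;
          var_fn(src + i * src_stride + j, src_stride,
                 ref + i * ref_stride + j, ref_stride, &tile_sse, &tile_sum);
          *sse += tile_sse;
          *sum += tile_sum;
        }
      }
    }

In the unit test below, the new speed test is registered as DISABLED_Speed,
so googletest skips it by default and only runs it when
--gtest_also_run_disabled_tests is passed; the test buffers are also moved
from 16- to 32-byte alignment, presumably to match the 256-bit AVX2 loads.
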
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 391031c..856817e 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -395,6 +395,7 @@
void RefTest();
void RefStrideTest();
void OneQuarterTest();
+ void SpeedTest();
// MSE/SSE tests
void RefTestMse();
@@ -515,6 +516,31 @@
EXPECT_EQ(expected, var);
}
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::SpeedTest() {
+ for (int j = 0; j < block_size(); j++) {
+ if (!use_high_bit_depth()) {
+ src_[j] = rnd_.Rand8();
+ ref_[j] = rnd_.Rand8();
+ } else {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+ }
+ }
+ unsigned int sse1, sse2, var1, var2;
+ const int stride = width();
+ int run_time = 1000000000 / block_size();
+
+ ASM_REGISTER_STATE_CHECK(var1 =
+ params_.func(src_, stride, ref_, stride, &sse1));
+ for (int i = 0; i < run_time; ++i) {
+ ASM_REGISTER_STATE_CHECK(
+ var2 = params_.func(src_, stride, ref_, stride, &sse2));
+ }
+ EXPECT_EQ(var1, var2);
+ EXPECT_EQ(sse1, sse2);
+}
+
////////////////////////////////////////////////////////////////////////////////
// Tests related to MSE / SSE.
@@ -586,17 +612,17 @@
rnd_.Reset(ACMRandom::DeterministicSeed());
if (!use_high_bit_depth()) {
- src_ = reinterpret_cast<uint8_t *>(aom_memalign(16, block_size()));
- sec_ = reinterpret_cast<uint8_t *>(aom_memalign(16, block_size()));
+ src_ = reinterpret_cast<uint8_t *>(aom_memalign(32, block_size()));
+ sec_ = reinterpret_cast<uint8_t *>(aom_memalign(32, block_size()));
ref_ = reinterpret_cast<uint8_t *>(
- aom_memalign(16, block_size() + width() + height() + 1));
+ aom_memalign(32, block_size() + width() + height() + 1));
} else {
src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
- aom_memalign(16, block_size() * sizeof(uint16_t))));
+ aom_memalign(32, block_size() * sizeof(uint16_t))));
sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
- aom_memalign(16, block_size() * sizeof(uint16_t))));
+ aom_memalign(32, block_size() * sizeof(uint16_t))));
ref_ = CONVERT_TO_BYTEPTR(aom_memalign(
- 16, (block_size() + width() + height() + 1) * sizeof(uint16_t)));
+ 32, (block_size() + width() + height() + 1) * sizeof(uint16_t)));
}
ASSERT_TRUE(src_ != NULL);
ASSERT_TRUE(sec_ != NULL);
@@ -791,6 +817,7 @@
TEST_P(AvxVarianceTest, Ref) { RefTest(); }
TEST_P(AvxVarianceTest, RefStride) { RefStrideTest(); }
TEST_P(AvxVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(AvxVarianceTest, DISABLED_Speed) { SpeedTest(); }
TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); }
@@ -816,7 +843,12 @@
typedef TestParams<VarianceMxNFunc> VarianceParams;
INSTANTIATE_TEST_CASE_P(
C, AvxVarianceTest,
- ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_c),
+ ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_c),
+ VarianceParams(7, 6, &aom_variance128x64_c),
+ VarianceParams(6, 7, &aom_variance64x128_c),
+ VarianceParams(7, 5, &aom_variance128x32_c),
+ VarianceParams(5, 7, &aom_variance32x128_c),
+ VarianceParams(6, 6, &aom_variance64x64_c),
VarianceParams(6, 5, &aom_variance64x32_c),
VarianceParams(5, 6, &aom_variance32x64_c),
VarianceParams(5, 5, &aom_variance32x32_c),
@@ -1154,7 +1186,12 @@
INSTANTIATE_TEST_CASE_P(
SSE2, AvxVarianceTest,
- ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_sse2),
+ ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_sse2),
+ VarianceParams(7, 6, &aom_variance128x64_sse2),
+ VarianceParams(6, 7, &aom_variance64x128_sse2),
+ VarianceParams(7, 5, &aom_variance128x32_sse2),
+ VarianceParams(5, 7, &aom_variance32x128_sse2),
+ VarianceParams(6, 6, &aom_variance64x64_sse2),
VarianceParams(6, 5, &aom_variance64x32_sse2),
VarianceParams(5, 6, &aom_variance32x64_sse2),
VarianceParams(5, 5, &aom_variance32x32_sse2),
@@ -1479,7 +1516,12 @@
INSTANTIATE_TEST_CASE_P(
AVX2, AvxVarianceTest,
- ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_avx2),
+ ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_avx2),
+ VarianceParams(7, 6, &aom_variance128x64_avx2),
+ VarianceParams(6, 7, &aom_variance64x128_avx2),
+ VarianceParams(7, 5, &aom_variance128x32_avx2),
+ VarianceParams(5, 7, &aom_variance32x128_avx2),
+ VarianceParams(6, 6, &aom_variance64x64_avx2),
VarianceParams(6, 5, &aom_variance64x32_avx2),
VarianceParams(5, 5, &aom_variance32x32_avx2),
VarianceParams(5, 4, &aom_variance32x16_avx2),