RTC: Add SSE4_1 optimization for aom_vector_var
This commit also fixes a potential integer overflow issue in the C code
when the function is applied to BLOCK_128X128.
Performance:
Functional Level:
| Width | C Code | SSE4_1 | Gain |
|-------|--------|--------|------|
| 16 | 42430 | 25017 | 1.70 |
| 32 | 69805 | 31255 | 2.28 |
| 64 | 126035 | 43659 | 2.80 |
| 128 | 238695 | 72269 | 3.30 |
Encoder Level:
| SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T |
|---------|----------|----------|----------|---------|-------|
| 7 | rtc | +0.000% | +0.000% | +0.000% | -1.2% |
| 7 | rtc_derf | +0.000% | +0.000% | +0.000% | -1.2% |
|---------|----------|----------|----------|---------|-------|
| 8 | rtc | +0.000% | +0.000% | +0.000% | -1.2% |
| 8 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.8% |
|---------|----------|----------|----------|---------|-------|
| 9 | rtc | +0.000% | +0.000% | +0.000% | -0.1% |
| 9 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.1% |
|---------|----------|----------|----------|---------|-------|
| 10 | rtc | +0.000% | +0.000% | +0.000% | -0.1% |
| 10 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.1% |
Change-Id: If02b50a037b9f4b87fc57d471a6998130f575d41
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 7e2f570..29a9d3f 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -250,7 +250,9 @@
"${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c"
"${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c")
- list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c"
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c"
"${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
"${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 4503c8e..1928de0 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1201,7 +1201,7 @@
specialize qw/aom_int_pro_col sse2 neon/;
add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
- specialize qw/aom_vector_var neon/;
+ specialize qw/aom_vector_var sse4_1 neon/;
# TODO(kyslov@) bring back SSE2 by extending it to 128 block size
#specialize qw/aom_vector_var neon sse2/;
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index 9f5b545..593807b 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -187,10 +187,11 @@
v_sse = vmlal_s16(v_sse, v_high, v_high);
#endif
}
- int mean = horizontal_add_s32x4(v_mean);
- int sse = horizontal_add_s32x4(v_sse);
+ const int mean = horizontal_add_s32x4(v_mean);
+ const int sse = horizontal_add_s32x4(v_sse);
+ const unsigned int mean_abs = mean >= 0 ? mean : -mean;
// (mean * mean): dynamic range 31 bits.
- int var = sse - ((mean * mean) >> (bwl + 2));
+ const int var = sse - ((mean_abs * mean_abs) >> (bwl + 2));
return var;
}
diff --git a/aom_dsp/avg.c b/aom_dsp/avg.c
index 1e48bc1..a3821e6 100644
--- a/aom_dsp/avg.c
+++ b/aom_dsp/avg.c
@@ -547,6 +547,9 @@
}
// (mean * mean): dynamic range 31 bits.
- var = sse - ((mean * mean) >> (bwl + 2));
+ // If width == 128, the mean can be 510 * 128 = 65280, and log2(65280 ** 2) ~=
+ // 31.99, so it needs to be casted to unsigned int to compute its square.
+ const unsigned int mean_abs = mean >= 0 ? mean : -mean;
+ var = sse - ((mean_abs * mean_abs) >> (bwl + 2));
return var;
}
diff --git a/aom_dsp/x86/avg_intrin_sse4.c b/aom_dsp/x86/avg_intrin_sse4.c
new file mode 100644
index 0000000..55a483e
--- /dev/null
+++ b/aom_dsp/x86/avg_intrin_sse4.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4, 5}
+int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src,
+ const int log_bw) {
+ const int width = 4 << log_bw;
+ assert(width % 16 == 0);
+
+ const __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+ __m128i mean = _mm_setzero_si128();
+ __m128i sse = _mm_setzero_si128();
+
+ for (int i = 0; i < width; i += 16) {
+ const __m128i src_line = _mm_loadu_si128((const __m128i *)src);
+ const __m128i ref_line = _mm_loadu_si128((const __m128i *)ref);
+ const __m128i src_line2 = _mm_loadu_si128((const __m128i *)(src + 8));
+ const __m128i ref_line2 = _mm_loadu_si128((const __m128i *)(ref + 8));
+ __m128i diff = _mm_sub_epi16(ref_line, src_line);
+ const __m128i diff2 = _mm_sub_epi16(ref_line2, src_line2);
+ __m128i diff_sqr = _mm_madd_epi16(diff, diff);
+ const __m128i diff_sqr2 = _mm_madd_epi16(diff2, diff2);
+
+ diff = _mm_add_epi16(diff, diff2);
+ diff_sqr = _mm_add_epi32(diff_sqr, diff_sqr2);
+ diff = _mm_madd_epi16(diff, k_one_epi16);
+ sse = _mm_add_epi32(sse, diff_sqr);
+
+ mean = _mm_add_epi32(mean, diff);
+
+ src += 16;
+ ref += 16;
+ }
+
+ mean = _mm_hadd_epi32(mean, mean);
+ sse = _mm_hadd_epi32(sse, sse);
+ mean = _mm_hadd_epi32(mean, mean);
+ sse = _mm_hadd_epi32(sse, sse);
+
+ // (mean * mean): dynamic range 31 bits.
+ const int mean_int = _mm_extract_epi32(mean, 0);
+ const int sse_int = _mm_extract_epi32(sse, 0);
+ const unsigned int mean_abs = mean_int >= 0 ? mean_int : -mean_int;
+ const int var = sse_int - ((mean_abs * mean_abs) >> (log_bw + 2));
+ return var;
+}
diff --git a/test/avg_test.cc b/test/avg_test.cc
index b12d1ef..93f4c34 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -636,7 +636,7 @@
}
TEST_P(VectorVarTest, DISABLED_Speed) {
FillRandom();
- const int numIter = 50000;
+ const int numIter = 5000000;
printf("Width = %d number of iteration is %d \n", width, numIter);
int sum_c_var = 0;
@@ -942,6 +942,16 @@
make_tuple(5, &aom_vector_var_c, &aom_vector_var_neon)));
#endif
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, VectorVarTest,
+ ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_sse4_1),
+ make_tuple(3, &aom_vector_var_c, &aom_vector_var_sse4_1),
+ make_tuple(4, &aom_vector_var_c, &aom_vector_var_sse4_1),
+ make_tuple(5, &aom_vector_var_c,
+ &aom_vector_var_sse4_1)));
+#endif // HAVE_SSE4_1
+
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
AVX2, SatdTest,