Add SSE2 version of low precision Hadamard 16x16
The SSE2 version reduces encoding time by about 82% compared to the C
version.
Change-Id: I1fda995460da0fcca31f3e17a1c539431f591da3
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index ba6027c..e10acadb 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1113,7 +1113,7 @@
specialize qw/aom_hadamard_lp_8x8 sse2 neon/;
add_proto qw/void aom_hadamard_lp_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
- specialize qw/aom_hadamard_lp_16x16 avx2 neon/;
+ specialize qw/aom_hadamard_lp_16x16 sse2 avx2 neon/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index 260ca2a..18c4ca7 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -272,8 +272,8 @@
hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
}
-void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
- int16_t *coeff) {
+static INLINE void hadamard_lp_8x8_sse2(const int16_t *src_diff,
+ ptrdiff_t src_stride, int16_t *coeff) {
__m128i src[8];
src[0] = _mm_load_si128((const __m128i *)src_diff);
src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
@@ -304,6 +304,50 @@
_mm_store_si128((__m128i *)coeff, src[7]);
}
+void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ hadamard_lp_8x8_sse2(src_diff, src_stride, coeff);
+}
+
+void aom_hadamard_lp_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ for (int idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ hadamard_lp_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+ }
+
+ int16_t *t_coeff = coeff;
+ for (int idx = 0; idx < 64; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));
+
+ __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+ __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+ __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+ __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm_srai_epi16(b0, 1);
+ b1 = _mm_srai_epi16(b1, 1);
+ b2 = _mm_srai_epi16(b2, 1);
+ b3 = _mm_srai_epi16(b3, 1);
+
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+
+ _mm_store_si128((__m128i *)t_coeff, coeff0);
+ _mm_store_si128((__m128i *)(t_coeff + 64), coeff1);
+ _mm_store_si128((__m128i *)(t_coeff + 128), coeff2);
+ _mm_store_si128((__m128i *)(t_coeff + 192), coeff3);
+
+ t_coeff += 8;
+ }
+}
+
static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
ptrdiff_t src_stride, tran_low_t *coeff,
int is_final) {
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index 30c45fc..8813f33 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -302,7 +302,8 @@
#if HAVE_SSE2
INSTANTIATE_TEST_SUITE_P(
SSE2, HadamardLowbdLPTest,
- ::testing::Values(HadamardLPFuncWithSize(&aom_hadamard_lp_8x8_sse2, 8)));
+ ::testing::Values(HadamardLPFuncWithSize(&aom_hadamard_lp_8x8_sse2, 8),
+ HadamardLPFuncWithSize(&aom_hadamard_lp_16x16_sse2, 16)));
#endif // HAVE_SSE2
#if HAVE_AVX2