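Fix aom_fdct8x8_ssse3 in high bit depth mode

When CONFIG_AOM_HIGHBITDEPTH is set, sign-extend each 16-bit result to 32 bits before storing it, so each row of eight coefficients is written with two 16-byte stores instead of one. Also register the SSSE3 x86-64 specialization of aom_fdct8x8 and run its unit test in high bit depth builds.

Change-Id: I63e492163ef10e12a842837368c209b8ffc4eee0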
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index d6fa90b..b073b1b 100644 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -708,7 +708,7 @@ specialize qw/aom_fdct4x4_1 sse2/; add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct8x8 sse2/; + specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64"; add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_fdct8x8_1 sse2/;
diff --git a/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm index 6f3c470..5b2aab2 100644 --- a/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm +++ b/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -130,12 +130,30 @@ psraw m%2, 1 %endmacro +%macro STORE_OUTPUT 2 ; index, result +%if CONFIG_AOM_HIGHBITDEPTH + ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + ; _mm_store_si128((__m128i *)(dst_ptr), out0); + ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1); + pxor m11, m11 + pcmpgtw m11, m%2 + movdqa m12, m%2 + punpcklwd m%2, m11 + punpckhwd m12, m11 + mova [outputq + 4*%1 + 0], m%2 + mova [outputq + 4*%1 + 16], m12 +%else + mova [outputq + 2*%1], m%2 +%endif +%endmacro + INIT_XMM ssse3 cglobal fdct8x8, 3, 5, 13, input, output, stride mova m8, [pd_8192] mova m12, [pw_11585x2] - pxor m11, m11 lea r3, [2 * strideq] lea r4, [4 * strideq] @@ -173,14 +191,14 @@ DIVIDE_ROUND_2X 4, 5, 9, 10 DIVIDE_ROUND_2X 6, 7, 9, 10 - mova [outputq + 0], m0 - mova [outputq + 16], m1 - mova [outputq + 32], m2 - mova [outputq + 48], m3 - mova [outputq + 64], m4 - mova [outputq + 80], m5 - mova [outputq + 96], m6 - mova [outputq + 112], m7 + STORE_OUTPUT 0, 0 + STORE_OUTPUT 8, 1 + STORE_OUTPUT 16, 2 + STORE_OUTPUT 24, 3 + STORE_OUTPUT 32, 4 + STORE_OUTPUT 40, 5 + STORE_OUTPUT 48, 6 + STORE_OUTPUT 56, 7 RET %endif
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 9f62ffe..bbfb7f1 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc
@@ -728,8 +728,7 @@ make_tuple(&idct8x8_12, &idct8x8_64_add_12_sse2, 6225, AOM_BITS_12))); #endif // HAVE_SSE2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_AOM_HIGHBITDEPTH && \ - !CONFIG_EMULATE_HARDWARE +#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT, ::testing::Values(make_tuple(&aom_fdct8x8_ssse3, &aom_idct8x8_64_add_ssse3,