Fix mismatches in quantize asm
Fix C vs asm mismatch in the quantize_b_32x32 module
BUG=aomedia:2240
Change-Id: I5ce26147e29ba22f8ed328edb453660a11c5a97a
diff --git a/aom_dsp/x86/quantize_avx_x86_64.asm b/aom_dsp/x86/quantize_avx_x86_64.asm
index 216a0bd..d6e15c4 100644
--- a/aom_dsp/x86/quantize_avx_x86_64.asm
+++ b/aom_dsp/x86/quantize_avx_x86_64.asm
@@ -126,7 +126,7 @@
punpckhqdq m3, m3
pmullw m13, m3 ; dqc[i] = qc[i] * q
- ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
pcmpgtw m6, m5, m8
punpckhwd m6, m8, m6
pmovsxwd m11, m8
@@ -198,10 +198,7 @@
mova m4, [r2] ; m4 = shift
mov r4, dqcoeffmp
mov r5, iscanmp
-%ifidn %1, b_32x32
- psllw m4, 1
-%endif
- pxor m5, m5 ; m5 = dedicated zero
+ pxor m5, m5 ; m5 = dedicated zero
DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
@@ -255,9 +252,26 @@
pmulhw m13, m11, m2 ; m13 = m11*q>>16
paddw m8, m6 ; m8 += m6
paddw m13, m11 ; m13 += m11
+ %ifidn %1, b_32x32
+ pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh
+ %endif
pmulhw m8, m4 ; m8 = m8*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m8, 1
+ psrlw m5, 15
+ por m8, m5
+ %endif
punpckhqdq m4, m4
+ %ifidn %1, b_32x32
+ pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
+ %endif
pmulhw m13, m4 ; m13 = m13*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m13, 1
+ psrlw m5, 15
+ por m13, m5
+ pxor m5, m5 ; reset m5 to zero register
+ %endif
psignw m8, m9 ; m8 = reinsert sign
psignw m13, m10 ; m13 = reinsert sign
pand m8, m7
@@ -289,7 +303,7 @@
psignw m13, m10
%endif
- ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
pcmpgtw m6, m5, m8
punpckhwd m6, m8, m6
pmovsxwd m11, m8
@@ -359,8 +373,23 @@
pmulhw m13, m11, m2 ; m13 = m11*q>>16
paddw m14, m6 ; m14 += m6
paddw m13, m11 ; m13 += m11
+ %ifidn %1, b_32x32
+ pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh
+ %endif
pmulhw m14, m4 ; m14 = m14*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m14, 1
+ psrlw m5, 15
+ por m14, m5
+ pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
+ %endif
pmulhw m13, m4 ; m13 = m13*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m13, 1
+ psrlw m5, 15
+ por m13, m5
+ pxor m5, m5 ; reset m5 to zero register
+ %endif
psignw m14, m9 ; m14 = reinsert sign
psignw m13, m10 ; m13 = reinsert sign
pand m14, m7
@@ -391,7 +420,7 @@
psignw m13, m10
%endif
- ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
pcmpgtw m6, m5, m14
punpckhwd m6, m14, m6
pmovsxwd m11, m14
diff --git a/aom_dsp/x86/quantize_ssse3_x86_64.asm b/aom_dsp/x86/quantize_ssse3_x86_64.asm
index 39d4ca6..fa616a6 100644
--- a/aom_dsp/x86/quantize_ssse3_x86_64.asm
+++ b/aom_dsp/x86/quantize_ssse3_x86_64.asm
@@ -48,9 +48,6 @@
mov r3, qcoeffmp
mov r4, dqcoeffmp
mov r5, iscanmp
-%ifidn %1, b_32x32
- psllw m4, 1
-%endif
pxor m5, m5 ; m5 = dedicated zero
DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
lea coeffq, [ coeffq+ncoeffq*4]
@@ -78,9 +75,26 @@
pmulhw m13, m11, m2 ; m13 = m11*q>>16
paddw m8, m6 ; m8 += m6
paddw m13, m11 ; m13 += m11
+ %ifidn %1, b_32x32
+ pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh
+ %endif
pmulhw m8, m4 ; m8 = m8*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m8, 1
+ psrlw m5, 15
+ por m8, m5
+ %endif
punpckhqdq m4, m4
+ %ifidn %1, b_32x32
+ pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
+ %endif
pmulhw m13, m4 ; m13 = m13*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m13, 1
+ psrlw m5, 15
+ por m13, m5
+ pxor m5, m5 ; reset m5 to zero register
+ %endif
psignw m8, m9 ; m8 = reinsert sign
psignw m13, m10 ; m13 = reinsert sign
pand m8, m7
@@ -117,7 +131,7 @@
psignw m8, m9
psignw m13, m10
%endif
- ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
mova m11, m8
mova m6, m8
pcmpgtw m5, m8
@@ -169,12 +183,28 @@
pmulhw m13, m11, m2 ; m13 = m11*q>>16
paddw m14, m6 ; m14 += m6
paddw m13, m11 ; m13 += m11
+ %ifidn %1, b_32x32
+ pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh
+ %endif
pmulhw m14, m4 ; m14 = m14*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m14, 1
+ psrlw m5, 15
+ por m14, m5
+ pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
+ %endif
pmulhw m13, m4 ; m13 = m13*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m13, 1
+ psrlw m5, 15
+ por m13, m5
+ pxor m5, m5 ; reset m5 to zero register
+ %endif
psignw m14, m9 ; m14 = reinsert sign
psignw m13, m10 ; m13 = reinsert sign
pand m14, m7
pand m13, m12
+
; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
pxor m11, m11
mova m11, m14
@@ -207,7 +237,7 @@
psignw m13, m10
%endif
- ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
mova m11, m14
mova m6, m14
pcmpgtw m5, m14
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c
index a7baaf8..21ab4db 100644
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c
@@ -289,13 +289,10 @@
sc->iscan);
break;
case 1:
- // TODO(any): there is a bug in current ssse3 and avx optimizations
- // (refer to libaom issue 2240), which needs to be fixed. Use c version
- // for now.
- aom_quantize_b_32x32_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
- p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
- dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
- sc->iscan);
+ aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
break;
case 2:
aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc
index 554d0c7..8dee864 100644
--- a/test/quantize_func_test.cc
+++ b/test/quantize_func_test.cc
@@ -286,19 +286,29 @@
const int16_t *quant_shift = qtab_->quant.y_quant_shift[q];
const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q];
const int kNumTests = 5000000;
- aom_usec_timer timer;
+ aom_usec_timer timer, simd_timer;
FillCoeffRandom();
aom_usec_timer_start(&timer);
for (int n = 0; n < kNumTests; ++n) {
- quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff,
- dqcoeff, dequant, eob, sc->scan, sc->iscan);
+ quant_ref_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift,
+ qcoeff, dqcoeff, dequant, eob, sc->scan, sc->iscan);
}
aom_usec_timer_mark(&timer);
+ aom_usec_timer_start(&simd_timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff,
+ dqcoeff, dequant, eob, sc->scan, sc->iscan);
+ }
+ aom_usec_timer_mark(&simd_timer);
+
const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
- printf("Elapsed time: %d us\n", elapsed_time);
+ const int simd_elapsed_time =
+ static_cast<int>(aom_usec_timer_elapsed(&simd_timer));
+ printf("c_time = %d \t simd_time = %d \t Gain = %d \n", elapsed_time,
+ simd_elapsed_time, (elapsed_time / simd_elapsed_time));
}
using ::testing::make_tuple;
@@ -398,14 +408,9 @@
INSTANTIATE_TEST_CASE_P(
SSSE3, QuantizeTest,
::testing::Values(make_tuple(&aom_quantize_b_c, &aom_quantize_b_ssse3,
- TX_16X16, TYPE_B, AOM_BITS_8)));
-
-// Like libvpx, the ssse3 and avx quantize tests do not pass.
-// https://bugs.chromium.org/p/webm/issues/detail?id=1448
-INSTANTIATE_TEST_CASE_P(
- DISABLED_SSSE3_32x32, QuantizeTest,
- ::testing::Values(make_tuple(&aom_quantize_b_32x32_c,
- &aom_quantize_b_32x32_ssse3, TX_16X16, TYPE_B,
+ TX_16X16, TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_32x32_c,
+ &aom_quantize_b_32x32_ssse3, TX_32X32, TYPE_B,
AOM_BITS_8)));
#endif // HAVE_SSSE3 && ARCH_X86_64
@@ -413,13 +418,11 @@
#if HAVE_AVX && ARCH_X86_64
INSTANTIATE_TEST_CASE_P(
AVX, QuantizeTest,
- ::testing::Values(
- make_tuple(&aom_quantize_b_c, &aom_quantize_b_avx, TX_16X16, TYPE_B,
- AOM_BITS_8),
- // Although these tests will not pass against _c, test them against each
- // other so there is some minor checking.
- make_tuple(&aom_quantize_b_32x32_ssse3, &aom_quantize_b_32x32_avx,
- TX_32X32, TYPE_B, AOM_BITS_8)));
+ ::testing::Values(make_tuple(&aom_quantize_b_c, &aom_quantize_b_avx,
+ TX_16X16, TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_32x32_c,
+ &aom_quantize_b_32x32_avx, TX_32X32, TYPE_B,
+ AOM_BITS_8)));
#endif // HAVE_AVX && ARCH_X86_64
} // namespace