Fix mismatches in quantize asm

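Fix the C vs. asm output mismatch in the quantize_b_32x32 SSSE3 and AVX
implementations. Instead of doubling the quant shift before pmulhw, the
asm now computes (x * qsh) >> 15 by shifting the pmulhw result left by
one and ORing in the top bit of the pmullw result. The encoder calls
aom_quantize_b_32x32 again instead of the C version, and the SSSE3/AVX
32x32 tests are now checked against the C reference.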

BUG=aomedia:2240

Change-Id: I5ce26147e29ba22f8ed328edb453660a11c5a97a
diff --git a/aom_dsp/x86/quantize_avx_x86_64.asm b/aom_dsp/x86/quantize_avx_x86_64.asm
index 216a0bd..d6e15c4 100644
--- a/aom_dsp/x86/quantize_avx_x86_64.asm
+++ b/aom_dsp/x86/quantize_avx_x86_64.asm
@@ -126,7 +126,7 @@
   punpckhqdq                      m3, m3
   pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
 
-  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
   pcmpgtw                         m6, m5, m8
   punpckhwd                       m6, m8, m6
   pmovsxwd                       m11, m8
@@ -198,10 +198,7 @@
   mova                            m4, [r2]            ; m4 = shift
   mov                             r4, dqcoeffmp
   mov                             r5, iscanmp
-%ifidn %1, b_32x32
-  psllw                           m4, 1
-%endif
-  pxor                            m5, m5                   ; m5 = dedicated zero
+  pxor                            m5, m5              ; m5 = dedicated zero
 
   DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
 
@@ -255,9 +252,26 @@
   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
   paddw                           m8, m6                   ; m8 += m6
   paddw                          m13, m11                  ; m13 += m11
+  %ifidn %1, b_32x32
+  pmullw                          m5, m8, m4               ; store the lower 16 bits of m8*qsh
+  %endif
   pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
+  %ifidn %1, b_32x32
+  psllw                           m8, 1
+  psrlw                           m5, 15
+  por                             m8, m5
+  %endif
   punpckhqdq                      m4, m4
+  %ifidn %1, b_32x32
+  pmullw                          m5, m13, m4              ; store the lower 16 bits of m13*qsh
+  %endif
   pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  %ifidn %1, b_32x32
+  psllw                          m13, 1
+  psrlw                           m5, 15
+  por                            m13, m5
+  pxor                            m5, m5                   ; reset m5 to zero register
+  %endif
   psignw                          m8, m9                   ; m8 = reinsert sign
   psignw                         m13, m10                  ; m13 = reinsert sign
   pand                            m8, m7
@@ -289,7 +303,7 @@
   psignw                         m13, m10
 %endif
 
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
   pcmpgtw                         m6, m5, m8
   punpckhwd                       m6, m8, m6
   pmovsxwd                       m11, m8
@@ -359,8 +373,23 @@
   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
   paddw                          m14, m6                   ; m14 += m6
   paddw                          m13, m11                  ; m13 += m11
+  %ifidn %1, b_32x32
+  pmullw                          m5, m14, m4              ; store the lower 16 bits of m14*qsh
+  %endif
   pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
+  %ifidn %1, b_32x32
+  psllw                          m14, 1
+  psrlw                           m5, 15
+  por                            m14, m5
+  pmullw                          m5, m13, m4              ; store the lower 16 bits of m13*qsh
+  %endif
   pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  %ifidn %1, b_32x32
+  psllw                          m13, 1
+  psrlw                           m5, 15
+  por                            m13, m5
+  pxor                            m5, m5                   ; reset m5 to zero register
+  %endif
   psignw                         m14, m9                   ; m14 = reinsert sign
   psignw                         m13, m10                  ; m13 = reinsert sign
   pand                           m14, m7
@@ -391,7 +420,7 @@
   psignw                         m13, m10
 %endif
 
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
   pcmpgtw                         m6, m5, m14
   punpckhwd                       m6, m14, m6
   pmovsxwd                       m11, m14
diff --git a/aom_dsp/x86/quantize_ssse3_x86_64.asm b/aom_dsp/x86/quantize_ssse3_x86_64.asm
index 39d4ca6..fa616a6 100644
--- a/aom_dsp/x86/quantize_ssse3_x86_64.asm
+++ b/aom_dsp/x86/quantize_ssse3_x86_64.asm
@@ -48,9 +48,6 @@
   mov                             r3, qcoeffmp
   mov                             r4, dqcoeffmp
   mov                             r5, iscanmp
-%ifidn %1, b_32x32
-  psllw                           m4, 1
-%endif
   pxor                            m5, m5                   ; m5 = dedicated zero
   DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
   lea                         coeffq, [  coeffq+ncoeffq*4]
@@ -78,9 +75,26 @@
   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
   paddw                           m8, m6                   ; m8 += m6
   paddw                          m13, m11                  ; m13 += m11
+  %ifidn %1, b_32x32
+  pmullw                          m5, m8, m4               ; store the lower 16 bits of m8*qsh
+  %endif
   pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
+  %ifidn %1, b_32x32
+  psllw                           m8, 1
+  psrlw                           m5, 15
+  por                             m8, m5
+  %endif
   punpckhqdq                      m4, m4
+  %ifidn %1, b_32x32
+  pmullw                          m5, m13, m4              ; store the lower 16 bits of m13*qsh
+  %endif
   pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  %ifidn %1, b_32x32
+  psllw                          m13, 1
+  psrlw                           m5, 15
+  por                            m13, m5
+  pxor                            m5, m5                   ; reset m5 to zero register
+  %endif
   psignw                          m8, m9                   ; m8 = reinsert sign
   psignw                         m13, m10                  ; m13 = reinsert sign
   pand                            m8, m7
@@ -117,7 +131,7 @@
   psignw                          m8, m9
   psignw                         m13, m10
 %endif
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
   mova                            m11, m8
   mova                            m6, m8
   pcmpgtw                         m5, m8
@@ -169,12 +183,28 @@
   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
   paddw                          m14, m6                   ; m14 += m6
   paddw                          m13, m11                  ; m13 += m11
+  %ifidn %1, b_32x32
+  pmullw                          m5, m14, m4              ; store the lower 16 bits of m14*qsh
+  %endif
   pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
+  %ifidn %1, b_32x32
+  psllw                          m14, 1
+  psrlw                           m5, 15
+  por                            m14, m5
+  pmullw                          m5, m13, m4              ; store the lower 16 bits of m13*qsh
+  %endif
   pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  %ifidn %1, b_32x32
+  psllw                          m13, 1
+  psrlw                           m5, 15
+  por                            m13, m5
+  pxor                            m5, m5                   ; reset m5 to zero register
+  %endif
   psignw                         m14, m9                   ; m14 = reinsert sign
   psignw                         m13, m10                  ; m13 = reinsert sign
   pand                           m14, m7
   pand                           m13, m12
+
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pxor                           m11, m11
   mova                           m11, m14
@@ -207,7 +237,7 @@
   psignw                         m13, m10
 %endif
 
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
   mova                           m11, m14
   mova                            m6, m14
   pcmpgtw                         m5, m14
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c
index a7baaf8..21ab4db 100644
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c
@@ -289,13 +289,10 @@
                        sc->iscan);
         break;
       case 1:
-        // TODO(any): there is a bug in current ssse3 and avx optimizations
-        // (refer to libaom issue 2240), which needs to be fixed. Use c version
-        // for now.
-        aom_quantize_b_32x32_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                               p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                               dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                               sc->iscan);
+        aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                             p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                             dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                             sc->iscan);
         break;
       case 2:
         aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc
index 554d0c7..8dee864 100644
--- a/test/quantize_func_test.cc
+++ b/test/quantize_func_test.cc
@@ -286,19 +286,29 @@
   const int16_t *quant_shift = qtab_->quant.y_quant_shift[q];
   const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q];
   const int kNumTests = 5000000;
-  aom_usec_timer timer;
+  aom_usec_timer timer, simd_timer;
 
   FillCoeffRandom();
 
   aom_usec_timer_start(&timer);
   for (int n = 0; n < kNumTests; ++n) {
-    quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff,
-           dqcoeff, dequant, eob, sc->scan, sc->iscan);
+    quant_ref_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift,
+               qcoeff, dqcoeff, dequant, eob, sc->scan, sc->iscan);
   }
   aom_usec_timer_mark(&timer);
 
+  aom_usec_timer_start(&simd_timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff,
+           dqcoeff, dequant, eob, sc->scan, sc->iscan);
+  }
+  aom_usec_timer_mark(&simd_timer);
+
   const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-  printf("Elapsed time: %d us\n", elapsed_time);
+  const int simd_elapsed_time =
+      static_cast<int>(aom_usec_timer_elapsed(&simd_timer));
+  printf("c_time = %d \t simd_time = %d \t Gain = %d \n", elapsed_time,
+         simd_elapsed_time, (elapsed_time / simd_elapsed_time));
 }
 
 using ::testing::make_tuple;
@@ -398,14 +408,9 @@
 INSTANTIATE_TEST_CASE_P(
     SSSE3, QuantizeTest,
     ::testing::Values(make_tuple(&aom_quantize_b_c, &aom_quantize_b_ssse3,
-                                 TX_16X16, TYPE_B, AOM_BITS_8)));
-
-// Like libvpx, the ssse3 and avx quantize tests do not pass.
-// https://bugs.chromium.org/p/webm/issues/detail?id=1448
-INSTANTIATE_TEST_CASE_P(
-    DISABLED_SSSE3_32x32, QuantizeTest,
-    ::testing::Values(make_tuple(&aom_quantize_b_32x32_c,
-                                 &aom_quantize_b_32x32_ssse3, TX_16X16, TYPE_B,
+                                 TX_16X16, TYPE_B, AOM_BITS_8),
+                      make_tuple(&aom_quantize_b_32x32_c,
+                                 &aom_quantize_b_32x32_ssse3, TX_32X32, TYPE_B,
                                  AOM_BITS_8)));
 
 #endif  // HAVE_SSSE3 && ARCH_X86_64
@@ -413,13 +418,11 @@
 #if HAVE_AVX && ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(
     AVX, QuantizeTest,
-    ::testing::Values(
-        make_tuple(&aom_quantize_b_c, &aom_quantize_b_avx, TX_16X16, TYPE_B,
-                   AOM_BITS_8),
-        // Although these tests will not pass against _c, test them against each
-        // other so there is some minor checking.
-        make_tuple(&aom_quantize_b_32x32_ssse3, &aom_quantize_b_32x32_avx,
-                   TX_32X32, TYPE_B, AOM_BITS_8)));
+    ::testing::Values(make_tuple(&aom_quantize_b_c, &aom_quantize_b_avx,
+                                 TX_16X16, TYPE_B, AOM_BITS_8),
+                      make_tuple(&aom_quantize_b_32x32_c,
+                                 &aom_quantize_b_32x32_avx, TX_32X32, TYPE_B,
+                                 AOM_BITS_8)));
 
 #endif  // HAVE_AVX && ARCH_X86_64
 }  // namespace