Fix bug in av1_highbd_quantize_fp_avx2/sse4

1. Both av1_highbd_quantize_fp_avx2 and the sse4 version were
missing the comparison of abs_coeff << (1 + log_scale) against
dequant. When abs_coeff << (1 + log_scale) is less than dequant,
the output should be zero.

2. Fix the AV1QuantizeTest test case: the input data
arrays need to be extended to 8 elements for SIMD.

3. Re-enable the sse4_1 versions of
AV1QuantizeTest.BitExactCheck and AV1QuantizeTest.EobVerify,
and add avx2 versions of both tests.

4. Re-enable the SIMD optimization. The encoder is about 4.2%
faster when encoding 10 frames of city_cif with
CONFIG_LOWBITDEPTH set to 0. Test setup:

a) gcc (Ubuntu 7.2.0-8ubuntu3.2) 7.2.0
b) CPU: Intel(R) Core(TM) i7-6900K CPU @ 3.20GHz
c) Config cmd
cmake ../ -DENABLE_CCACHE=1 -DCONFIG_LOWBITDEPTH=0
d) Test cmd:
./aomenc --cpu-used=1 --end-usage=vbr \
--target-bitrate=800 --limit=10


Change-Id: I05c2203ccdf078efa3a0cf227c03797859c355e3
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c
index 119b4f4..757c9f0 100644
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c
@@ -405,13 +405,10 @@
           p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, qparam->log_scale);
       return;
     }
-
-    // TODO(yunqing): modify the AVX2 version to match the c version, and
-    // then turn on it and also enable its unit test.
-    av1_highbd_quantize_fp_c(
-        coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_fp_QTX,
-        p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
-        p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, qparam->log_scale);
+    av1_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
+                           p->round_fp_QTX, p->quant_fp_QTX, p->quant_shift_QTX,
+                           qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+                           sc->scan, sc->iscan, qparam->log_scale);
   }
 }
 
diff --git a/av1/encoder/x86/av1_highbd_quantize_avx2.c b/av1/encoder/x86/av1_highbd_quantize_avx2.c
index 52d3437..20d4160 100644
--- a/av1/encoder/x86/av1_highbd_quantize_avx2.c
+++ b/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -59,6 +59,9 @@
   q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
   q_hi = _mm256_slli_epi64(q_hi, 32);
   q = _mm256_or_si256(q_lo, q_hi);
+  const __m256i abs_s = _mm256_slli_epi32(abs, 1 + log_scale);
+  const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
+  q = _mm256_andnot_si256(mask, q);
 
   __m256i dq = _mm256_mullo_epi32(q, qp[2]);
   dq = _mm256_srai_epi32(dq, log_scale);
diff --git a/av1/encoder/x86/av1_highbd_quantize_sse4.c b/av1/encoder/x86/av1_highbd_quantize_sse4.c
index 3ca7581..da8bb37 100644
--- a/av1/encoder/x86/av1_highbd_quantize_sse4.c
+++ b/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -36,6 +36,8 @@
   qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
   dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
   dquan[0] = _mm_srli_epi64(dquan[0], scale);
+  const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale);
+  qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]);
 }
 
 // Coefficient quantization phase 2
@@ -70,7 +72,8 @@
 
   qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
   dquan[0] = _mm_sign_epi32(dquan[0], *sign);
-
+  qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]);
+  dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]);
   _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
   _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
 }
@@ -113,7 +116,7 @@
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan, int log_scale) {
-  __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign;
+  __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign;
   __m128i eob = _mm_setzero_si128();
   const tran_low_t *src = coeff_ptr;
   tran_low_t *quanAddr = qcoeff_ptr;
@@ -137,6 +140,8 @@
     qparam[0] = _mm_set_epi32(round1, round1, round1, round0);
     qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[0]);
     qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[0]);
+    qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1],
+                              dequant_ptr[0]);
 
     // DC and first 3 AC
     quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
@@ -146,7 +151,7 @@
     qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
     qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[1]);
     qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[1]);
-
+    qparam[3] = _mm_set1_epi32(dequant_ptr[1]);
     quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
                           log_scale, quanAddr, dquanAddr);
 
diff --git a/test/av1_quantize_test.cc b/test/av1_quantize_test.cc
index 86a48f2..f594a64 100644
--- a/test/av1_quantize_test.cc
+++ b/test/av1_quantize_test.cc
@@ -50,15 +50,15 @@
   void RunQuantizeTest() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
-    DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
-    DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
-    DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
-    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+    DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, round_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[8]);
     DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
-    DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+    DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]);
     uint16_t eob;
     uint16_t ref_eob;
     int err_count_total = 0;
@@ -86,7 +86,13 @@
         quant_ptr[j] = (1 << 16) / dequant_ptr[j];
         round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
       }
-
+      for (int j = 2; j < 8; ++j) {
+        zbin_ptr[j] = zbin_ptr[1];
+        quant_shift_ptr[j] = quant_shift_ptr[1];
+        dequant_ptr[j] = dequant_ptr[1];
+        quant_ptr[j] = quant_ptr[1];
+        round_ptr[j] = round_ptr[1];
+      }
       quanFuncRef(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
                   quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
                   &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
@@ -99,7 +105,7 @@
       for (int j = 0; j < count; ++j) {
         err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
                      (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
-        EXPECT_EQ(ref_qcoeff_ptr[j], qcoeff_ptr[j])
+        ASSERT_EQ(ref_qcoeff_ptr[j], qcoeff_ptr[j])
             << "qcoeff error: i = " << i << " j = " << j << "\n";
         EXPECT_EQ(ref_dqcoeff_ptr[j], dqcoeff_ptr[j])
             << "dqcoeff error: i = " << i << " j = " << j << "\n";
@@ -120,15 +126,15 @@
   void RunEobTest() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
-    DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
-    DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
-    DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
-    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+    DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, round_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[8]);
     DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
-    DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+    DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]);
     uint16_t eob;
     uint16_t ref_eob;
     int skip_block = 0;
@@ -157,6 +163,13 @@
         quant_ptr[j] = (1 << 16) / dequant_ptr[j];
         round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
       }
+      for (int j = 2; j < 8; ++j) {
+        zbin_ptr[j] = zbin_ptr[1];
+        quant_shift_ptr[j] = quant_shift_ptr[1];
+        dequant_ptr[j] = dequant_ptr[1];
+        quant_ptr[j] = quant_ptr[1];
+        round_ptr[j] = round_ptr[1];
+      }
 
       quanFuncRef(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
                   quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
@@ -191,8 +204,8 @@
   QuantizeFuncParams params_;
 };
 
-TEST_P(AV1QuantizeTest, DISABLED_BitExactCheck) { RunQuantizeTest(); }
-TEST_P(AV1QuantizeTest, DISABLED_EobVerify) { RunEobTest(); }
+TEST_P(AV1QuantizeTest, BitExactCheck) { RunQuantizeTest(); }
+TEST_P(AV1QuantizeTest, EobVerify) { RunEobTest(); }
 
 #if HAVE_SSE4_1
 const QuantizeFuncParams qfps[4] = {
@@ -208,4 +221,20 @@
 
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1QuantizeTest, ::testing::ValuesIn(qfps));
 #endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+const QuantizeFuncParams qfps_avx2[4] = {
+  QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+                     16),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+                     64),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+                     256),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+                     1024),
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, AV1QuantizeTest, ::testing::ValuesIn(qfps_avx2));
+#endif  // HAVE_AVX2
+
 }  // namespace