SIMD updates & unit testing

Change-Id: I6e35237c3045d0b421300ae6f62c3fcc9d1f7e70
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 9ad5d4b..a1be01d 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -551,7 +551,7 @@
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   if (aom_config("CONFIG_EXTQUANT") eq "yes") {
     add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/aom_highbd_quantize_b sse2 avx2/;
+    specialize qw/aom_highbd_quantize_b sse2/;
 
 	add_proto qw/void aom_highbd_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 
diff --git a/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/aom_dsp/x86/highbd_quantize_intrin_sse2.c
index 4f48663..13d6530 100644
--- a/aom_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -89,7 +89,11 @@
         int k = 4 * i + j;
         const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
         const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
+#if CONFIG_EXTQUANT
+        const int32_t abs_qcoeff =
+#else
         const uint32_t abs_qcoeff =
+#endif
             (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
         qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
 #if CONFIG_EXTQUANT
diff --git a/av1/encoder/x86/av1_highbd_quantize_avx2.c b/av1/encoder/x86/av1_highbd_quantize_avx2.c
index bed144e..b63a9a6 100644
--- a/av1/encoder/x86/av1_highbd_quantize_avx2.c
+++ b/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -74,7 +74,7 @@
                             __m256i *eob) {
   const __m256i abs_coeff = _mm256_abs_epi32(*c);
 #if CONFIG_EXTQUANT
-  const __m256i round = _mm256_set1_epi64x((1 << QUANT_TABLE_BITS) >> 1);
+  const __m256i round = _mm256_set1_epi32((1 << QUANT_TABLE_BITS) >> 1);
 #endif
   __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);
 
diff --git a/test/av1_quantize_test.cc b/test/av1_quantize_test.cc
index 5714ba4..d319f1e 100644
--- a/test/av1_quantize_test.cc
+++ b/test/av1_quantize_test.cc
@@ -21,13 +21,21 @@
 #include "av1/common/scan.h"
 
 namespace {
-
+#if CONFIG_EXTQUANT
+typedef void (*QuantizeFpFunc)(
+    const tran_low_t *coeff_ptr, intptr_t count, const int32_t *zbin_ptr,
+    const int32_t *round_ptr, const int32_t *quant_ptr,
+    const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, int log_scale);
+#else
 typedef void (*QuantizeFpFunc)(
     const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan, int log_scale);
+#endif
 
 struct QuantizeFuncParams {
   QuantizeFuncParams(QuantizeFpFunc qF = NULL, QuantizeFpFunc qRefF = NULL,
@@ -42,14 +50,31 @@
 
 const int numTests = 1000;
 const int maxSize = 1024;
+#if CONFIG_EXTQUANT
+const int roundFactorRange = 64;
+const int dequantRange = 1048576;
+#else
 const int roundFactorRange = 127;
 const int dequantRange = 32768;
+#endif
 const int coeffRange = (1 << 20) - 1;
 
 class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
  public:
   void RunQuantizeTest() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
+#if CONFIG_EXTQUANT
+    DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int32_t, zbin_ptr[8]);
+    DECLARE_ALIGNED(16, int32_t, round_ptr[8]);
+    DECLARE_ALIGNED(16, int32_t, quant_ptr[8]);
+    DECLARE_ALIGNED(16, int32_t, quant_shift_ptr[8]);
+    DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int32_t, dequant_ptr[8]);
+#else
     DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]);
     DECLARE_ALIGNED(16, int16_t, round_ptr[8]);
@@ -60,6 +85,7 @@
     DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]);
+#endif
     uint16_t eob;
     uint16_t ref_eob;
     int err_count_total = 0;
@@ -79,12 +105,23 @@
       }
 
       for (int j = 0; j < 2; j++) {
+#if CONFIG_EXTQUANT
+        zbin_ptr[j] = rnd.Rand31();
+        quant_shift_ptr[j] = rnd.Rand31();
+        // int32_t positive
+        dequant_ptr[j] = abs(rnd(dequantRange));
+        quant_ptr[j] = static_cast<int32_t>(
+            (1 << (16 + QUANT_FP_BITS + QUANT_TABLE_BITS)) / dequant_ptr[j]);
+        round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >>
+                       (7 + QUANT_TABLE_BITS);
+#else
         zbin_ptr[j] = rnd.Rand16();
         quant_shift_ptr[j] = rnd.Rand16();
         // int16_t positive
         dequant_ptr[j] = abs(rnd(dequantRange));
         quant_ptr[j] = static_cast<int16_t>((1 << 16) / dequant_ptr[j]);
         round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
+#endif
       }
       for (int j = 2; j < 8; ++j) {
         zbin_ptr[j] = zbin_ptr[1];
@@ -125,6 +162,18 @@
 
   void RunEobTest() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
+#if CONFIG_EXTQUANT
+    DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int32_t, zbin_ptr[8]);
+    DECLARE_ALIGNED(16, int32_t, round_ptr[8]);
+    DECLARE_ALIGNED(16, int32_t, quant_ptr[8]);
+    DECLARE_ALIGNED(16, int32_t, quant_shift_ptr[8]);
+    DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int32_t, dequant_ptr[8]);
+#else
     DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]);
     DECLARE_ALIGNED(16, int16_t, round_ptr[8]);
@@ -135,6 +184,7 @@
     DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]);
+#endif
     uint16_t eob;
     uint16_t ref_eob;
     int count = params_.coeffCount;
@@ -155,12 +205,23 @@
       coeff_ptr[rnd(count)] = rnd(coeffRange);
 
       for (int j = 0; j < 2; j++) {
+#if CONFIG_EXTQUANT
+        zbin_ptr[j] = rnd.Rand31();
+        quant_shift_ptr[j] = rnd.Rand31();
+        // int32_t positive
+        dequant_ptr[j] = abs(rnd(dequantRange));
+        quant_ptr[j] =
+            ((1 << (16 + QUANT_FP_BITS + QUANT_TABLE_BITS)) / dequant_ptr[j]);
+        round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >>
+                       (7 + QUANT_TABLE_BITS);
+#else
         zbin_ptr[j] = rnd.Rand16();
         quant_shift_ptr[j] = rnd.Rand16();
         // int16_t positive
         dequant_ptr[j] = abs(rnd(dequantRange));
         quant_ptr[j] = (1 << 16) / dequant_ptr[j];
         round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
+#endif
       }
       for (int j = 2; j < 8; ++j) {
         zbin_ptr[j] = zbin_ptr[1];
@@ -207,7 +268,7 @@
 TEST_P(AV1QuantizeTest, BitExactCheck) { RunQuantizeTest(); }
 TEST_P(AV1QuantizeTest, EobVerify) { RunEobTest(); }
 
-#if HAVE_SSE4_1 && !CONFIG_EXTQUANT
+#if HAVE_SSE4_1 && CONFIG_EXTQUANT
 const QuantizeFuncParams qfps[4] = {
   QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
                      16),
@@ -222,7 +283,7 @@
 INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1QuantizeTest, ::testing::ValuesIn(qfps));
 #endif  // HAVE_SSE4_1
 
-#if HAVE_AVX2 && !CONFIG_EXTQUANT
+#if HAVE_AVX2 && CONFIG_EXTQUANT
 const QuantizeFuncParams qfps_avx2[4] = {
   QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
                      16),
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc
index c1e5539..ba2698b 100644
--- a/test/quantize_func_test.cc
+++ b/test/quantize_func_test.cc
@@ -439,6 +439,38 @@
 
 INSTANTIATE_TEST_SUITE_P(AVX2, QuantizeTest,
                          ::testing::ValuesIn(kQParamArrayAvx2));
+#elif HAVE_AVX2 && CONFIG_EXTQUANT
+const QuantizeParam kQParamArrayAvx2[] = {
+  make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
+             &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
+  make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
+             &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_10),
+  make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
+             &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_12),
+  make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>,
+             &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8),
+  make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>,
+             &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_10),
+  make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>,
+             &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_12),
+  make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
+             &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8),
+  make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
+             &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_10),
+  make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
+             &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_12),
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, QuantizeTest,
+                         ::testing::ValuesIn(kQParamArrayAvx2));
 #endif  // HAVE_AVX2
 
 #if HAVE_SSE2 && !CONFIG_EXTQUANT
@@ -528,9 +560,33 @@
 
 INSTANTIATE_TEST_SUITE_P(SSE2, QuantizeTest,
                          ::testing::ValuesIn(kQParamArraySSE2));
+#elif HAVE_SSE2 && CONFIG_EXTQUANT
+const QuantizeParam kQParamArraySSE2[] = {
+  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_10),
+  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12),
+  make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_10),
+  make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12),
+  make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_10),
+  make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_12)
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, QuantizeTest,
+                         ::testing::ValuesIn(kQParamArraySSE2));
 #endif
 
-#if HAVE_NEON
+#if HAVE_NEON && !CONFIG_EXTQUANT
 const QuantizeParam kQParamArrayNEON[] = {
   make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
              static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),