HBD fast path quantization speed improvement

- HBD (high bit depth) encoder speed improvement (SSE4.1):
  With CONFIG_VP9_HIGHBITDEPTH enabled, on a Xeon E5-2680,
  encoding 50 frames of park_joy_1080p at 12-bit,
  encoding time is reduced from 4846481 ms to 4177471 ms (about 13.8%).
- Add unit tests to verify bit-exactness against the C version and the EOB
  calculation.

Change-Id: I08e8ef3549ddad5ab36d86e78557df3b288537ea
diff --git a/test/test.mk b/test/test.mk
index cdef53c..94091f0 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -198,6 +198,7 @@
 endif
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+LIBVPX_TEST_SRCS-$(HAVE_SSE4_1) += vp10_quantize_test.cc
 LIBVPX_TEST_SRCS-$(HAVE_SSE4_1) += vp10_highbd_iht_test.cc
 endif # CONFIG_VP9_HIGHBITDEPTH
 endif # VP10
diff --git a/test/vp10_quantize_test.cc b/test/vp10_quantize_test.cc
new file mode 100644
index 0000000..f8bbb25
--- /dev/null
+++ b/test/vp10_quantize_test.cc
@@ -0,0 +1,230 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vp10_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "vp10/common/scan.h"
+
+namespace {
+
+typedef void (*QuantizeFpFunc)(const tran_low_t *coeff_ptr, intptr_t count,
+                               int skip_block, const int16_t *zbin_ptr,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr,
+                               const int16_t *quant_shift_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan,
+                               const int log_scale);
+
+struct QuantizeFuncParams {
+  QuantizeFuncParams(QuantizeFpFunc qF = NULL, QuantizeFpFunc qRefF = NULL,
+                     int count = 16) : qFunc(qF), qFuncRef(qRefF),
+                                       coeffCount(count) {}
+  QuantizeFpFunc qFunc;  // implementation under test
+  QuantizeFpFunc qFuncRef;  // implementation that supplies expected output
+  int coeffCount;  // coefficients passed per call (16 by default)
+};
+
+using libvpx_test::ACMRandom;
+
+const int numTests = 1000;  // random trials run by each test
+const int maxSize = 1024;  // capacity of the coefficient buffers
+const int roundFactorRange = 127;  // bound passed to rnd() for rounding
+const int dequantRange = 32768;  // bound passed to rnd() for dequant values
+const int coeffRange = (1 << 20) - 1;  // bound passed to rnd() for inputs
+
+class VP10QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
+ public:
+  // Feeds numTests random blocks to both quantizers and expects identical
+  // qcoeff, dqcoeff and eob from the tested and the reference function.
+  void RunQuantizeTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
+    DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
+    DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
+    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+    DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+    uint16_t eob;
+    uint16_t ref_eob;
+    int err_count_total = 0;
+    int first_failure = -1;
+    int skip_block = 0;
+    int count = params_.coeffCount;
+    const TX_SIZE txSize = getTxSize(count);
+    int log_scale = (txSize == TX_32X32);
+    QuantizeFpFunc quanFunc = params_.qFunc;
+    QuantizeFpFunc quanFuncRef = params_.qFuncRef;
+
+    const scan_order scanOrder = vp10_default_scan_orders[txSize];
+    for (int i = 0; i < numTests; i++) {
+      int err_count = 0;
+      ref_eob = eob = -1;
+      for (int j = 0; j < count; j++) {
+        coeff_ptr[j] = rnd(coeffRange);
+      }
+
+      for (int j = 0; j < 2; j++) {
+        zbin_ptr[j] = rnd.Rand16();
+        quant_shift_ptr[j] = rnd.Rand16();
+        // Positive int16_t; 0 is bumped to 1 because it is the divisor below.
+        dequant_ptr[j] = abs(rnd(dequantRange));
+        if (dequant_ptr[j] == 0) dequant_ptr[j] = 1;
+        quant_ptr[j] = (1 << 16) / dequant_ptr[j];
+        round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
+      }
+
+      quanFuncRef(coeff_ptr, count, skip_block, zbin_ptr,
+                  round_ptr, quant_ptr, quant_shift_ptr,
+                  ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
+                  &ref_eob, scanOrder.scan, scanOrder.iscan,
+                  log_scale);
+
+      ASM_REGISTER_STATE_CHECK(quanFunc(coeff_ptr, count, skip_block, zbin_ptr,
+                                        round_ptr, quant_ptr, quant_shift_ptr,
+                                        qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                                        &eob, scanOrder.scan, scanOrder.iscan,
+                                        log_scale));
+
+      for (int j = 0; j < count; ++j) {
+        err_count += (ref_qcoeff_ptr[j]  != qcoeff_ptr[j]) |
+            (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+        EXPECT_EQ(ref_qcoeff_ptr[j], qcoeff_ptr[j])
+            << "qcoeff error: i = " << i << " j = " << j << "\n";
+        EXPECT_EQ(ref_dqcoeff_ptr[j], dqcoeff_ptr[j])
+            << "dqcoeff error: i = " << i << " j = " << j << "\n";
+      }
+      EXPECT_EQ(ref_eob, eob) << "eob error: " << "i = " << i << "\n";
+      err_count += (ref_eob != eob);
+      if (err_count && !err_count_total) first_failure = i;
+      err_count_total += err_count;
+    }
+    EXPECT_EQ(0, err_count_total)
+        << "Error: Quantization Test, C output doesn't match SSE4.1 output. "
+        << "First failed at test case " << first_failure;
+  }
+
+  // Quantizes blocks that are all zero except for three randomly placed
+  // random values and expects the tested and reference eob to be equal.
+  void RunEobTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
+    DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
+    DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
+    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+    DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+    uint16_t eob;
+    uint16_t ref_eob;
+    int skip_block = 0;
+    int count = params_.coeffCount;
+    const TX_SIZE txSize = getTxSize(count);
+    int log_scale = (txSize == TX_32X32);
+    QuantizeFpFunc quanFunc = params_.qFunc;
+    QuantizeFpFunc quanFuncRef = params_.qFuncRef;
+    const scan_order scanOrder = vp10_default_scan_orders[txSize];
+
+    for (int i = 0; i < numTests; i++) {
+      ref_eob = eob = -1;
+      for (int j = 0; j < count; j++) coeff_ptr[j] = 0;
+
+      coeff_ptr[rnd(count)] = rnd(coeffRange);
+      coeff_ptr[rnd(count)] = rnd(coeffRange);
+      coeff_ptr[rnd(count)] = rnd(coeffRange);
+
+      for (int j = 0; j < 2; j++) {
+        zbin_ptr[j] = rnd.Rand16();
+        quant_shift_ptr[j] = rnd.Rand16();
+        // Positive int16_t; 0 is bumped to 1 because it is the divisor below.
+        dequant_ptr[j] = abs(rnd(dequantRange));
+        if (dequant_ptr[j] == 0) dequant_ptr[j] = 1;
+        quant_ptr[j] = (1 << 16) / dequant_ptr[j];
+        round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
+      }
+
+      quanFuncRef(coeff_ptr, count, skip_block, zbin_ptr,
+                  round_ptr, quant_ptr, quant_shift_ptr,
+                  ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
+                  &ref_eob, scanOrder.scan, scanOrder.iscan,
+                  log_scale);
+
+      ASM_REGISTER_STATE_CHECK(quanFunc(coeff_ptr, count, skip_block, zbin_ptr,
+                                        round_ptr, quant_ptr, quant_shift_ptr,
+                                        qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                                        &eob, scanOrder.scan, scanOrder.iscan,
+                                        log_scale));
+      EXPECT_EQ(ref_eob, eob) << "eob error: " << "i = " << i << "\n";
+    }
+  }
+
+  virtual void SetUp() {
+    params_ = GetParam();  // cache the function pair and block size
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();  // runs after every test body
+  }
+
+  virtual ~VP10QuantizeTest() {}
+
+ private:
+  // Maps a coefficient count to a transform size; unknown counts give TX_4X4.
+  TX_SIZE getTxSize(int count) {
+    switch (count) {
+      case 64:
+        return TX_8X8;
+      case 256:
+        return TX_16X16;
+      case 1024:
+        return TX_32X32;
+      default:  // 16, and any count not listed above
+        return TX_4X4;
+    }
+  }
+
+  QuantizeFuncParams params_;
+};
+
+TEST_P(VP10QuantizeTest, BitExactCheck) {  // compares qcoeff, dqcoeff and eob
+  RunQuantizeTest();
+}
+TEST_P(VP10QuantizeTest, EobVerify) {  // compares eob only, sparse input
+  RunEobTest();
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(  // SSE4.1 against C, one case per coefficient count
+    SSE4_1, VP10QuantizeTest,
+    ::testing::Values(QuantizeFuncParams(&vp10_highbd_quantize_fp_sse4_1,
+                                         &vp10_highbd_quantize_fp_c, 16),
+                      QuantizeFuncParams(&vp10_highbd_quantize_fp_sse4_1,
+                                         &vp10_highbd_quantize_fp_c, 64),
+                      QuantizeFuncParams(&vp10_highbd_quantize_fp_sse4_1,
+                                         &vp10_highbd_quantize_fp_c, 256),
+                      QuantizeFuncParams(&vp10_highbd_quantize_fp_sse4_1,
+                                         &vp10_highbd_quantize_fp_c, 1024)));
+#endif  // HAVE_SSE4_1
+}  // namespace
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index d7cf6e0..85e6cc0 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -690,7 +690,7 @@
   specialize qw/vp10_highbd_block_error sse2/;
 
   add_proto qw/void vp10_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
-  specialize qw/vp10_highbd_quantize_fp/;
+  specialize qw/vp10_highbd_quantize_fp sse4_1/;
 
   add_proto qw/void vp10_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
   specialize qw/vp10_highbd_quantize_b/;
diff --git a/vp10/encoder/x86/vp10_highbd_quantize_sse4.c b/vp10/encoder/x86/vp10_highbd_quantize_sse4.c
new file mode 100644
index 0000000..a2ed7a9
--- /dev/null
+++ b/vp10/encoder/x86/vp10_highbd_quantize_sse4.c
@@ -0,0 +1,200 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>
+#include <stdint.h>
+
+#include "./vp10_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Phase 1: splits 4 coeffs into sign and magnitude, adds param[0] (rounding),
+// then scales lanes 0-1 by param[1] (quant) and param[2] (dequant) in 64 bits.
+static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
+                                         const int shift, const int scale,
+                                         __m128i *qcoeff, __m128i *dquan,
+                                         __m128i *sign) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi32(1);
+
+  *sign = _mm_cmplt_epi32(*coeff, zero);  // -1 where coeff < 0, else 0
+  *sign = _mm_or_si128(*sign, one);  // now -1 or +1 per lane
+  *coeff = _mm_abs_epi32(*coeff);
+
+  qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
+  qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);  // lanes 2-3 (phase 2)
+  qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);  // lanes 0-1, zero-extended
+
+  qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
+  qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
+  dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
+  dquan[0] = _mm_srli_epi64(dquan[0], scale);
+}
+
+// Phase 2: quantize lanes 2-3, merge with lanes 0-1, apply signs and store.
+static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
+                                         const __m128i *sign,
+                                         const __m128i *param, const int shift,
+                                         const int scale, tran_low_t *qAddr,
+                                         tran_low_t *dqAddr) {
+  __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);  // keeps 32-bit lanes 2-3
+  __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);  // keeps 32-bit lanes 0-1
+
+  qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
+  qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
+  dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
+  dquan[1] = _mm_srli_epi64(dquan[1], scale);
+
+  // combine L&H
+  qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);  // results to lanes 0-1
+  qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);  // results to lanes 2-3
+
+  qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
+  qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
+
+  dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);  // results to lanes 0-1
+  dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);  // results to lanes 2-3
+
+  dquan[0] = _mm_and_si128(dquan[0], mask0H);
+  dquan[1] = _mm_and_si128(dquan[1], mask0L);
+
+  qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
+  dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
+
+  qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);  // negate where sign is -1
+  dquan[0] = _mm_sign_epi32(dquan[0], *sign);
+
+  _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);  // 4 quantized values
+  _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);  // 4 dequantized values
+}
+
+static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
+                            __m128i *eob) {  // 8 running maxima of iscan + 1
+  const __m128i zero = _mm_setzero_si128();
+  __m128i mask, iscanIdx;
+  const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
+  const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
+  __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero);  // -1 where q == 0
+  __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero);
+
+  nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero);  // inverted: -1 where q != 0
+  nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero);
+
+  mask = _mm_packs_epi32(nz_flag0, nz_flag1);  // 8 x int16, -1 where q != 0
+  iscanIdx = _mm_loadu_si128((__m128i const *)iscan);
+  iscanIdx = _mm_sub_epi16(iscanIdx, mask);  // iscan + 1 where q != 0
+  iscanIdx = _mm_and_si128(iscanIdx, mask);  // 0 where q == 0
+  *eob = _mm_max_epi16(*eob, iscanIdx);  // per-lane running maximum
+}
+
+static INLINE uint16_t get_accumulated_eob(__m128i *eob) {  // horizontal max
+  __m128i eob_shuffled;
+  uint16_t eobValue;
+  eob_shuffled = _mm_shuffle_epi32(*eob, 0xe);  // high 64 bits to low 64
+  *eob = _mm_max_epi16(*eob, eob_shuffled);
+  eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe);  // words 2-3 to words 0-1
+  *eob = _mm_max_epi16(*eob, eob_shuffled);
+  eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1);  // word 1 to word 0
+  *eob = _mm_max_epi16(*eob, eob_shuffled);
+  eobValue = _mm_extract_epi16(*eob, 0);  // word 0 holds the overall maximum
+  return eobValue;
+}
+
+// SSE4.1 "fp" quantizer for high bit depth. For each coefficient it stores
+// qcoeff = sign * (((|coeff| + round) * quant) >> (16 - log_scale)) and
+// dqcoeff = sign * ((|qcoeff| * dequant) >> log_scale); *eob_ptr gets the
+// largest iscan[] + 1 over non-zero qcoeff. Index 0 of round/quant/dequant
+// is used for coefficient 0 only. count must be a positive multiple of 8.
+void vp10_highbd_quantize_fp_sse4_1(const tran_low_t *coeff_ptr,
+                                    intptr_t count,
+                                    int skip_block,
+                                    const int16_t *zbin_ptr,
+                                    const int16_t *round_ptr,
+                                    const int16_t *quant_ptr,
+                                    const int16_t *quant_shift_ptr,
+                                    tran_low_t *qcoeff_ptr,
+                                    tran_low_t *dqcoeff_ptr,
+                                    const int16_t *dequant_ptr,
+                                    uint16_t *eob_ptr,
+                                    const int16_t *scan,
+                                    const int16_t *iscan,
+                                    const int log_scale) {
+  __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign;
+  __m128i eob = _mm_setzero_si128();
+  const tran_low_t *src = coeff_ptr;
+  tran_low_t *quanAddr = qcoeff_ptr;
+  tran_low_t *dquanAddr = dqcoeff_ptr;
+  const int shift = 16 - log_scale;
+  const int coeff_stride = 4;
+  const int quan_stride = coeff_stride;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)scan;
+
+  if (!skip_block) {
+    coeff[0] = _mm_loadu_si128((__m128i const *)src);
+    qparam[0] = _mm_set_epi32(round_ptr[1], round_ptr[1], round_ptr[1],
+                              round_ptr[0]);
+    qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[0]);
+    qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[0]);
+
+    // DC and first 3 AC
+    quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale,
+                          qcoeff, dequant, &coeff_sign);
+
+    // update round/quan/dquan for AC
+    qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
+    qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[1]);
+    qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[1]);
+
+    quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+                          log_scale, quanAddr, dquanAddr);
+
+    // next 4 AC
+    coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+    quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale,
+                          qcoeff, dequant, &coeff_sign);
+    quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+                          log_scale, quanAddr + quan_stride,
+                          dquanAddr + quan_stride);
+
+    find_eob(quanAddr, iscan, &eob);
+    count -= 8;
+
+    // loop for the rest of AC
+    while (count > 0) {
+      src += coeff_stride << 1;
+      quanAddr += quan_stride << 1;
+      dquanAddr += quan_stride << 1;
+      iscan += quan_stride << 1;
+
+      coeff[0] = _mm_loadu_si128((__m128i const *)src);
+      coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+      quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff,
+                            dequant, &coeff_sign);
+      quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+                            log_scale, quanAddr, dquanAddr);
+
+      quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff,
+                            dequant, &coeff_sign);
+      quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+                            log_scale, quanAddr + quan_stride,
+                            dquanAddr + quan_stride);
+
+      find_eob(quanAddr, iscan, &eob);
+      count -= 8;
+    }
+    *eob_ptr = get_accumulated_eob(&eob);
+  } else {
+    // Only this path needs zeroing; the path above stores all count outputs.
+    memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+    memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+    *eob_ptr = 0;
+  }
+}
diff --git a/vp10/vp10cx.mk b/vp10/vp10cx.mk
index 5524322..5c62d37 100644
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@@ -117,6 +117,7 @@
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP10_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_fwd_txfm_sse4.c
 VP10_CX_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_inv_txfm_sse4.c
+VP10_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp10_highbd_quantize_sse4.c
 endif
 
 ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)