Implement av1_quantize_fp_64x64_avx2

1. Implement av1_quantize_fp_64x64_avx2
2. The perf result shows the CPU time of
quantize_fp_helper_c drops from 1.01% to
0.65% (+0.11% for av1_quantize_fp_64x64_avx2)
3. Refactor: extract quant_gather_eob

a) gcc (Ubuntu 7.2.0-8ubuntu3.2) 7.2.0
b) CPU: Intel(R) Core(TM) i7-6900K CPU @ 3.20GHz
c) Config cmd:
cmake ../ -DENABLE_CCACHE=1 -DCONFIG_LOWBITDEPTH=1
d) Test cmd:
./aomenc --cpu-used=1 --end-usage=vbr \
--target-bitrate=800 --limit=10

Change-Id: Ic1d07bc2b995a71876cf61d9bcc72895592ccb59
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index df09142..4a37d23 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -173,7 +173,7 @@
   specialize qw/av1_quantize_fp_32x32 avx2/;
 
   add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-
+  specialize qw/av1_quantize_fp_64x64 avx2/;
 
   # fdct functions
 
diff --git a/av1/encoder/x86/av1_quantize_avx2.c b/av1/encoder/x86/av1_quantize_avx2.c
index 078a675..fb18829 100644
--- a/av1/encoder/x86/av1_quantize_avx2.c
+++ b/av1/encoder/x86/av1_quantize_avx2.c
@@ -57,7 +57,7 @@
   init_one_qp(&round, &qp[0]);
   init_one_qp(&quant, &qp[1]);
 
-  if (log_scale > 0) {
+  if (log_scale == 1) {
     qp[1] = _mm256_slli_epi16(qp[1], log_scale);
   }
 
@@ -94,6 +94,15 @@
     }                                            \
   } while (0)
 
+static INLINE uint16_t quant_gather_eob(__m256i eob) {
+  const __m128i eob_lo = _mm256_castsi256_si128(eob);
+  const __m128i eob_hi = _mm256_extractf128_si256(eob, 1);
+  __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi);
+  eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s);
+  eob_s = _mm_minpos_epu16(eob_s);
+  return INT16_MAX - _mm_extract_epi16(eob_s, 0);
+}
+
 static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
                             const int16_t *iscan_ptr, tran_low_t *qcoeff,
                             tran_low_t *dqcoeff, __m256i *eob) {
@@ -163,18 +172,7 @@
       iscan_ptr += step;
       n_coeffs -= step;
     }
-    {
-      __m256i eob_s;
-      eob_s = _mm256_shuffle_epi32(eob, 0xe);
-      eob = _mm256_max_epi16(eob, eob_s);
-      eob_s = _mm256_shufflelo_epi16(eob, 0xe);
-      eob = _mm256_max_epi16(eob, eob_s);
-      eob_s = _mm256_shufflelo_epi16(eob, 1);
-      eob = _mm256_max_epi16(eob, eob_s);
-      const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
-                                              _mm256_extractf128_si256(eob, 1));
-      *eob_ptr = _mm_extract_epi16(final_eob, 0);
-    }
+    *eob_ptr = quant_gather_eob(eob);
   } else {
     do {
       write_zero(qcoeff_ptr);
@@ -261,18 +259,98 @@
       iscan_ptr += step;
       n_coeffs -= step;
     }
-    {
-      __m256i eob_s;
-      eob_s = _mm256_shuffle_epi32(eob, 0xe);
-      eob = _mm256_max_epi16(eob, eob_s);
-      eob_s = _mm256_shufflelo_epi16(eob, 0xe);
-      eob = _mm256_max_epi16(eob, eob_s);
-      eob_s = _mm256_shufflelo_epi16(eob, 1);
-      eob = _mm256_max_epi16(eob, eob_s);
-      const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
-                                              _mm256_extractf128_si256(eob, 1));
-      *eob_ptr = _mm_extract_epi16(final_eob, 0);
+    *eob_ptr = quant_gather_eob(eob);
+  } else {
+    do {
+      write_zero(qcoeff_ptr);
+      write_zero(dqcoeff_ptr);
+      qcoeff_ptr += step;
+      dqcoeff_ptr += step;
+      n_coeffs -= step;
+    } while (n_coeffs > 0);
+    *eob_ptr = 0;
+  }
+}
+
+static INLINE void quantize_64x64(const __m256i *thr, const __m256i *qp,
+                                  __m256i *c, const int16_t *iscan_ptr,
+                                  tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                                  __m256i *eob) {
+  const __m256i abs = _mm256_abs_epi16(*c);
+  __m256i mask = _mm256_cmpgt_epi16(abs, *thr);
+  mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr));
+  const int nzflag = _mm256_movemask_epi8(mask);
+
+  if (nzflag) {
+    __m256i q = _mm256_adds_epi16(abs, qp[0]);
+    __m256i qh = _mm256_mulhi_epi16(q, qp[1]);
+    __m256i ql = _mm256_mullo_epi16(q, qp[1]);
+    qh = _mm256_slli_epi16(qh, 2);
+    ql = _mm256_srli_epi16(ql, 14);
+    q = _mm256_or_si256(qh, ql);
+    const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(q, qp[2]), 14);
+    const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(q, qp[2]), 2);
+    __m256i dq = _mm256_or_si256(dqh, dql);
+
+    q = _mm256_sign_epi16(q, *c);
+    dq = _mm256_sign_epi16(dq, *c);
+
+    store_two_quan(q, qcoeff, dq, dqcoeff);
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+    const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
+    const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
+    __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
+    cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
+    *eob = _mm256_max_epi16(*eob, cur_eob);
+  } else {
+    write_zero(qcoeff);
+    write_zero(dqcoeff);
+  }
+}
+
+void av1_quantize_fp_64x64_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  const unsigned int step = 16;
+
+  if (LIKELY(!skip_block)) {
+    __m256i qp[3];
+    __m256i coeff, thr;
+    const int log_scale = 2;
+
+    init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+    read_coeff(coeff_ptr, &coeff);
+
+    __m256i eob = _mm256_setzero_si256();
+    quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan_ptr += step;
+    n_coeffs -= step;
+
+    update_qp(log_scale, &thr, qp);
+
+    while (n_coeffs > 0) {
+      read_coeff(coeff_ptr, &coeff);
+      quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+                     &eob);
+
+      coeff_ptr += step;
+      qcoeff_ptr += step;
+      dqcoeff_ptr += step;
+      iscan_ptr += step;
+      n_coeffs -= step;
     }
+    *eob_ptr = quant_gather_eob(eob);
   } else {
     do {
       write_zero(qcoeff_ptr);
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc
index 9cfc52e..71c8d94 100644
--- a/test/quantize_func_test.cc
+++ b/test/quantize_func_test.cc
@@ -301,6 +301,8 @@
              AOM_BITS_8),
   make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, TX_32X32,
              TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_64x64_c, &av1_quantize_fp_64x64_avx2, TX_64X64,
+             TYPE_FP, AOM_BITS_8),
   make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
              &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, TX_16X16,
              TYPE_FP, AOM_BITS_8),