AVX2: Remove some permutes in av1_quantize_lp_avx2 for TX_4X4

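When n_coeffs is 16 (TX_4X4), the first 16-wide pass covers every
coefficient, so the permutes that overwrite the DC constants with the
AC constants, and the AC-only loop, are now skipped.

Performance: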
| SPD_SET |  TESTSET   | AVG_PSNR | OVR_PSNR |  SSIM   | ENC_T |
|---------|------------|----------|----------|---------|-------|
|    5    |    rtc     | +0.000%  | +0.000%  | +0.000% | +0.0% |
|    5    | rtc_1080p  | +0.000%  | +0.000%  | +0.000% | -0.2% |
|    5    |  rtc_derf  | +0.000%  | +0.000%  | +0.000% | -0.0% |
|    5    | rtc_screen | +0.000%  | +0.000%  | +0.000% | -0.1% |
|---------|------------|----------|----------|---------|-------|
|    6    |    rtc     | +0.000%  | +0.000%  | +0.000% | +0.0% |
|    6    | rtc_1080p  | +0.000%  | +0.000%  | +0.000% | -0.2% |
|    6    |  rtc_derf  | +0.000%  | +0.000%  | +0.000% | +0.0% |
|    6    | rtc_screen | +0.000%  | +0.000%  | +0.000% | -0.1% |
|---------|------------|----------|----------|---------|-------|
|    7    |    rtc     | +0.000%  | +0.000%  | +0.000% | -0.2% |
|    7    | rtc_1080p  | +0.000%  | +0.000%  | +0.000% | -0.2% |
|    7    |  rtc_derf  | +0.000%  | +0.000%  | +0.000% | -0.2% |
|    7    | rtc_screen | +0.000%  | +0.000%  | +0.000% | -0.2% |
|---------|------------|----------|----------|---------|-------|
|    8    |    rtc     | +0.000%  | +0.000%  | +0.000% | -0.2% |
|    8    | rtc_1080p  | +0.000%  | +0.000%  | +0.000% | -0.2% |
|    8    |  rtc_derf  | +0.000%  | +0.000%  | +0.000% | -0.2% |
|    8    | rtc_screen | +0.000%  | +0.000%  | +0.000% | -0.2% |
|---------|------------|----------|----------|---------|-------|
|    9    |    rtc     | +0.000%  | +0.000%  | +0.000% | -0.2% |
|    9    | rtc_1080p  | +0.000%  | +0.000%  | +0.000% | -0.2% |
|    9    |  rtc_derf  | +0.000%  | +0.000%  | +0.000% | -0.2% |
|    9    | rtc_screen | +0.000%  | +0.000%  | +0.000% | -0.1% |
|---------|------------|----------|----------|---------|-------|
|   10    |    rtc     | +0.000%  | +0.000%  | +0.000% | -0.2% |
|   10    | rtc_1080p  | +0.000%  | +0.000%  | +0.000% | -0.2% |
|   10    |  rtc_derf  | +0.000%  | +0.000%  | +0.000% | -0.2% |
|   10    | rtc_screen | +0.000%  | +0.000%  | +0.000% | -0.0% |

Change-Id: I5e838985c5ec03a604cd4609ab9d938f2d866cf7
diff --git a/av1/encoder/x86/av1_quantize_avx2.c b/av1/encoder/x86/av1_quantize_avx2.c
index 8c4e395..75c5172 100644
--- a/av1/encoder/x86/av1_quantize_avx2.c
+++ b/av1/encoder/x86/av1_quantize_avx2.c
@@ -98,6 +98,28 @@
   return _mm_extract_epi16(eob, 1);
 }
 
+static AOM_FORCE_INLINE void quantize_lp_16_first(
+    const int16_t *coeff_ptr, const int16_t *iscan_ptr, int16_t *qcoeff_ptr,
+    int16_t *dqcoeff_ptr, __m256i *round256, __m256i *quant256,
+    __m256i *dequant256, __m256i *eob) {  // Quantize 16 coeffs at offset 0.
+  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+  const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round256);
+  const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant256);
+  const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);  // restore sign
+  const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant256);
+  const __m256i nz_mask =
+      _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+  // Unaligned stores of the 16 quantized and 16 dequantized values.
+  _mm256_storeu_si256((__m256i *)qcoeff_ptr, qcoeff);
+  _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dqcoeff);
+  // nz_mask is 0 or -1 per lane: iscan - nz_mask is iscan + 1 if qcoeff != 0.
+  const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+  const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, nz_mask);
+  const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, nz_mask);
+  *eob = _mm256_max_epi16(*eob, nz_iscan);  // Per-lane running maximum.
+}
+
 static AOM_FORCE_INLINE void quantize_lp_16(
     const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *iscan_ptr,
     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, __m256i *round256,
@@ -143,29 +165,21 @@
   quant256 = _mm256_permute4x64_epi64(quant256, 0x54);
   dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54);
 
-  coeff_ptr += n_coeffs;
-  iscan += n_coeffs;
-  qcoeff_ptr += n_coeffs;
-  dqcoeff_ptr += n_coeffs;
-  n_coeffs = -n_coeffs;
-
   // Process DC and the first 15 AC coeffs.
-  quantize_lp_16(coeff_ptr, n_coeffs, iscan, qcoeff_ptr, dqcoeff_ptr, &round256,
-                 &quant256, &dequant256, &eob256);
+  quantize_lp_16_first(coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &round256,
+                       &quant256, &dequant256, &eob256);
 
-  // Overwrite the DC constants with AC constants
-  dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
-  quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
-  round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
+  if (n_coeffs > 16) {
+    // Overwrite the DC constants with AC constants
+    dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
+    quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
+    round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
 
-  n_coeffs += 8 * 2;
-
-  // AC only loop.
-  while (n_coeffs < 0) {
-    quantize_lp_16(coeff_ptr, n_coeffs, iscan, qcoeff_ptr, dqcoeff_ptr,
-                   &round256, &quant256, &dequant256, &eob256);
-
-    n_coeffs += 8 * 2;
+    // AC only loop.
+    for (int idx = 16; idx < n_coeffs; idx += 16) {
+      quantize_lp_16(coeff_ptr, idx, iscan, qcoeff_ptr, dqcoeff_ptr, &round256,
+                     &quant256, &dequant256, &eob256);
+    }
   }
 
   *eob_ptr = accumulate_eob256(eob256);