Optimize adaptive quantization modules

SSE2 variants of aom_quantize_b_adaptive and aom_quantize_b_32x32_adaptive
are optimized for improved performance.

For the speed = 3 and 4 presets, observed encode time reductions of
1.07% and 0.67% respectively (averaged across multiple test cases).

Performance improved by a factor of ~1.7x

Change-Id: Icb0769fc44b3a75b18b530c3d0ab172d8a99a0a0
diff --git a/aom_dsp/x86/adaptive_quantize_sse2.c b/aom_dsp/x86/adaptive_quantize_sse2.c
index 3822c27..fc9de59 100644
--- a/aom_dsp/x86/adaptive_quantize_sse2.c
+++ b/aom_dsp/x86/adaptive_quantize_sse2.c
@@ -22,41 +22,31 @@
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
-  const __m128i zero = _mm_setzero_si128();
   int index = 16;
-  int non_zero_count = (int)n_coeffs;
+  int non_zero_count = 0;
+  int non_zero_count_prescan_add_zero = 0;
+  int is_found0 = 0, is_found1 = 0;
+  int eob = -1;
+  const __m128i zero = _mm_setzero_si128();
   __m128i zbin, round, quant, dequant, shift;
   __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
   __m128i qcoeff0, qcoeff1;
   __m128i cmp_mask0, cmp_mask1;
-  __m128i eob = zero, eob0, prescan0, prescan1, all_zero;
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 0),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1], 0) };
+  __m128i all_zero;
+  __m128i mask0 = zero, mask1 = zero;
 
   int prescan_add[2];
-  for (int i = 0; i < 2; ++i)
+  int thresh[4];
+  const qm_val_t wt = (1 << AOM_QM_BITS);
+  for (int i = 0; i < 2; ++i) {
     prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
-
-  // max buffer is of size 256 as this functions calls with
-  // maximum n_coeffs as 256
-  int16_t prescan[256];
-  memset(prescan, -1, n_coeffs * sizeof(int16_t));
-
-  // TODO(Aniket): Experiment the following loop with intrinsic
-  for (int i = (int)n_coeffs - 1; i >= 0; i--) {
-    const int rc = scan[i];
-    const qm_val_t wt = 1 << AOM_QM_BITS;
-    const int coeff = coeff_ptr[rc] * wt;
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int prescan_add_val = prescan_add[rc != 0];
-    if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
-      prescan[rc] = 0;
-      non_zero_count--;
-    } else {
-      break;
-    }
+    thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
   }
+  thresh[2] = thresh[3] = thresh[1];
+  __m128i threshold[2];
+  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
 #if SKIP_EOB_FACTOR_ADJUST
   int first = -1;
 #endif
@@ -74,13 +64,15 @@
   qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
   qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
 
-  prescan0 = _mm_loadu_si128((const __m128i *)prescan);
-  prescan1 = _mm_loadu_si128((const __m128i *)(prescan + 8));
+  update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
 
-  cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
   zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
-  cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
 
+  update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+  threshold[0] = threshold[1];
   all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
   if (_mm_movemask_epi8(all_zero) == 0) {
     _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
@@ -121,13 +113,9 @@
 
     store_coefficients(coeff0, dqcoeff_ptr);
     store_coefficients(coeff1, dqcoeff_ptr + 8);
-
-    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
   }
 
   // AC only loop.
-  // TODO(Aniket): Reduce the processing of coeff quatization
-  // based on eob logic
   while (index < n_coeffs) {
     coeff0 = load_coefficients(coeff_ptr + index);
     coeff1 = load_coefficients(coeff_ptr + index + 8);
@@ -137,11 +125,13 @@
     qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
     qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
 
-    prescan0 = _mm_loadu_si128((const __m128i *)(prescan + index));
-    prescan1 = _mm_loadu_si128((const __m128i *)(prescan + index + 8));
+    update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
+                 &mask0);
 
-    cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
-    cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
 
     all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
     if (_mm_movemask_epi8(all_zero) == 0) {
@@ -174,14 +164,27 @@
     store_coefficients(coeff0, dqcoeff_ptr + index);
     store_coefficients(coeff1, dqcoeff_ptr + index + 8);
 
-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
-                        zero);
-    eob = _mm_max_epi16(eob, eob0);
     index += 16;
   }
+  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+  if (is_found1)
+    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
 
-  *eob_ptr = accumulate_eob(eob);
+  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+    const int rc = scan[i];
+    qcoeff_ptr[rc] = 0;
+    dqcoeff_ptr[rc] = 0;
+  }
 
+  for (int i = non_zero_count - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    if (qcoeff_ptr[rc]) {
+      eob = i;
+      break;
+    }
+  }
+
+  *eob_ptr = eob + 1;
 #if SKIP_EOB_FACTOR_ADJUST
   // TODO(Aniket): Experiment the following loop with intrinsic by combining
   // with the quantization loop above
@@ -196,14 +199,14 @@
   if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
     const int rc = scan[(*eob_ptr - 1)];
     if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
-      const qm_val_t wt = (1 << AOM_QM_BITS);
       const int coeff = coeff_ptr[rc] * wt;
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
       const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
       const int prescan_add_val =
           ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
-      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+      if (abs_coeff <
+          (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
         qcoeff_ptr[rc] = 0;
         dqcoeff_ptr[rc] = 0;
         *eob_ptr = 0;
@@ -220,8 +223,11 @@
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
   int index = 16;
-  int non_zero_count = (int)n_coeffs;
   const int log_scale = 1;
+  int non_zero_count = 0;
+  int non_zero_count_prescan_add_zero = 0;
+  int is_found0 = 0, is_found1 = 0;
+  int eob = -1;
   const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
   const __m128i log_scale_vec = _mm_set1_epi16(log_scale);
@@ -229,34 +235,23 @@
   __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
   __m128i qcoeff0, qcoeff1;
   __m128i cmp_mask0, cmp_mask1;
-  __m128i eob = zero, eob0, prescan0, prescan1, all_zero;
+  __m128i all_zero;
+  __m128i mask0 = zero, mask1 = zero;
+
   const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                          ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
-
   int prescan_add[2];
-  for (int i = 0; i < 2; ++i)
+  int thresh[4];
+  const qm_val_t wt = (1 << AOM_QM_BITS);
+  for (int i = 0; i < 2; ++i) {
     prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
-
-  // max buffer is of size 1024 as this functions calls with
-  // maximum n_coeffs as 1024
-  int16_t prescan[1024];
-  memset(prescan, -1, n_coeffs * sizeof(int16_t));
-
-  // TODO(Aniket): Experiment the following loop with intrinsic
-  for (int i = (int)n_coeffs - 1; i >= 0; i--) {
-    const int rc = scan[i];
-    const qm_val_t wt = 1 << AOM_QM_BITS;
-    const int coeff = coeff_ptr[rc] * wt;
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int prescan_add_val = prescan_add[rc != 0];
-    if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
-      prescan[rc] = 0;
-      non_zero_count--;
-    } else {
-      break;
-    }
+    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
   }
+  thresh[2] = thresh[3] = thresh[1];
+  __m128i threshold[2];
+  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
 #if SKIP_EOB_FACTOR_ADJUST
   int first = -1;
 #endif
@@ -273,6 +268,7 @@
   zbin = _mm_srli_epi16(zbin, log_scale);
   round = _mm_srli_epi16(round, log_scale);
   zbin = _mm_sub_epi16(zbin, one);
+
   // Do DC and first 15 AC.
   coeff0 = load_coefficients(coeff_ptr);
   coeff1 = load_coefficients(coeff_ptr + 8);
@@ -282,13 +278,15 @@
   qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
   qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
 
-  prescan0 = _mm_loadu_si128((const __m128i *)prescan);
-  prescan1 = _mm_loadu_si128((const __m128i *)(prescan + 8));
+  update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
 
-  cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
   zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
-  cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
 
+  update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+  threshold[0] = threshold[1];
   all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
   if (_mm_movemask_epi8(all_zero) == 0) {
     _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
@@ -305,11 +303,9 @@
     dequant = _mm_unpackhi_epi64(dequant, dequant);
   } else {
     calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
-
     round = _mm_unpackhi_epi64(round, round);
     quant = _mm_unpackhi_epi64(quant, quant);
     shift = _mm_unpackhi_epi64(shift, shift);
-
     calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
 
     // Reinsert signs
@@ -328,14 +324,9 @@
     dequant = _mm_unpackhi_epi64(dequant, dequant);
     calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
                                           dqcoeff_ptr + 8, &log_scale);
-
-    eob =
-        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
   }
 
   // AC only loop.
-  // TODO(Aniket): Reduce the processing of coeff quatization
-  // based on eob logic
   while (index < n_coeffs) {
     coeff0 = load_coefficients(coeff_ptr + index);
     coeff1 = load_coefficients(coeff_ptr + index + 8);
@@ -345,11 +336,13 @@
     qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
     qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
 
-    prescan0 = _mm_loadu_si128((const __m128i *)(prescan + index));
-    prescan1 = _mm_loadu_si128((const __m128i *)(prescan + index + 8));
+    update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
+                 &mask0);
 
-    cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
-    cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
 
     all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
     if (_mm_movemask_epi8(all_zero) == 0) {
@@ -380,15 +373,27 @@
                                           dqcoeff_ptr + index, &log_scale);
     calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
                                           dqcoeff_ptr + index + 8, &log_scale);
-
-    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
-                        zero);
-    eob = _mm_max_epi16(eob, eob0);
     index += 16;
   }
+  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+  if (is_found1)
+    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
 
-  *eob_ptr = accumulate_eob(eob);
+  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+    const int rc = scan[i];
+    qcoeff_ptr[rc] = 0;
+    dqcoeff_ptr[rc] = 0;
+  }
 
+  for (int i = non_zero_count - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    if (qcoeff_ptr[rc]) {
+      eob = i;
+      break;
+    }
+  }
+
+  *eob_ptr = eob + 1;
 #if SKIP_EOB_FACTOR_ADJUST
   // TODO(Aniket): Experiment the following loop with intrinsic by combining
   // with the quantization loop above
@@ -403,7 +408,6 @@
   if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
     const int rc = scan[(*eob_ptr - 1)];
     if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
-      const qm_val_t wt = (1 << AOM_QM_BITS);
       const int coeff = coeff_ptr[rc] * wt;
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
diff --git a/aom_dsp/x86/quantize_x86.h b/aom_dsp/x86/quantize_x86.h
index b2de01b..5b040a2 100644
--- a/aom_dsp/x86/quantize_x86.h
+++ b/aom_dsp/x86/quantize_x86.h
@@ -143,3 +143,60 @@
   _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
   _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
 }
+
+static INLINE void update_mask1(__m128i *cmp_mask0, __m128i *cmp_mask1,
+                                const int16_t *iscan_ptr, int *is_found,
+                                __m128i *mask) {
+  __m128i all_zero;
+  __m128i temp_mask = _mm_setzero_si128();
+  all_zero = _mm_or_si128(*cmp_mask0, *cmp_mask1);
+  if (_mm_movemask_epi8(all_zero)) {
+    __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
+    __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0);
+    __m128i iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8));
+    __m128i mask1 = _mm_and_si128(*cmp_mask1, iscan1);
+    temp_mask = _mm_max_epi16(mask0, mask1);
+    *is_found = 1;
+  }
+  *mask = _mm_max_epi16(temp_mask, *mask);
+}
+
+static INLINE void update_mask0(__m128i *qcoeff0, __m128i *qcoeff1,
+                                __m128i *threshold, const int16_t *iscan_ptr,
+                                int *is_found, __m128i *mask) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i coeff[4], cmp_mask0, cmp_mask1, cmp_mask2, cmp_mask3;
+
+  coeff[0] = _mm_unpacklo_epi16(*qcoeff0, zero);
+  coeff[1] = _mm_unpackhi_epi16(*qcoeff0, zero);
+  coeff[2] = _mm_unpacklo_epi16(*qcoeff1, zero);
+  coeff[3] = _mm_unpackhi_epi16(*qcoeff1, zero);
+
+  coeff[0] = _mm_slli_epi32(coeff[0], AOM_QM_BITS);
+  cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]);
+  coeff[1] = _mm_slli_epi32(coeff[1], AOM_QM_BITS);
+  cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]);
+  coeff[2] = _mm_slli_epi32(coeff[2], AOM_QM_BITS);
+  cmp_mask2 = _mm_cmpgt_epi32(coeff[2], threshold[1]);
+  coeff[3] = _mm_slli_epi32(coeff[3], AOM_QM_BITS);
+  cmp_mask3 = _mm_cmpgt_epi32(coeff[3], threshold[1]);
+
+  cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+  cmp_mask1 = _mm_packs_epi32(cmp_mask2, cmp_mask3);
+
+  update_mask1(&cmp_mask0, &cmp_mask1, iscan_ptr, is_found, mask);
+}
+
+static INLINE int calculate_non_zero_count(__m128i mask) {
+  __m128i mask0, mask1;
+  int non_zero_count = 0;
+  mask0 = _mm_unpackhi_epi64(mask, mask);
+  mask1 = _mm_max_epi16(mask0, mask);
+  mask0 = _mm_shuffle_epi32(mask1, 1);
+  mask0 = _mm_max_epi16(mask0, mask1);
+  mask1 = _mm_srli_epi32(mask0, 16);
+  mask0 = _mm_max_epi16(mask0, mask1);
+  non_zero_count = _mm_extract_epi16(mask0, 0) + 1;
+
+  return non_zero_count;
+}