Add improvements to the b quantizer

These are controlled by a macro because they do not currently have
SIMD.

Lowres Compression Gains:
Full Trellis Optimization: 0.02% PSNR Gain
Partial Trellis (disabled in tx search): 0.38% PSNR Gain
Trellis Fully Disabled: 0.68% PSNR Gain

Change-Id: I5a42e6ff9fcca3dc31691c14deb1f0599a0ff894
diff --git a/aom_dsp/quantize.c b/aom_dsp/quantize.c
index 62dbd86..5a28698 100644
--- a/aom_dsp/quantize.c
+++ b/aom_dsp/quantize.c
@@ -12,6 +12,67 @@
 #include "aom_dsp/quantize.h"
 #include "aom_mem/aom_mem.h"
 
+void quantize_b_adaptive_helper_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, const int log_scale) {
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  // Pre-scan pass
+  for (i = (int)n_coeffs - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+    const int coeff = coeff_ptr[rc] * wt;
+
+    int prescan_add = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * 325, 7);
+    if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add) &&
+        coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add))
+      non_zero_count--;
+    else
+      break;
+  }
+
+  // Quantization pass: All coefficients with index >= zero_flag are
+  // skippable. Note: zero_flag can be zero.
+  for (i = 0; i < non_zero_count; i++) {
+    const int rc = scan[i];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    int tmp32;
+
+    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+    if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+      int64_t tmp =
+          clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
+                INT16_MIN, INT16_MAX);
+      tmp *= wt;
+      tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+                     quant_shift_ptr[rc != 0]) >>
+                    (16 - log_scale + AOM_QM_BITS));  // quantization
+      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+      const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+      const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+      dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+
+      if (tmp32) eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
 void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *zbin_ptr, const int16_t *round_ptr,
                          const int16_t *quant_ptr,
@@ -74,6 +135,64 @@
   *eob_ptr = eob + 1;
 }
 
+void highbd_quantize_b_adaptive_helper_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, const int log_scale) {
+  int i, eob = -1;
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  int dequant;
+  int idx_arr[4096];
+  (void)iscan;
+  int idx = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  // Pre-scan pass
+  for (i = 0; i < n_coeffs; i++) {
+    const int rc = scan[i];
+    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+    const int coeff = coeff_ptr[rc] * wt;
+
+    // If the coefficient is out of the base ZBIN range, keep it for
+    // quantization.
+    int prescan_add = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * 325, 7);
+    if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add) &&
+        coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add))
+      idx_arr[idx++] = i;
+  }
+
+  // Quantization pass: only process the coefficients selected in
+  // pre-scan pass. Note: idx can be zero.
+  for (i = 0; i < idx; i++) {
+    const int rc = scan[idx_arr[i]];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+    const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp1 =
+        abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+    const int64_t tmpw = tmp1 * wt;
+    const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+    const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
+                                 (16 - log_scale + AOM_QM_BITS));
+    qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dequant =
+        (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+    const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+    dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+    if (abs_qcoeff) eob = idx_arr[i];
+  }
+  *eob_ptr = eob + 1;
+}
+
 void highbd_quantize_b_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -139,9 +258,17 @@
                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                       const int16_t *dequant_ptr, uint16_t *eob_ptr,
                       const int16_t *scan, const int16_t *iscan) {
+#if ADAPTIVE_QUANT_B
+  quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                               quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                               dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                               NULL, NULL, 0);
+
+#else
   quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
                       quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
                       eob_ptr, scan, iscan, NULL, NULL, 0);
+#endif
 }
 
 void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -151,9 +278,16 @@
                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
                             const int16_t *scan, const int16_t *iscan) {
+#if ADAPTIVE_QUANT_B
+  quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                               quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                               dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                               NULL, NULL, 1);
+#else
   quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
                       quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
                       eob_ptr, scan, iscan, NULL, NULL, 1);
+#endif
 }
 
 void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -163,9 +297,16 @@
                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
                             const int16_t *scan, const int16_t *iscan) {
+#if ADAPTIVE_QUANT_B
+  quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                               quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                               dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                               NULL, NULL, 2);
+#else
   quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
                       quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
                       eob_ptr, scan, iscan, NULL, NULL, 2);
+#endif
 }
 
 void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -175,10 +316,17 @@
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
                              const int16_t *scan, const int16_t *iscan) {
+#if ADAPTIVE_QUANT_B
+  highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                      quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                      dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                      iscan, NULL, NULL, 0);
+#else
   highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                              quant_ptr, quant_shift_ptr, qcoeff_ptr,
                              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
                              NULL, NULL, 0);
+#endif
 }
 
 void aom_highbd_quantize_b_32x32_c(
@@ -187,10 +335,17 @@
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
+#if ADAPTIVE_QUANT_B
+  highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                      quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                      dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                      iscan, NULL, NULL, 1);
+#else
   highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                              quant_ptr, quant_shift_ptr, qcoeff_ptr,
                              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
                              NULL, NULL, 1);
+#endif
 }
 
 void aom_highbd_quantize_b_64x64_c(
@@ -199,8 +354,15 @@
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
+#if ADAPTIVE_QUANT_B
+  highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                      quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                      dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                      iscan, NULL, NULL, 2);
+#else
   highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                              quant_ptr, quant_shift_ptr, qcoeff_ptr,
                              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
                              NULL, NULL, 2);
+#endif
 }
diff --git a/aom_dsp/quantize.h b/aom_dsp/quantize.h
index c55ab23..df7f306 100644
--- a/aom_dsp/quantize.h
+++ b/aom_dsp/quantize.h
@@ -16,6 +16,8 @@
 
 #include "aom_dsp/aom_dsp_common.h"
 
+#define ADAPTIVE_QUANT_B 0
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c
index 21ab4db..9e48358 100644
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c
@@ -281,6 +281,31 @@
                         dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
                         sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
   } else {
+#if ADAPTIVE_QUANT_B
+    // TODO(sarahparker) These quantize_b optimizations need SIMD
+    // implementations
+    switch (qparam->log_scale) {
+      case 0:
+        aom_quantize_b_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                         p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                         dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                         sc->iscan);
+        break;
+      case 1:
+        aom_quantize_b_32x32_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                               p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                               dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                               sc->iscan);
+        break;
+      case 2:
+        aom_quantize_b_64x64_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                               p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                               dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                               sc->iscan);
+        break;
+      default: assert(0);
+    }
+#else
     switch (qparam->log_scale) {
       case 0:
         aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
@@ -302,6 +327,7 @@
         break;
       default: assert(0);
     }
+#endif
   }
 }