Support 64x64 quantizer functions

Also includes some refactoring and cleanups.

Change-Id: I2c2528c434a1e9e9b898251fa69489d884463929
diff --git a/aom_dsp/quantize.c b/aom_dsp/quantize.c
index 1b9bbdc..f7870ca 100644
--- a/aom_dsp/quantize.c
+++ b/aom_dsp/quantize.c
@@ -99,6 +99,38 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_TX64X64
+void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr,
+                           const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
+  const int n_coeffs = 1024;
+  const int rc = 0;
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int64_t tmp, eob = -1;
+  int32_t tmp32;
+  int dequant;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2),
+                INT16_MIN, INT16_MAX);
+    tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (14 + AOM_QM_BITS));
+    qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+    dequant =
+        (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+    dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4;
+    if (tmp32) eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif  // CONFIG_TX64X64
+
 #if CONFIG_AOM_HIGHBITDEPTH
 void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                                   const int16_t *round_ptr, const int16_t quant,
@@ -129,6 +161,38 @@
   }
   *eob_ptr = eob + 1;
 }
+
+#if CONFIG_TX64X64
+void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+                                  const int16_t *round_ptr, const int16_t quant,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr, uint16_t *eob_ptr,
+                                  const qm_val_t *qm_ptr,
+                                  const qm_val_t *iqm_ptr) {
+  const int n_coeffs = 1024;
+  int eob = -1;
+  int dequant;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 2);
+    const uint32_t abs_qcoeff =
+        (uint32_t)((tmp * qm_ptr[0] * quant) >> (14 + AOM_QM_BITS));
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dequant =
+        (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+    dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 4;
+    if (abs_qcoeff) eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif  // CONFIG_TX64X64
 #endif
 
 void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -316,6 +380,72 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_TX64X64
+void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t *zbin_ptr,
+                            const int16_t *round_ptr, const int16_t *quant_ptr,
+                            const int16_t *quant_shift_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                            const int16_t *scan, const int16_t *iscan,
+                            const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], 2) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+  int idx = 0;
+  int idx_arr[4096];
+  int i, eob = -1;
+  int dequant;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const qm_val_t wt = qm_ptr[rc];
+      const int coeff = coeff_ptr[rc] * wt;
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) ||
+          coeff <= (nzbins[rc != 0] << AOM_QM_BITS))
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const qm_val_t wt = qm_ptr[rc];
+      int64_t tmp;
+      int tmp32;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+      tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+      tmp = tmp * wt;
+      tmp32 = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+               quant_shift_ptr[rc != 0]) >>
+              (14 + AOM_QM_BITS);
+
+      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+      dequant =
+          (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+      dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4;
+
+      if (tmp32) eob = idx_arr[i];
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif  // CONFIG_TX64X64
+
 #if CONFIG_AOM_HIGHBITDEPTH
 void aom_highbd_quantize_b_32x32_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
@@ -375,8 +505,71 @@
   }
   *eob_ptr = eob + 1;
 }
-#endif
+
+#if CONFIG_TX64X64
+void aom_highbd_quantize_b_64x64_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr) {
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], 2) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+  int idx = 0;
+  int idx_arr[4096];
+  int i, eob = -1;
+  int dequant;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const qm_val_t wt = qm_ptr[rc];
+      const int coeff = coeff_ptr[rc] * wt;
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) ||
+          coeff <= (nzbins[rc != 0] << AOM_QM_BITS))
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const qm_val_t wt = qm_ptr[rc];
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp1 =
+          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+      const int64_t tmpw = tmp1 * wt;
+      const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (14 + AOM_QM_BITS));
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dequant =
+          (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+      dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4;
+      if (abs_qcoeff) eob = idx_arr[i];
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif  // CONFIG_TX64X64
+#endif  // CONFIG_AOM_HIGHBITDEPTH
+
 #else
+
 void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
                      const int16_t *round_ptr, const int16_t quant,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
@@ -450,6 +643,33 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_TX64X64
+void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  const int n_coeffs = 4096;
+  const int rc = 0;
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int tmp, eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2),
+                INT16_MIN, INT16_MAX);
+    tmp = (tmp * quant) >> 14;
+    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 4;
+    if (tmp) eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif  // CONFIG_TX64X64
+
 #if CONFIG_AOM_HIGHBITDEPTH
 void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                                   const int16_t *round_ptr, const int16_t quant,
@@ -475,6 +695,33 @@
   }
   *eob_ptr = eob + 1;
 }
+
+#if CONFIG_TX64X64
+void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+                                  const int16_t *round_ptr, const int16_t quant,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr,
+                                  uint16_t *eob_ptr) {
+  const int n_coeffs = 4096;
+  int eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 2);
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 14);
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 4;
+    if (abs_qcoeff) eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif  // CONFIG_TX64X64
 #endif
 
 void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -632,6 +879,62 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_TX64X64
+void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t *zbin_ptr,
+                            const int16_t *round_ptr, const int16_t *quant_ptr,
+                            const int16_t *quant_shift_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                            const int16_t *scan, const int16_t *iscan) {
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], 2) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+  int idx = 0;
+  int idx_arr[4096];
+  int i, eob = -1;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      int tmp;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+      tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
+             quant_shift_ptr[rc != 0]) >>
+            14;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4;
+
+      if (tmp) eob = idx_arr[i];
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif  // CONFIG_TX64X64
+
 #if CONFIG_AOM_HIGHBITDEPTH
 void aom_highbd_quantize_b_32x32_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
@@ -682,5 +985,57 @@
   }
   *eob_ptr = eob + 1;
 }
+
+#if CONFIG_TX64X64
+void aom_highbd_quantize_b_64x64_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], 2) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+  int idx = 0;
+  int idx_arr[4096];
+  int i, eob = -1;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp1 =
+          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14);
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4;
+      if (abs_qcoeff) eob = idx_arr[i];
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif  // CONFIG_TX64X64
 #endif
 #endif