Support 64x64 quantizer functions. Also includes some refactoring and cleanups. Change-Id: I2c2528c434a1e9e9b898251fa69489d884463929

commit: 0e11912ae108e8f77eb3b063f3fd5896749fb161 [log] [tgz]
author: Debargha Mukherjee <debargha@google.com> Fri Nov 04 12:10:23 2016 -0700
committer: Debargha Mukherjee <debargha@google.com> Wed Nov 09 21:59:14 2016 +0000
tree: 44f19045dc74e7ab1befbbf287e6b79e9cb4cfd1
parent: 23b120db225b4c973d757e480475a4e9b7f1272c [diff]
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 94e2587..7b2b5fa 100644
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl

@@ -1027,13 +1027,23 @@
 if (aom_config("CONFIG_AOM_QM") eq "yes") {
   if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+    specialize qw/aom_quantize_b/;
 
     add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+    specialize qw/aom_quantize_b_32x32/;
+
+    add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+    specialize qw/aom_quantize_b_64x64/;
 
     if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
       add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+      specialize qw/aom_highbd_quantize_b/;
 
       add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+      specialize qw/aom_highbd_quantize_b_32x32/;
+
+      add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+      specialize qw/aom_highbd_quantize_b_64x64/;
     }  # CONFIG_AOM_HIGHBITDEPTH
   }  # CONFIG_AV1_ENCODER
 } else {
@@ -1044,12 +1054,18 @@
     add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
     specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
 
+    add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/aom_quantize_b_64x64/;
+
     if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
       add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
       specialize qw/aom_highbd_quantize_b sse2/;
 
       add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
       specialize qw/aom_highbd_quantize_b_32x32 sse2/;
+
+      add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+      specialize qw/aom_highbd_quantize_b_64x64/;
     }  # CONFIG_AOM_HIGHBITDEPTH
   }  # CONFIG_AV1_ENCODER
 } # CONFIG_AOM_QM

diff --git a/aom_dsp/quantize.c b/aom_dsp/quantize.c
index 1b9bbdc..f7870ca 100644
--- a/aom_dsp/quantize.c
+++ b/aom_dsp/quantize.c

@@ -99,6 +99,38 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_TX64X64
+void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr,
+                           const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
+  const int n_coeffs = 4096;  // 64 * 64: clear the whole block, not 32 * 32
+  const int rc = 0;           // DC-only quantizer: only position 0 is coded
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);  // sign mask; assumes arithmetic shift
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int64_t tmp, eob = -1;  // eob stays -1 unless the DC level is nonzero
+  int32_t tmp32;
+  int dequant;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {  // when skipping, outputs stay zero and *eob_ptr is 0
+    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2),
+                INT16_MIN, INT16_MAX);
+    tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (14 + AOM_QM_BITS));
+    qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;  // re-apply the sign
+    dequant =  // dequantizer weighted by iqm_ptr[0], rounded to nearest
+        (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+    dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4;
+    if (tmp32) eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 1 if the DC level is nonzero, otherwise 0
+}
+#endif  // CONFIG_TX64X64
+
 #if CONFIG_AOM_HIGHBITDEPTH
 void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                                   const int16_t *round_ptr, const int16_t quant,
@@ -129,6 +161,38 @@
   }
   *eob_ptr = eob + 1;
 }
+
+#if CONFIG_TX64X64
+void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+                                  const int16_t *round_ptr, const int16_t quant,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr, uint16_t *eob_ptr,
+                                  const qm_val_t *qm_ptr,
+                                  const qm_val_t *iqm_ptr) {
+  const int n_coeffs = 4096;  // 64 * 64: clear the whole block, not 32 * 32
+  int eob = -1;               // stays -1 unless the DC level is nonzero
+  int dequant;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {  // when skipping, outputs stay zero and *eob_ptr is 0
+    const int coeff = coeff_ptr[0];  // only the DC coefficient is quantized
+    const int coeff_sign = (coeff >> 31);  // sign mask; assumes arithmetic shift
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 2);
+    const uint32_t abs_qcoeff =  // no int16 clamp here, unlike the 8-bit path
+        (uint32_t)((tmp * qm_ptr[0] * quant) >> (14 + AOM_QM_BITS));
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dequant =  // dequantizer weighted by iqm_ptr[0], rounded to nearest
+        (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+    dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 4;
+    if (abs_qcoeff) eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 1 if the DC level is nonzero, otherwise 0
+}
+#endif  // CONFIG_TX64X64
 #endif
 
 void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -316,6 +380,72 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_TX64X64
+void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t *zbin_ptr,
+                            const int16_t *round_ptr, const int16_t *quant_ptr,
+                            const int16_t *quant_shift_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                            const int16_t *scan, const int16_t *iscan,
+                            const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], 2) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+  int idx = 0;        // number of entries stored in idx_arr
+  int idx_arr[4096];  // kept scan positions; assumes n_coeffs <= 4096 (TODO)
+  int i, eob = -1;    // eob: last scan position with a nonzero level
+  int dequant;
+  (void)iscan;  // accepted for signature parity; not used by this C version
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: index 0 of each table is for DC, index 1 for the rest.
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const qm_val_t wt = qm_ptr[rc];
+      const int coeff = coeff_ptr[rc] * wt;
+
+      // Keep coefficients outside the base ZBIN range. The bound is scaled by
+      // multiplication: left-shifting the negative bound is undefined in C.
+      if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) ||
+          coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask; arithmetic shift
+      const qm_val_t wt = qm_ptr[rc];
+      int64_t tmp;
+      int tmp32;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+      tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+      tmp = tmp * wt;  // apply the quantization-matrix weight
+      tmp32 = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+               quant_shift_ptr[rc != 0]) >>
+              (14 + AOM_QM_BITS);
+
+      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;  // re-apply sign
+      dequant =  // dequantizer weighted by iqm_ptr[rc], rounded to nearest
+          (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+      dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4;
+
+      if (tmp32) eob = idx_arr[i];  // positions ascend, so the last one wins
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last nonzero scan position; 0 if none
+}
+#endif  // CONFIG_TX64X64
+
 #if CONFIG_AOM_HIGHBITDEPTH
 void aom_highbd_quantize_b_32x32_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
@@ -375,8 +505,71 @@
   }
   *eob_ptr = eob + 1;
 }
-#endif
+
+#if CONFIG_TX64X64
+void aom_highbd_quantize_b_64x64_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr) {
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], 2) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+  int idx = 0;        // number of entries stored in idx_arr
+  int idx_arr[4096];  // kept scan positions; assumes n_coeffs <= 4096 (TODO)
+  int i, eob = -1;    // eob: last scan position with a nonzero level
+  int dequant;
+  (void)iscan;  // accepted for signature parity; not used by this C version
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: index 0 of each table is for DC, index 1 for the rest.
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const qm_val_t wt = qm_ptr[rc];
+      const int coeff = coeff_ptr[rc] * wt;
+
+      // Keep coefficients outside the base ZBIN range. The bound is scaled by
+      // multiplication: left-shifting the negative bound is undefined in C.
+      if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) ||
+          coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask; arithmetic shift
+      const qm_val_t wt = qm_ptr[rc];
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp1 =  // no int16 clamp here, unlike the 8-bit path
+          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+      const int64_t tmpw = tmp1 * wt;  // apply the quantization-matrix weight
+      const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (14 + AOM_QM_BITS));
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dequant =  // dequantizer weighted by iqm_ptr[rc], rounded to nearest
+          (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+      dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4;
+      if (abs_qcoeff) eob = idx_arr[i];  // positions ascend; last one wins
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last nonzero scan position; 0 if none
+}
+#endif  // CONFIG_TX64X64
+#endif  // CONFIG_AOM_HIGHBITDEPTH
+
 #else
+
 void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
                      const int16_t *round_ptr, const int16_t quant,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
@@ -450,6 +643,33 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_TX64X64
+void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  const int n_coeffs = 4096;  // 64 * 64: both output buffers are fully cleared
+  const int rc = 0;           // DC-only quantizer: only position 0 is coded
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);  // sign mask; assumes arithmetic shift
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int tmp, eob = -1;  // eob stays -1 unless the DC level is nonzero
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {  // when skipping, outputs stay zero and *eob_ptr is 0
+    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2),
+                INT16_MIN, INT16_MAX);
+    tmp = (tmp * quant) >> 14;
+    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;  // re-apply the sign
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 4;
+    if (tmp) eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 1 if the DC level is nonzero, otherwise 0
+}
+#endif  // CONFIG_TX64X64
+
 #if CONFIG_AOM_HIGHBITDEPTH
 void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                                   const int16_t *round_ptr, const int16_t quant,
@@ -475,6 +695,33 @@
   }
   *eob_ptr = eob + 1;
 }
+
+#if CONFIG_TX64X64
+void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+                                  const int16_t *round_ptr, const int16_t quant,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr,
+                                  uint16_t *eob_ptr) {
+  const int n_coeffs = 4096;  // 64 * 64: both output buffers are fully cleared
+  int eob = -1;               // stays -1 unless the DC level is nonzero
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {  // when skipping, outputs stay zero and *eob_ptr is 0
+    const int coeff = coeff_ptr[0];  // only the DC coefficient is quantized
+    const int coeff_sign = (coeff >> 31);  // sign mask; assumes arithmetic shift
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 2);
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 14);
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 4;
+    if (abs_qcoeff) eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 1 if the DC level is nonzero, otherwise 0
+}
+#endif  // CONFIG_TX64X64
 #endif
 
 void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -632,6 +879,62 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_TX64X64
+void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t *zbin_ptr,
+                            const int16_t *round_ptr, const int16_t *quant_ptr,
+                            const int16_t *quant_shift_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                            const int16_t *scan, const int16_t *iscan) {
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], 2) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+  int idx = 0;        // number of entries stored in idx_arr
+  int idx_arr[4096];  // kept scan positions; assumes n_coeffs <= 4096 (TODO)
+  int i, eob = -1;    // eob: last scan position with a nonzero level
+  (void)iscan;  // accepted for signature parity; not used by this C version
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: index 0 of each table is for DC, index 1 for the rest.
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask; arithmetic shift
+      int tmp;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+      tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
+             quant_shift_ptr[rc != 0]) >>
+            14;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;  // re-apply the sign
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4;
+
+      if (tmp) eob = idx_arr[i];  // positions ascend, so the last one wins
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last nonzero scan position; 0 if none
+}
+#endif  // CONFIG_TX64X64
+
 #if CONFIG_AOM_HIGHBITDEPTH
 void aom_highbd_quantize_b_32x32_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
@@ -682,5 +985,57 @@
   }
   *eob_ptr = eob + 1;
 }
+
+#if CONFIG_TX64X64
+void aom_highbd_quantize_b_64x64_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], 2) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+  int idx = 0;        // number of entries stored in idx_arr
+  int idx_arr[4096];  // kept scan positions; assumes n_coeffs <= 4096 (TODO)
+  int i, eob = -1;    // eob: last scan position with a nonzero level
+  (void)iscan;  // accepted for signature parity; not used by this C version
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: index 0 of each table is for DC, index 1 for the rest.
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask; arithmetic shift
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp1 =  // no int16 clamp here, unlike the 8-bit path
+          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14);
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4;
+      if (abs_qcoeff) eob = idx_arr[i];  // positions ascend; last one wins
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last nonzero scan position; 0 if none
+}
+#endif  // CONFIG_TX64X64
 #endif
 #endif

diff --git a/aom_dsp/quantize.h b/aom_dsp/quantize.h
index 45ed678..67e3b5e 100644
--- a/aom_dsp/quantize.h
+++ b/aom_dsp/quantize.h

@@ -30,6 +30,13 @@
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            const int16_t dequant_ptr, uint16_t *eob_ptr,
                            const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
+#if CONFIG_TX64X64
+void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr,
+                           const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
+#endif  // CONFIG_TX64X64
 void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                       int skip_block, const int16_t *zbin_ptr,
                       const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -50,6 +57,13 @@
     const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
     const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr,
     const qm_val_t *iqm_ptr);
+#if CONFIG_TX64X64
+void aom_highbd_quantize_dc_64x64(
+    const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr,
+    const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+    const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr);
+#endif  // CONFIG_TX64X64
 void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              int skip_block, const int16_t *zbin_ptr,
                              const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -58,8 +72,10 @@
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
                              const int16_t *scan, const int16_t *iscan,
                              const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
-#endif
-#else
+#endif  // CONFIG_AOM_HIGHBITDEPTH
+
+#else  // CONFIG_AOM_QM
+
 void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
                      const int16_t *round_ptr, const int16_t quant_ptr,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
@@ -68,7 +84,12 @@
                            const int16_t *round_ptr, const int16_t quant_ptr,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            const int16_t dequant_ptr, uint16_t *eob_ptr);
-
+#if CONFIG_TX64X64
+void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr);
+#endif  // CONFIG_TX64X64
 #if CONFIG_AOM_HIGHBITDEPTH
 void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                             int skip_block, const int16_t *round_ptr,
@@ -81,8 +102,16 @@
                                   tran_low_t *qcoeff_ptr,
                                   tran_low_t *dqcoeff_ptr,
                                   const int16_t dequant_ptr, uint16_t *eob_ptr);
-#endif
-#endif
+#if CONFIG_TX64X64
+void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
+                                  const int16_t *round_ptr,
+                                  const int16_t quant_ptr,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr, uint16_t *eob_ptr);
+#endif  // CONFIG_TX64X64
+#endif  // CONFIG_AOM_HIGHBITDEPTH
+#endif  // CONFIG_AOM_QM
 
 #ifdef __cplusplus
 }  // extern "C"

diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index f96dcf2..fce07f9 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl

@@ -206,6 +206,14 @@
 
   add_proto qw/void quantize_32x32_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
   specialize qw/quantize_32x32_fp_nuq/;
+
+  if (aom_config("CONFIG_TX64X64") eq "yes") {
+    add_proto qw/void quantize_64x64_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+    specialize qw/quantize_64x64_nuq/;
+
+    add_proto qw/void quantize_64x64_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+    specialize qw/quantize_64x64_fp_nuq/;
+  }
 }
 
 # FILTER_INTRA predictor functions
@@ -332,8 +340,15 @@
     specialize qw/av1_block_error/;
 
     add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+    specialize qw/av1_quantize_fp/;
 
     add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+    specialize qw/av1_quantize_fp_32x32/;
+
+    if (aom_config("CONFIG_TX64X64") eq "yes") {
+      add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+      specialize qw/av1_quantize_fp_64x64/;
+    }
 
     add_proto qw/void av1_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
     specialize qw/av1_fdct8x8_quant/;
@@ -345,10 +360,18 @@
     specialize qw/av1_block_error_fp neon/, "$sse2_x86inc";
 
     add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+    specialize qw/av1_quantize_fp/;
 
     add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+    specialize qw/av1_quantize_fp_32x32/;
+
+    if (aom_config("CONFIG_TX64X64") eq "yes") {
+      add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+      specialize qw/av1_quantize_fp_64x64/;
+    }
 
     add_proto qw/void av1_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+    specialize qw/av1_fdct8x8_quant/;
   }
 } else {
   if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
@@ -363,6 +386,11 @@
     add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
     specialize qw/av1_quantize_fp_32x32/;
 
+    if (aom_config("CONFIG_TX64X64") eq "yes") {
+      add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+      specialize qw/av1_quantize_fp_64x64/;
+    }
+
     add_proto qw/void av1_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
     specialize qw/av1_fdct8x8_quant/;
   } else {
@@ -378,6 +406,11 @@
     add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
     specialize qw/av1_quantize_fp_32x32/, "$ssse3_x86_64";
 
+    if (aom_config("CONFIG_TX64X64") eq "yes") {  # the 64x64 prototype is emitted only when CONFIG_TX64X64 is "yes"
+      add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";  # same parameter list as av1_quantize_fp_32x32 just above
+      specialize qw/av1_quantize_fp_64x64/;  # unlike the 32x32 entry above (which names "$ssse3_x86_64"), no ISA suffix is listed here
+    }
+
     add_proto qw/void av1_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
     specialize qw/av1_fdct8x8_quant sse2 ssse3 neon/;
   }
@@ -478,6 +511,14 @@
 add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
 specialize qw/av1_temporal_filter_apply sse2 msa/;
 
+if (aom_config("CONFIG_AOM_QM") eq "yes") {  # with CONFIG_AOM_QM the prototype additionally carries qm_ptr and iqm_ptr
+  add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+  specialize qw/av1_quantize_b/;  # no ISA suffixes are listed for this function
+} else {  # without CONFIG_AOM_QM: identical prototype minus the two qm_val_t pointers
+  add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+  specialize qw/av1_quantize_b/;  # no ISA suffixes are listed for this function
+}  # both variants end with an "int log_scale" parameter
+
 if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
 
   # ENCODEMB INVOKE
@@ -493,6 +534,14 @@
 
     add_proto qw/void highbd_quantize_32x32_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
     specialize qw/highbd_quantize_32x32_fp_nuq/;
+
+    if (aom_config("CONFIG_TX64X64") eq "yes") {  # both 64x64 NUQ prototypes are emitted only when CONFIG_TX64X64 is "yes"
+      add_proto qw/void highbd_quantize_64x64_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+      specialize qw/highbd_quantize_64x64_nuq/;  # no ISA suffixes are listed for this function
+
+      add_proto qw/void highbd_quantize_64x64_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";  # differs from the non-fp prototype only by omitting quant_shift_ptr
+      specialize qw/highbd_quantize_64x64_fp_nuq/;  # no ISA suffixes are listed for this function
+    }
   }
 
   add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
@@ -505,6 +554,11 @@
     add_proto qw/void av1_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
     specialize qw/av1_highbd_quantize_fp_32x32/;
 
+    if (aom_config("CONFIG_TX64X64") eq "yes") {  # the 64x64 prototype is emitted only when CONFIG_TX64X64 is "yes"
+      add_proto qw/void av1_highbd_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";  # same parameter list as av1_highbd_quantize_fp_32x32 just above
+      specialize qw/av1_highbd_quantize_fp_64x64/;  # no ISA suffixes are listed for this function
+    }
+
     add_proto qw/void av1_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
     specialize qw/av1_highbd_quantize_b/;
   } else {

diff --git a/av1/common/common_data.h b/av1/common/common_data.h
index cedab2f..7877106 100644
--- a/av1/common/common_data.h
+++ b/av1/common/common_data.h

@@ -478,7 +478,11 @@
 #if CONFIG_CB4X4
   4,
 #endif
-  16, 64, 256, 1024, 32, 32, 128, 128, 512, 512,
+  16, 64, 256, 1024,
+#if CONFIG_TX64X64
+  4096,
+#endif  // CONFIG_TX64X64
+  32, 32, 128, 128, 512, 512,
 };
 
 static const uint8_t tx_size_1d_log2[TX_SIZES] = { 2, 3, 4, 5 };

diff --git a/av1/common/idct.c b/av1/common/idct.c
index 2663d2d..156fc96 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c

@@ -20,10 +20,7 @@
 #include "av1/common/enums.h"
 #include "av1/common/idct.h"
 
-int get_tx_scale(const MACROBLOCKD *const xd, const TX_TYPE tx_type,
-                 const TX_SIZE tx_size) {
-  (void)tx_type;
-  (void)xd;
+int get_tx_scale(const TX_SIZE tx_size) {
   if (txsize_sqr_up_map[tx_size] == TX_32X32) return 1;
 #if CONFIG_TX64X64
   else if (txsize_sqr_up_map[tx_size] == TX_64X64)

diff --git a/av1/common/idct.h b/av1/common/idct.h
index db9a6e2..8f1eea1 100644
--- a/av1/common/idct.h
+++ b/av1/common/idct.h

@@ -51,8 +51,7 @@
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
 #define MAX_TX_SCALE 1
-int get_tx_scale(const MACROBLOCKD *const xd, const TX_TYPE tx_type,
-                 const TX_SIZE tx_size);
+int get_tx_scale(const TX_SIZE tx_size);
 
 void av1_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                      int eob);

diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index 0f183f2..024006c 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c

@@ -98,6 +98,7 @@
   const uint8_t *cat4_prob;
   const uint8_t *cat5_prob;
   const uint8_t *cat6_prob;
+  (void)tx_type;
 
   if (counts) {
     coef_counts = counts->coef[tx_size_ctx][type][ref];
@@ -138,7 +139,7 @@
   cat6_prob = av1_cat6_prob;
 #endif
 
-  dq_shift = get_tx_scale(xd, tx_type, tx_size);
+  dq_shift = get_tx_scale(tx_size);
 
   while (c < max_eob) {
     int val = -1;

diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index c137760..600acbe 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c

@@ -2000,6 +2000,7 @@
     }
   }
 }
+#endif  // CONFIG_EXT_TX
 
 #if CONFIG_AOM_HIGHBITDEPTH
 void av1_highbd_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
@@ -2014,4 +2015,3 @@
 }
 #endif  // CONFIG_TX64X64
 #endif  // CONFIG_AOM_HIGHBITDEPTH
-#endif  // CONFIG_EXT_TX

diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index a0bf9bf..a0fa37a 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c

@@ -99,7 +99,7 @@
   int seg_id = xd->mi[0]->mbmi.segment_id;
   const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!ref][tx_size];
 #endif
-  const int shift = get_tx_scale(xd, tx_type, tx_size);
+  const int shift = get_tx_scale(tx_size);
 #if CONFIG_NEW_QUANT
   int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type);
   const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq];
@@ -471,7 +471,7 @@
   const int16_t *src_diff;
 
   src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
-  qparam.log_scale = get_tx_scale(xd, tx_type, tx_size);
+  qparam.log_scale = get_tx_scale(tx_size);
 #else
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
@@ -578,6 +578,7 @@
 }
 
 #if CONFIG_NEW_QUANT
+// TODO(debargha, sarah): Unify these functions with the ones above
 void av1_xform_quant_nuq(const AV1_COMMON *cm, MACROBLOCK *x, int plane,
                          int block, int blk_row, int blk_col,
                          BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int ctx) {
@@ -615,36 +616,60 @@
   fwd_txfm_param.bd = xd->bd;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-    if (tx_size == TX_32X32) {
-      highbd_quantize_32x32_nuq(
-          coeff, tx_size_2d[tx_size], x->skip_block, p->quant, p->quant_shift,
-          pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-          (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
-          dqcoeff, eob, scan_order->scan, band);
-    } else {
-      highbd_quantize_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant,
-                          p->quant_shift, pd->dequant,
-                          (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                          (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
-                          qcoeff, dqcoeff, eob, scan_order->scan, band);
+    switch (get_tx_scale(tx_size)) {
+#if CONFIG_TX64X64
+      case 2:
+        highbd_quantize_64x64_nuq(
+            coeff, tx_size_2d[tx_size], x->skip_block, p->quant, p->quant_shift,
+            pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+            (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
+            dqcoeff, eob, scan_order->scan, band);
+        break;
+#endif  // CONFIG_TX64X64
+      case 1:
+        highbd_quantize_32x32_nuq(
+            coeff, tx_size_2d[tx_size], x->skip_block, p->quant, p->quant_shift,
+            pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+            (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
+            dqcoeff, eob, scan_order->scan, band);
+        break;
+      default:
+        highbd_quantize_nuq(
+            coeff, tx_size_2d[tx_size], x->skip_block, p->quant, p->quant_shift,
+            pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+            (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
+            dqcoeff, eob, scan_order->scan, band);
+        break;
     }
     return;
   }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
   fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-  if (tx_size == TX_32X32) {
-    quantize_32x32_nuq(coeff, 1024, x->skip_block, p->quant, p->quant_shift,
-                       pd->dequant,
-                       (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                       (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
-                       qcoeff, dqcoeff, eob, scan_order->scan, band);
-  } else {
-    quantize_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant,
-                 p->quant_shift, pd->dequant,
-                 (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                 (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
-                 dqcoeff, eob, scan_order->scan, band);
+  switch (get_tx_scale(tx_size)) {
+#if CONFIG_TX64X64
+    case 2:
+      quantize_64x64_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
+                         p->quant, p->quant_shift, pd->dequant,
+                         (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+                         (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+                         qcoeff, dqcoeff, eob, scan_order->scan, band);
+      break;
+#endif  // CONFIG_TX64X64
+    case 1:
+      quantize_32x32_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
+                         p->quant, p->quant_shift, pd->dequant,
+                         (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+                         (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+                         qcoeff, dqcoeff, eob, scan_order->scan, band);
+      break;
+    default:
+      quantize_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant,
+                   p->quant_shift, pd->dequant,
+                   (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+                   (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+                   qcoeff, dqcoeff, eob, scan_order->scan, band);
+      break;
   }
 }
 
@@ -685,36 +710,59 @@
   fwd_txfm_param.bd = xd->bd;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-    if (tx_size == TX_32X32) {
-      highbd_quantize_32x32_fp_nuq(
-          coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp, pd->dequant,
-          (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-          (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
-          dqcoeff, eob, scan_order->scan, band);
-    } else {
-      highbd_quantize_fp_nuq(
-          coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp, pd->dequant,
-          (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-          (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
-          dqcoeff, eob, scan_order->scan, band);
+    switch (get_tx_scale(tx_size)) {
+#if CONFIG_TX64X64
+      case 2:
+        highbd_quantize_64x64_fp_nuq(
+            coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp, pd->dequant,
+            (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+            (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
+            dqcoeff, eob, scan_order->scan, band);
+        break;
+#endif  // CONFIG_TX64X64
+      case 1:
+        highbd_quantize_32x32_fp_nuq(
+            coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp, pd->dequant,
+            (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+            (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
+            dqcoeff, eob, scan_order->scan, band);
+        break;
+      default:
+        highbd_quantize_fp_nuq(
+            coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp, pd->dequant,
+            (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+            (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
+            dqcoeff, eob, scan_order->scan, band);
     }
     return;
   }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
   fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-  if (tx_size == TX_32X32) {
-    quantize_32x32_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
-                          p->quant_fp, pd->dequant,
-                          (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                          (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
-                          qcoeff, dqcoeff, eob, scan_order->scan, band);
-  } else {
-    quantize_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp,
-                    pd->dequant,
-                    (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                    (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
-                    qcoeff, dqcoeff, eob, scan_order->scan, band);
+  switch (get_tx_scale(tx_size)) {
+#if CONFIG_TX64X64
+    case 2:
+      quantize_64x64_fp_nuq(
+          coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp, pd->dequant,
+          (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+          (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
+          dqcoeff, eob, scan_order->scan, band);
+      break;
+#endif  // CONFIG_TX64X64
+    case 1:
+      quantize_32x32_fp_nuq(
+          coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp, pd->dequant,
+          (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+          (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
+          dqcoeff, eob, scan_order->scan, band);
+      break;
+    default:
+      quantize_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp,
+                      pd->dequant,
+                      (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+                      (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+                      qcoeff, dqcoeff, eob, scan_order->scan, band);
+      break;
   }
 }
 
@@ -753,31 +801,54 @@
   fwd_txfm_param.bd = xd->bd;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-    if (tx_size == TX_32X32) {
-      highbd_quantize_dc_32x32_nuq(
-          coeff, tx_size_2d[tx_size], x->skip_block, p->quant[0],
-          p->quant_shift[0], pd->dequant[0], p->cuml_bins_nuq[dq][0],
-          pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
-    } else {
-      highbd_quantize_dc_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
-                             p->quant[0], p->quant_shift[0], pd->dequant[0],
-                             p->cuml_bins_nuq[dq][0],
-                             pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
+    switch (get_tx_scale(tx_size)) {
+#if CONFIG_TX64X64
+      case 2:
+        highbd_quantize_dc_64x64_nuq(
+            coeff, tx_size_2d[tx_size], x->skip_block, p->quant[0],
+            p->quant_shift[0], pd->dequant[0], p->cuml_bins_nuq[dq][0],
+            pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
+        break;
+#endif  // CONFIG_TX64X64
+      case 1:
+        highbd_quantize_dc_32x32_nuq(
+            coeff, tx_size_2d[tx_size], x->skip_block, p->quant[0],
+            p->quant_shift[0], pd->dequant[0], p->cuml_bins_nuq[dq][0],
+            pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
+        break;
+      default:
+        highbd_quantize_dc_nuq(
+            coeff, tx_size_2d[tx_size], x->skip_block, p->quant[0],
+            p->quant_shift[0], pd->dequant[0], p->cuml_bins_nuq[dq][0],
+            pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
+        break;
     }
     return;
   }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
   fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-  if (tx_size == TX_32X32) {
-    quantize_dc_32x32_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
-                          p->quant[0], p->quant_shift[0], pd->dequant[0],
-                          p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
-                          qcoeff, dqcoeff, eob);
-  } else {
-    quantize_dc_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant[0],
-                    p->quant_shift[0], pd->dequant[0], p->cuml_bins_nuq[dq][0],
-                    pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
+  switch (get_tx_scale(tx_size)) {
+#if CONFIG_TX64X64
+    case 2:
+      quantize_dc_64x64_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
+                            p->quant[0], p->quant_shift[0], pd->dequant[0],
+                            p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
+                            qcoeff, dqcoeff, eob);
+      break;
+#endif  // CONFIG_TX64X64
+    case 1:
+      quantize_dc_32x32_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
+                            p->quant[0], p->quant_shift[0], pd->dequant[0],
+                            p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
+                            qcoeff, dqcoeff, eob);
+      break;
+    default:
+      quantize_dc_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant[0],
+                      p->quant_shift[0], pd->dequant[0],
+                      p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
+                      qcoeff, dqcoeff, eob);
+      break;
   }
 }
 
@@ -816,31 +887,54 @@
   fwd_txfm_param.bd = xd->bd;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-    if (tx_size == TX_32X32) {
-      highbd_quantize_dc_32x32_fp_nuq(
-          coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp[0],
-          pd->dequant[0], p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
-          qcoeff, dqcoeff, eob);
-    } else {
-      highbd_quantize_dc_fp_nuq(
-          coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp[0],
-          pd->dequant[0], p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
-          qcoeff, dqcoeff, eob);
+    switch (get_tx_scale(tx_size)) {
+#if CONFIG_TX64X64
+      case 2:
+        highbd_quantize_dc_64x64_fp_nuq(
+            coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp[0],
+            pd->dequant[0], p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
+            qcoeff, dqcoeff, eob);
+        break;
+#endif  // CONFIG_TX64X64
+      case 1:
+        highbd_quantize_dc_32x32_fp_nuq(
+            coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp[0],
+            pd->dequant[0], p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
+            qcoeff, dqcoeff, eob);
+        break;
+      default:
+        highbd_quantize_dc_fp_nuq(
+            coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp[0],
+            pd->dequant[0], p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
+            qcoeff, dqcoeff, eob);
+        break;
     }
     return;
   }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
   fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-  if (tx_size == TX_32X32) {
-    quantize_dc_32x32_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
-                             p->quant_fp[0], pd->dequant[0],
-                             p->cuml_bins_nuq[dq][0],
-                             pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
-  } else {
-    quantize_dc_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
-                       p->quant_fp[0], pd->dequant[0], p->cuml_bins_nuq[dq][0],
-                       pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
+  switch (get_tx_scale(tx_size)) {
+#if CONFIG_TX64X64
+    case 2:
+      quantize_dc_64x64_fp_nuq(
+          coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp[0],
+          pd->dequant[0], p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
+          qcoeff, dqcoeff, eob);
+      break;
+#endif  // CONFIG_TX64X64
+    case 1:
+      quantize_dc_32x32_fp_nuq(
+          coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp[0],
+          pd->dequant[0], p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
+          qcoeff, dqcoeff, eob);
+      break;
+    default:
+      quantize_dc_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
+                         p->quant_fp[0], pd->dequant[0],
+                         p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
+                         qcoeff, dqcoeff, eob);
+      break;
   }
 }
 #endif  // CONFIG_NEW_QUANT

diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index a88c884..4ada078 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c

@@ -441,7 +441,7 @@
   (void)bd;
   switch (tx_type) {
     case DCT_DCT:
-      av1_highbd_fht64x64_c(src_diff, coeff, diff_stride, tx_type);
+      av1_highbd_fht64x64(src_diff, coeff, diff_stride, tx_type);
       break;
 #if CONFIG_EXT_TX
     case ADST_DCT:
@@ -458,7 +458,7 @@
     case H_ADST:
     case V_FLIPADST:
     case H_FLIPADST:
-      av1_highbd_fht64x64_c(src_diff, coeff, diff_stride, tx_type);
+      av1_highbd_fht64x64(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type); break;
 #endif  // CONFIG_EXT_TX

diff --git a/av1/encoder/quantize.c b/av1/encoder/quantize.c
index 771f94b..9dc1b13 100644
--- a/av1/encoder/quantize.c
+++ b/av1/encoder/quantize.c

@@ -59,28 +59,28 @@
     const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
     const int16_t dequant, const tran_low_t *cuml_bins_ptr,
     const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, int logsizeby32) {
+    tran_low_t *dqcoeff_ptr, int logsizeby16) {
   const int coeff = coeffv;
   const int coeff_sign = (coeff >> 31);
   const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
   int i, q;
   int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
   for (i = 0; i < NUQ_KNOTS; i++) {
-    if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], 1 + logsizeby32)) {
+    if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) {
       q = i;
       break;
     }
   }
   if (i == NUQ_KNOTS) {
-    tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], 1 + logsizeby32);
+    tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16);
     q = NUQ_KNOTS +
-        (((((tmp * quant) >> 16) + tmp) * quant_shift) >> (15 - logsizeby32));
+        (((((tmp * quant) >> 16) + tmp) * quant_shift) >> (16 - logsizeby16));
   }
   if (q) {
     *dqcoeff_ptr = ROUND_POWER_OF_TWO(
-        av1_dequant_abscoeff_nuq(q, dequant, dequant_val), 1 + logsizeby32);
+        av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16);
     // *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val) >>
-    // (1 + logsizeby32);
+    // (logsizeby16);
     *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
     *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
   } else {
@@ -123,14 +123,14 @@
 static INLINE int quantize_coeff_bigtx_fp_nuq(
     const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
     const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
-    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby32) {
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby16) {
   const int coeff = coeffv;
   const int coeff_sign = (coeff >> 31);
   const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
   int i, q;
   int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
   for (i = 0; i < NUQ_KNOTS; i++) {
-    if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], 1 + logsizeby32)) {
+    if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) {
       q = i;
       break;
     }
@@ -138,15 +138,15 @@
   if (i == NUQ_KNOTS) {
     q = NUQ_KNOTS +
         ((((int64_t)tmp -
-           ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], 1 + logsizeby32)) *
+           ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16)) *
           quant) >>
-         (15 - logsizeby32));
+         (16 - logsizeby16));
   }
   if (q) {
     *dqcoeff_ptr = ROUND_POWER_OF_TWO(
-        av1_dequant_abscoeff_nuq(q, dequant, dequant_val), 1 + logsizeby32);
+        av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16);
     // *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val) >>
-    // (1 + logsizeby32);
+    // (logsizeby16);
     *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
     *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
   } else {
@@ -205,7 +205,7 @@
     const int rc = 0;
     if (quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant, quant_shift, dequant,
                                  cuml_bins_ptr, dequant_val, qcoeff_ptr,
-                                 dqcoeff_ptr, 0))
+                                 dqcoeff_ptr, get_tx_scale(TX_32X32)))
       eob = 0;
   }
   *eob_ptr = eob + 1;
@@ -225,12 +225,54 @@
     const int rc = 0;
     if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant, dequant,
                                     cuml_bins_ptr, dequant_val, qcoeff_ptr,
-                                    dqcoeff_ptr, 0))
+                                    dqcoeff_ptr, get_tx_scale(TX_32X32)))
       eob = 0;
   }
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_TX64X64
+void quantize_dc_64x64_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,  // 64x64 DC-only NUQ quantizer: zeroes both outputs, then quantizes coeff_ptr[0] alone
+                           int skip_block, const int16_t quant,
+                           const int16_t quant_shift, const int16_t dequant,
+                           const tran_low_t *cuml_bins_ptr,
+                           const tran_low_t *dequant_val,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           uint16_t *eob_ptr) {  // *eob_ptr is written with 0 or 1 (eob + 1)
+  int eob = -1;  // stays -1 unless the helper below returns nonzero
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));  // clear all n_coeffs entries of both outputs up front
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {  // with skip_block set, the outputs stay zeroed and *eob_ptr becomes 0
+    const int rc = 0;  // only index 0 is ever processed
+    if (quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant, quant_shift, dequant,  // NOTE(review): return value presumably means "quantized to nonzero" - confirm against the helper
+                                 cuml_bins_ptr, dequant_val, qcoeff_ptr,
+                                 dqcoeff_ptr, get_tx_scale(TX_64X64)))  // the tx scale is passed as the helper's logsizeby16 argument
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
+void quantize_dc_64x64_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,  // "fp" flavour of the 64x64 DC-only NUQ quantizer: same flow, but takes no quant_shift
+                              int skip_block, const int16_t quant,
+                              const int16_t dequant,
+                              const tran_low_t *cuml_bins_ptr,
+                              const tran_low_t *dequant_val,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              uint16_t *eob_ptr) {  // *eob_ptr is written with 0 or 1 (eob + 1)
+  int eob = -1;  // stays -1 unless the helper below returns nonzero
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));  // clear all n_coeffs entries of both outputs up front
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {  // with skip_block set, the outputs stay zeroed and *eob_ptr becomes 0
+    const int rc = 0;  // only index 0 is ever processed
+    if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant, dequant,  // NOTE(review): return value presumably means "quantized to nonzero" - confirm against the helper
+                                    cuml_bins_ptr, dequant_val, qcoeff_ptr,
+                                    dqcoeff_ptr, get_tx_scale(TX_64X64)))  // the tx scale is passed as the helper's logsizeby16 argument
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif  // CONFIG_TX64X64
+
 void quantize_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                     int skip_block, const int16_t *quant_ptr,
                     const int16_t *quant_shift_ptr, const int16_t *dequant_ptr,
@@ -300,7 +342,8 @@
       if (quantize_coeff_bigtx_nuq(
               coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
               dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
-              dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc], 0))
+              dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc],
+              get_tx_scale(TX_32X32)))
         eob = i;
     }
   }
@@ -325,12 +368,66 @@
       if (quantize_coeff_bigtx_fp_nuq(
               coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
               cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
-              &dqcoeff_ptr[rc], 0))
+              &dqcoeff_ptr[rc], get_tx_scale(TX_32X32)))
         eob = i;
     }
   }
   *eob_ptr = eob + 1;
 }
+
+#if CONFIG_TX64X64
+void quantize_64x64_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          int skip_block, const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr,
+                          const int16_t *dequant_ptr,
+                          const cuml_bins_type_nuq *cuml_bins_ptr,
+                          const dequant_val_type_nuq *dequant_val,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          uint16_t *eob_ptr, const int16_t *scan,
+                          const uint8_t *band) {  // all coeffs, TX_64X64 scale
+  int eob = -1;  // last scan index for which the helper returned nonzero
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));  // clear outputs
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {  // skip_block leaves outputs zeroed and *eob_ptr at 0
+    int i;
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];  // coefficient position visited at scan step i
+      if (quantize_coeff_bigtx_nuq(  // [rc != 0]: entry 0 for rc 0, else 1
+              coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
+              dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
+              dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc],
+              get_tx_scale(TX_64X64)))  // tables above are chosen by band[i]
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last flagged scan index (0 if none)
+}
+
+void quantize_64x64_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *quant_ptr,
+                             const int16_t *dequant_ptr,
+                             const cuml_bins_type_nuq *cuml_bins_ptr,
+                             const dequant_val_type_nuq *dequant_val,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             uint16_t *eob_ptr, const int16_t *scan,
+                             const uint8_t *band) {  // "fp" helper, 64x64
+  int eob = -1;  // last scan index for which the helper returned nonzero
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));  // clear outputs
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {  // skip_block leaves outputs zeroed and *eob_ptr at 0
+    int i;
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];  // coefficient position visited at scan step i
+      if (quantize_coeff_bigtx_fp_nuq(  // [rc != 0]: entry 0 for rc 0, else 1
+              coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
+              cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
+              &dqcoeff_ptr[rc], get_tx_scale(TX_64X64)))
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last flagged scan index (0 if none)
+}
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_NEW_QUANT
 
 void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
@@ -353,24 +450,42 @@
   // obsolete skip_block
   const int skip_block = 0;
 
-  if (qparam->log_scale == 0) {
-    av1_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
-                    p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                    pd->dequant, eob_ptr, sc->scan, sc->iscan
+  switch (qparam->log_scale) {
+    case 0:
+      av1_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
+                      p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                      pd->dequant, eob_ptr, sc->scan, sc->iscan
 #if CONFIG_AOM_QM
-                    ,
-                    qm_ptr, iqm_ptr
+                      ,
+                      qm_ptr, iqm_ptr
 #endif
-                    );
-  } else {
-    av1_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
-                          p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                          pd->dequant, eob_ptr, sc->scan, sc->iscan
+                      );
+      break;
+    case 1:
+      av1_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                            p->round_fp, p->quant_fp, p->quant_shift,
+                            qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+                            sc->scan, sc->iscan
 #if CONFIG_AOM_QM
-                          ,
-                          qm_ptr, iqm_ptr
+                            ,
+                            qm_ptr, iqm_ptr
 #endif
-                          );
+                            );
+      break;
+#if CONFIG_TX64X64
+    case 2:
+      av1_quantize_fp_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                            p->round_fp, p->quant_fp, p->quant_shift,
+                            qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+                            sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+                            ,
+                            qm_ptr, iqm_ptr
+#endif
+                            );
+      break;
+#endif  // CONFIG_TX64X64
+    default: assert(0);
   }
 }
 
@@ -387,24 +502,40 @@
   // obsolete skip_block
   const int skip_block = 0;
 
-  if (qparam->log_scale == 0) {
-    aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, p->quant,
-                   p->quant_shift, qcoeff_ptr, dqcoeff_ptr, pd->dequant,
-                   eob_ptr, sc->scan, sc->iscan
+  switch (qparam->log_scale) {
+    case 0:
+      aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                     pd->dequant, eob_ptr, sc->scan, sc->iscan
 #if CONFIG_AOM_QM
-                   ,
-                   qm_ptr, iqm_ptr
+                     ,
+                     qm_ptr, iqm_ptr
 #endif
-                   );
-  } else {
-    aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
-                         p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                         pd->dequant, eob_ptr, sc->scan, sc->iscan
+                     );
+      break;
+    case 1:
+      aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+                           p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                           pd->dequant, eob_ptr, sc->scan, sc->iscan
 #if CONFIG_AOM_QM
-                         ,
-                         qm_ptr, iqm_ptr
+                           ,
+                           qm_ptr, iqm_ptr
 #endif
-                         );
+                           );
+      break;
+#if CONFIG_TX64X64
+    case 2:
+      aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+                           p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                           pd->dequant, eob_ptr, sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+                           ,
+                           qm_ptr, iqm_ptr
+#endif
+                           );
+      break;
+#endif  // CONFIG_TX64X64
+    default: assert(0);
   }
 }
 
@@ -421,23 +552,38 @@
   // obsolete skip_block
   const int skip_block = 0;
   (void)sc;
-  if (qparam->log_scale == 0) {
-    aom_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
-                    p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
-                    eob_ptr
+
+  switch (qparam->log_scale) {
+    case 0:
+      aom_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
+                      p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
+                      eob_ptr
 #if CONFIG_AOM_QM
-                    ,
-                    qm_ptr, iqm_ptr
+                      ,
+                      qm_ptr, iqm_ptr
 #endif
-                    );
-  } else {
-    aom_quantize_dc_32x32(coeff_ptr, skip_block, p->round, p->quant_fp[0],
-                          qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr
+                      );
+      break;
+    case 1:
+      aom_quantize_dc_32x32(coeff_ptr, skip_block, p->round, p->quant_fp[0],
+                            qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr
 #if CONFIG_AOM_QM
-                          ,
-                          qm_ptr, iqm_ptr
+                            ,
+                            qm_ptr, iqm_ptr
 #endif
-                          );
+                            );
+      break;
+#if CONFIG_TX64X64
+    case 2:  // log_scale == 2 selects the 64x64 DC quantizer
+      aom_quantize_dc_64x64(coeff_ptr, skip_block, p->round, p->quant_fp[0],
+                            qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr
+#if CONFIG_AOM_QM
+                            , qm_ptr, iqm_ptr
+#endif
+                            );
+      break;
+#endif  // CONFIG_TX64X64
+    default: assert(0);
   }
 }
 
@@ -574,28 +720,28 @@
 static INLINE int highbd_quantize_coeff_bigtx_fp_nuq(
     const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
     const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
-    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby32) {
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby16) {
   const int coeff = coeffv;
   const int coeff_sign = (coeff >> 31);
   const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
   int i, q;
   int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
   for (i = 0; i < NUQ_KNOTS; i++) {
-    if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], 1 + logsizeby32)) {
+    if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) {
       q = i;
       break;
     }
   }
   if (i == NUQ_KNOTS) {
     q = NUQ_KNOTS +
-        (int)(((tmp - ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1],
-                                         1 + logsizeby32)) *
+        (int)(((tmp -
+                ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16)) *
                quant) >>
-              (15 - logsizeby32));
+              (16 - logsizeby16));
   }
   if (q) {
     *dqcoeff_ptr = ROUND_POWER_OF_TWO(
-        av1_dequant_abscoeff_nuq(q, dequant, dequant_val), 1 + logsizeby32);
+        av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16);
     *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
     *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
   } else {
@@ -609,26 +755,26 @@
     const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
     const int16_t dequant, const tran_low_t *cuml_bins_ptr,
     const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, int logsizeby32) {
+    tran_low_t *dqcoeff_ptr, int logsizeby16) {
   const int coeff = coeffv;
   const int coeff_sign = (coeff >> 31);
   const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
   int i, q;
   int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
   for (i = 0; i < NUQ_KNOTS; i++) {
-    if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], 1 + logsizeby32)) {
+    if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) {
       q = i;
       break;
     }
   }
   if (i == NUQ_KNOTS) {
-    tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], 1 + logsizeby32);
+    tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16);
     q = NUQ_KNOTS + (int)(((((tmp * quant) >> 16) + tmp) * quant_shift) >>
-                          (15 - logsizeby32));
+                          (16 - logsizeby16));
   }
   if (q) {
     *dqcoeff_ptr = ROUND_POWER_OF_TWO(
-        av1_dequant_abscoeff_nuq(q, dequant, dequant_val), 1 + logsizeby32);
+        av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16);
     *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
     *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
   } else {
@@ -723,7 +869,8 @@
       if (highbd_quantize_coeff_bigtx_nuq(
               coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
               dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
-              dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc], 0))
+              dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc],
+              get_tx_scale(TX_32X32)))
         eob = i;
     }
   }
@@ -749,13 +896,68 @@
       if (highbd_quantize_coeff_bigtx_fp_nuq(
               coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
               cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
-              &dqcoeff_ptr[rc], 0))
+              &dqcoeff_ptr[rc], get_tx_scale(TX_32X32)))
         eob = i;
     }
   }
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_TX64X64
+void highbd_quantize_64x64_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                 int skip_block, const int16_t *quant_ptr,
+                                 const int16_t *quant_shift_ptr,
+                                 const int16_t *dequant_ptr,
+                                 const cuml_bins_type_nuq *cuml_bins_ptr,
+                                 const dequant_val_type_nuq *dequant_val,
+                                 tran_low_t *qcoeff_ptr,
+                                 tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                 const int16_t *scan, const uint8_t *band) {
+  int eob = -1;  // last scan index for which the helper returned nonzero
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));  // clear outputs
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {  // skip_block leaves outputs zeroed and *eob_ptr at 0
+    int i;
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];  // coefficient position visited at scan step i
+      if (highbd_quantize_coeff_bigtx_nuq(  // [rc != 0]: 0 for rc 0, else 1
+              coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
+              dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
+              dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc],
+              get_tx_scale(TX_64X64)))  // passed as the helper's logsizeby16
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last flagged scan index (0 if none)
+}
+
+void highbd_quantize_64x64_fp_nuq_c(const tran_low_t *coeff_ptr,
+                                    intptr_t n_coeffs, int skip_block,
+                                    const int16_t *quant_ptr,
+                                    const int16_t *dequant_ptr,
+                                    const cuml_bins_type_nuq *cuml_bins_ptr,
+                                    const dequant_val_type_nuq *dequant_val,
+                                    tran_low_t *qcoeff_ptr,
+                                    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                    const int16_t *scan, const uint8_t *band) {
+  int eob = -1;  // last scan index for which the helper returned nonzero
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));  // clear outputs
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {  // skip_block leaves outputs zeroed and *eob_ptr at 0
+    int i;
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];  // coefficient position visited at scan step i
+      if (highbd_quantize_coeff_bigtx_fp_nuq(  // [rc != 0]: 0 for rc 0, else 1
+              coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
+              cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
+              &dqcoeff_ptr[rc], get_tx_scale(TX_64X64)))
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last flagged scan index (0 if none)
+}
+#endif  // CONFIG_TX64X64
+
 void highbd_quantize_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                               int skip_block, const int16_t *quant_ptr,
                               const int16_t *dequant_ptr,
@@ -793,7 +995,8 @@
     const int rc = 0;
     if (highbd_quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant, quant_shift,
                                         dequant, cuml_bins_ptr, dequant_val,
-                                        qcoeff_ptr, dqcoeff_ptr, 0))
+                                        qcoeff_ptr, dqcoeff_ptr,
+                                        get_tx_scale(TX_32X32)))
       eob = 0;
   }
   *eob_ptr = eob + 1;
@@ -811,11 +1014,52 @@
     const int rc = 0;
     if (highbd_quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant, dequant,
                                            cuml_bins_ptr, dequant_val,
-                                           qcoeff_ptr, dqcoeff_ptr, 0))
+                                           qcoeff_ptr, dqcoeff_ptr,
+                                           get_tx_scale(TX_32X32)))
       eob = 0;
   }
   *eob_ptr = eob + 1;
 }
+
+#if CONFIG_TX64X64
+void highbd_quantize_dc_64x64_nuq(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t quant, const int16_t quant_shift, const int16_t dequant,
+    const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+  int eob = -1;  // stays -1 unless the helper returns nonzero
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));  // clear outputs
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {  // skip_block leaves outputs zeroed and *eob_ptr at 0
+    const int rc = 0;  // only position 0 is passed to the helper
+    if (highbd_quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant, quant_shift,
+                                        dequant, cuml_bins_ptr, dequant_val,
+                                        qcoeff_ptr, dqcoeff_ptr,
+                                        get_tx_scale(TX_64X64)))
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 1 when the helper returned nonzero, otherwise 0
+}
+
+void highbd_quantize_dc_64x64_fp_nuq(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+    const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+  int eob = -1;  // stays -1 unless the helper returns nonzero
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));  // clear outputs
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {  // skip_block leaves outputs zeroed and *eob_ptr at 0
+    const int rc = 0;  // only position 0 is passed to the helper
+    if (highbd_quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant, dequant,
+                                           cuml_bins_ptr, dequant_val,
+                                           qcoeff_ptr, dqcoeff_ptr,
+                                           get_tx_scale(TX_64X64)))
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 1 when the helper returned nonzero, otherwise 0
+}
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_NEW_QUANT
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
@@ -999,6 +1243,154 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_TX64X64
+void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *zbin_ptr,
+                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan
+#if CONFIG_AOM_QM
+                             ,
+                             const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+                             ) {
+  int i, eob = -1;  // eob: last scan index with a nonzero level, -1 if none
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+  // 64x64 "fp" quantizer: round >> 2, quant >> 14 and dequant / 4 must agree.
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  // Outputs start zeroed, so positions below the threshold need no store.
+  if (!skip_block) {
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+#if CONFIG_AOM_QM
+      const qm_val_t wt = qm_ptr[rc];
+      const qm_val_t iwt = iqm_ptr[rc];
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+      int64_t tmp = 0;
+#endif
+      const int coeff_sign = (coeff >> 31);
+      int tmp32 = 0;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      // Quantize only when |coeff| reaches 1/8 of the dequant step.
+#if CONFIG_AOM_QM
+      if (abs_coeff * wt >= (dequant_ptr[rc != 0] << (AOM_QM_BITS - 3))) {
+#else
+      if (abs_coeff >= (dequant_ptr[rc != 0] >> 3)) {
+#endif
+        abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+        abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+#if CONFIG_AOM_QM
+        tmp = abs_coeff * wt;  // 64-bit product: shift first, then narrow
+        tmp32 = (int)((tmp * quant_ptr[rc != 0]) >> (AOM_QM_BITS + 14));
+        qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4;
+#else
+        tmp32 = (abs_coeff * quant_ptr[rc != 0]) >> 14;  // pairs with the / 4
+        qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant_ptr[rc != 0]) / 4;
+#endif
+      }
+      // tmp32 is the level magnitude; remember the last nonzero scan index.
+      if (tmp32) eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif  // CONFIG_TX64X64
+
+void av1_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                      int skip_block, const int16_t *zbin_ptr,
+                      const int16_t *round_ptr, const int16_t *quant_ptr,
+                      const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                      uint16_t *eob_ptr, const int16_t *scan,
+                      const int16_t *iscan,
+#if CONFIG_AOM_QM
+                      const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+#endif
+                      int log_scale) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+  int round[2] = { round_ptr[0], round_ptr[1] };
+  int nzbins[2];
+  int scale = 1;   // divisor applied to the dequantized value
+  int shift = 16;  // final right shift of the quantizer product
+  (void)iscan;
+  // Quantizes n_coeffs coefficients in scan order; log_scale rescales tables.
+  if (log_scale > 0) {  // scale zbin/round down by 2^log_scale, with rounding
+    zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale);
+    zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale);
+    round[0] = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+    round[1] = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+    scale = 1 << log_scale;
+    shift = 16 - log_scale;
+  }
+  // Negated zero-bin bounds for the signed comparison in the pre-scan.
+  nzbins[0] = zbins[0] * -1;
+  nzbins[1] = zbins[1] * -1;
+  // Start from all-zero outputs; only quantized positions are written.
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  // [rc != 0] below picks table entry 0 for position 0 and entry 1 otherwise.
+  if (!skip_block) {
+    // Pre-scan pass: trim trailing scan positions that lie inside the zbin.
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: coefficients at scan index >= non_zero_count are
+    // skippable (already zeroed above). Note: non_zero_count can be zero.
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+#if CONFIG_AOM_QM
+      const qm_val_t wt = qm_ptr[rc];
+      const qm_val_t iwt = iqm_ptr[rc];
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+#endif
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // 0 or -1, used to restore sign
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+#if CONFIG_AOM_QM
+      if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+#else
+
+      if (abs_coeff >= zbins[rc != 0]) {
+#endif
+        const int64_t tmp1 = abs_coeff + round[rc != 0];
+        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+#if CONFIG_AOM_QM
+        const uint32_t abs_qcoeff = (uint32_t)(
+            (tmp2 * wt * quant_shift_ptr[rc != 0]) >> (AOM_QM_BITS + shift));
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+        dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / scale;
+#else
+        const uint32_t abs_qcoeff =
+            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> shift);
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / scale;
+#endif  // CONFIG_AOM_QM
+        if (abs_qcoeff) eob = i;  // last scan index with a nonzero level
+      }
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last nonzero scan index (0 if none)
+}
+
 #if CONFIG_AOM_HIGHBITDEPTH
 void av1_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              int skip_block, const int16_t *zbin_ptr,

diff --git a/av1/encoder/quantize.h b/av1/encoder/quantize.h
index f5f045e..b13af5a 100644
--- a/av1/encoder/quantize.h
+++ b/av1/encoder/quantize.h

@@ -127,6 +127,15 @@
                            const tran_low_t *dequant_val,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            uint16_t *eob_ptr);
+#if CONFIG_TX64X64
+void quantize_dc_64x64_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                           int skip_block, const int16_t quant,
+                           const int16_t quant_shift, const int16_t dequant,
+                           const tran_low_t *cuml_bins_ptr,
+                           const tran_low_t *dequant_val,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           uint16_t *eob_ptr);
+#endif  // CONFIG_TX64X64
 void quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         int skip_block, const int16_t quant,
                         const int16_t dequant, const tran_low_t *cuml_bins_ptr,
@@ -139,6 +148,15 @@
                               const tran_low_t *dequant_val,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               uint16_t *eob_ptr);
+#if CONFIG_TX64X64
+void quantize_dc_64x64_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                              int skip_block, const int16_t quant,
+                              const int16_t dequant,
+                              const tran_low_t *cuml_bins_ptr,
+                              const tran_low_t *dequant_val,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              uint16_t *eob_ptr);
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_NEW_QUANT
 
 #if CONFIG_AOM_HIGHBITDEPTH
@@ -197,6 +215,13 @@
     const int16_t quant, const int16_t quant_shift, const int16_t dequant,
     const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
+#if CONFIG_TX64X64
+void highbd_quantize_dc_64x64_nuq(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t quant, const int16_t quant_shift, const int16_t dequant,
+    const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
+#endif  // CONFIG_TX64X64
 void highbd_quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                int skip_block, const int16_t quant,
                                const int16_t dequant,
@@ -209,7 +234,13 @@
     const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr,
     const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
-
+#if CONFIG_TX64X64
+void highbd_quantize_dc_64x64_fp_nuq(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+    const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_NEW_QUANT
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 

diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index b354c7d..2b85472 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c

@@ -1022,8 +1022,7 @@
     // not involve an inverse transform, but it is less accurate.
     const int buffer_length = tx_size_2d[tx_size];
     int64_t this_sse;
-    int tx_type = get_tx_type(pd->plane_type, xd, block, tx_size);
-    int shift = (MAX_TX_SCALE - get_tx_scale(xd, tx_type, tx_size)) * 2;
+    int shift = (MAX_TX_SCALE - get_tx_scale(tx_size)) * 2;
     tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
     tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 #if CONFIG_PVQ
@@ -7854,8 +7853,8 @@
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
         continue;
 #else
-        restore_dst_buf(xd, orig_dst, orig_dst_stride);
-        return INT64_MAX;
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
+      return INT64_MAX;
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
       }
       /* clang-format on */
commit	0e11912ae108e8f77eb3b063f3fd5896749fb161	[log] [tgz]
author	Debargha Mukherjee <debargha@google.com>	Fri Nov 04 12:10:23 2016 -0700
committer	Debargha Mukherjee <debargha@google.com>	Wed Nov 09 21:59:14 2016 +0000
tree	44f19045dc74e7ab1befbbf287e6b79e9cb4cfd1
parent	23b120db225b4c973d757e480475a4e9b7f1272c [diff]