Add Daala TX fixed-coeff-depth capability to quantization

This patch completes the work to add fixed-depth TX domain support to
the quantization and dequantization code.  At present, it is active but
configured to behave identically to current AV1 master as RDO and TX
have not yet been updated to also support this functionality.

subset-1:
monty-rest-of-stack-noshift-s1@2017-11-13T14:37:42.541Z ->
 monty-rest-of-stack-quant-s1@2017-11-13T14:38:43.774Z

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0000 |  0.0000 |  0.0000 |   0.0000 | 0.0000 |  0.0000 |     0.0000

objective-1-fast --limit=4:
monty-rest-of-stack-noshift-o1f4@2017-11-13T14:37:16.992Z ->
 monty-rest-of-stack-quant-o1f4@2017-11-13T14:38:28.828Z

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0000 |  0.0000 |  0.0000 |   0.0000 | 0.0000 |  0.0000 |     0.0000

Change-Id: I3773a1fc128136c9fea227f4b547576a8aa6efa3
diff --git a/av1/common/odintrin.h b/av1/common/odintrin.h
index b7d6505..28df7a7 100644
--- a/av1/common/odintrin.h
+++ b/av1/common/odintrin.h
@@ -65,7 +65,11 @@
 /**The maximum number of color planes allowed in a single frame.*/
 # define OD_NPLANES_MAX (3)
 
-# define OD_COEFF_SHIFT (4)
+/* Native coefficient 'bitdepth'; TX is scaled up by (TX_COEFF_DEPTH-bitdepth)
+   such that the real coefficient depth precision is always TX_COEFF_DEPTH
+   regardless of bitdepth or transform size.
+*/
+# define TX_COEFF_DEPTH (11)
 
 # define OD_DISABLE_CFL (1)
 # define OD_DISABLE_FILTER (1)
diff --git a/av1/common/quant_common.c b/av1/common/quant_common.c
index 2ba2da8..b61cfc1 100644
--- a/av1/common/quant_common.c
+++ b/av1/common/quant_common.c
@@ -120,7 +120,7 @@
 }
 #endif  // CONFIG_NEW_QUANT
 
-static const int16_t dc_qlookup[QINDEX_RANGE] = {
+static const int16_t dc_qlookup_Q3[QINDEX_RANGE] = {
   4,    8,    8,    9,    10,  11,  12,  12,  13,  14,  15,   16,   17,   18,
   19,   19,   20,   21,   22,  23,  24,  25,  26,  26,  27,   28,   29,   30,
   31,   32,   32,   33,   34,  35,  36,  37,  38,  38,  39,   40,   41,   42,
@@ -143,7 +143,7 @@
 };
 
 #if CONFIG_HIGHBITDEPTH
-static const int16_t dc_qlookup_10[QINDEX_RANGE] = {
+static const int16_t dc_qlookup_10_Q3[QINDEX_RANGE] = {
   4,    9,    10,   13,   15,   17,   20,   22,   25,   28,   31,   34,   37,
   40,   43,   47,   50,   53,   57,   60,   64,   68,   71,   75,   78,   82,
   86,   90,   93,   97,   101,  105,  109,  113,  116,  120,  124,  128,  132,
@@ -166,7 +166,7 @@
   3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347,
 };
 
-static const int16_t dc_qlookup_12[QINDEX_RANGE] = {
+static const int16_t dc_qlookup_12_Q3[QINDEX_RANGE] = {
   4,     12,    18,    25,    33,    41,    50,    60,    70,    80,    91,
   103,   115,   127,   140,   153,   166,   180,   194,   208,   222,   237,
   251,   266,   281,   296,   312,   327,   343,   358,   374,   390,   405,
@@ -194,7 +194,7 @@
 };
 #endif
 
-static const int16_t ac_qlookup[QINDEX_RANGE] = {
+static const int16_t ac_qlookup_Q3[QINDEX_RANGE] = {
   4,    8,    9,    10,   11,   12,   13,   14,   15,   16,   17,   18,   19,
   20,   21,   22,   23,   24,   25,   26,   27,   28,   29,   30,   31,   32,
   33,   34,   35,   36,   37,   38,   39,   40,   41,   42,   43,   44,   45,
@@ -218,7 +218,7 @@
 };
 
 #if CONFIG_HIGHBITDEPTH
-static const int16_t ac_qlookup_10[QINDEX_RANGE] = {
+static const int16_t ac_qlookup_10_Q3[QINDEX_RANGE] = {
   4,    9,    11,   13,   16,   18,   21,   24,   27,   30,   33,   37,   40,
   44,   48,   51,   55,   59,   63,   67,   71,   75,   79,   83,   88,   92,
   96,   100,  105,  109,  114,  118,  122,  127,  131,  136,  140,  145,  149,
@@ -241,7 +241,7 @@
   6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312,
 };
 
-static const int16_t ac_qlookup_12[QINDEX_RANGE] = {
+static const int16_t ac_qlookup_12_Q3[QINDEX_RANGE] = {
   4,     13,    19,    27,    35,    44,    54,    64,    75,    87,    99,
   112,   126,   139,   154,   168,   183,   199,   214,   230,   247,   263,
   280,   297,   314,   331,   349,   366,   384,   402,   420,   438,   456,
@@ -269,52 +269,90 @@
 };
 #endif
 
-int16_t av1_dc_quant(int qindex, int delta, aom_bit_depth_t bit_depth) {
+#if !CONFIG_DAALA_TX
+
+// Coefficient scaling and quantization with AV1 TX are tailored to
+// the AV1 TX transforms.  Regardless of the bit-depth of the input,
+// the transform stages scale the coefficient values up by a factor of
+// 8 (3 bits) over the scale of the pixel values.  Thus, for 8-bit
+// input, the coefficients have effectively 11 bits of scale depth
+// (8+3), 10-bit input pixels result in 13-bit coefficient depth
+// (10+3) and 12-bit pixels yield 15-bit (12+3) coefficient depth.
+// All quantizers are built using this invariant of x8, 3-bit scaling,
+// thus the Q3 suffix.
+
+// A partial exception to this rule is large transforms; to avoid
+// overflow, TX blocks with > 256 pels (>16x16) are scaled only
+// 4-times unity (2 bits) over the pixel depth, and TX blocks with
+// over 1024 pixels (>32x32) are scaled up only 2x unity (1 bit).
+// This descaling is found via av1_tx_get_scale().  Thus, 16x32, 32x16
+// and 32x32 transforms actually return Q2 coefficients, and 32x64,
+// 64x32 and 64x64 transforms return Q1 coefficients.  However, the
+// quantizers are de-scaled down on-the-fly by the same amount
+// (av1_tx_get_scale()) during quantization, and as such the
+// quantized/coded coefficients, even for large TX blocks, are always
+// effectively Q3.
+
+// Note that encoder decision making (which uses the quantizer to
+// generate several bespoke lambdas for RDO and other heuristics)
+// expects quantizers to be larger for higher-bitdepth input.  In
+// addition, the minimum allowable quantizer is 4; smaller values will
+// underflow to 0 in the actual quantization routines.
+
+int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
 #if CONFIG_HIGHBITDEPTH
   switch (bit_depth) {
-    case AOM_BITS_8: return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
-    case AOM_BITS_10: return dc_qlookup_10[clamp(qindex + delta, 0, MAXQ)];
-    case AOM_BITS_12: return dc_qlookup_12[clamp(qindex + delta, 0, MAXQ)];
+    case AOM_BITS_8: return dc_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)];
+    case AOM_BITS_10: return dc_qlookup_10_Q3[clamp(qindex + delta, 0, MAXQ)];
+    case AOM_BITS_12: return dc_qlookup_12_Q3[clamp(qindex + delta, 0, MAXQ)];
     default:
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
       return -1;
   }
 #else
   (void)bit_depth;
-  return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
+  return dc_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)];
 #endif
 }
 
-int16_t av1_ac_quant(int qindex, int delta, aom_bit_depth_t bit_depth) {
+int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
 #if CONFIG_HIGHBITDEPTH
   switch (bit_depth) {
-    case AOM_BITS_8: return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];
-    case AOM_BITS_10: return ac_qlookup_10[clamp(qindex + delta, 0, MAXQ)];
-    case AOM_BITS_12: return ac_qlookup_12[clamp(qindex + delta, 0, MAXQ)];
+    case AOM_BITS_8: return ac_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)];
+    case AOM_BITS_10: return ac_qlookup_10_Q3[clamp(qindex + delta, 0, MAXQ)];
+    case AOM_BITS_12: return ac_qlookup_12_Q3[clamp(qindex + delta, 0, MAXQ)];
     default:
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
       return -1;
   }
 #else
   (void)bit_depth;
-  return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];
+  return ac_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)];
 #endif
 }
 
-int16_t av1_qindex_from_ac(int ac, aom_bit_depth_t bit_depth) {
+// In AV1 TX, the coefficients are always scaled up a factor of 8 (3
+// bits), so QTX == Q3.
+
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  return av1_dc_quant_Q3(qindex, delta, bit_depth);
+}
+
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  return av1_ac_quant_Q3(qindex, delta, bit_depth);
+}
+
+int16_t av1_qindex_from_ac_Q3(int ac_Q3, aom_bit_depth_t bit_depth) {
   int i;
-  const int16_t *tab = ac_qlookup;
-  ac *= 4;
+  const int16_t *tab = ac_qlookup_Q3;
 #if CONFIG_HIGHBITDEPTH
   switch (bit_depth) {
     case AOM_BITS_10: {
-      tab = ac_qlookup_10;
-      ac *= 4;
+      tab = ac_qlookup_10_Q3;
       break;
     }
     case AOM_BITS_12: {
-      tab = ac_qlookup_12;
-      ac *= 16;
+      tab = ac_qlookup_12_Q3;
       break;
     }
     default:
@@ -324,11 +362,88 @@
 #endif
   (void)bit_depth;
   for (i = 0; i < QINDEX_RANGE; i++) {
-    if (ac <= tab[i]) return i;
+    if (ac_Q3 <= tab[i]) return i;
   }
   return QINDEX_RANGE - 1;
 }
 
+#else   // CONFIG_DAALA_TX
+
+// Daala TX uses a constant effective coefficient depth
+// (TX_COEFF_DEPTH) regardless of input pixel bitdepth or transform
+// size. This means that coefficient scale and range is identical
+// regardless of the bit depth of the pixel input.  However, the
+// existing encoder heuristics and RDO loop were built expecting a
+// quantizer that scales with bitdepth, treating it more as a
+// proto-lambda than a quantizer.  The assumption that quantizer scale
+// increases with bitdepth is spread throughout the encoder.
+
+// For this reason, we need to be able to find an old-style 'Q3'
+// quantizer that scales with pixel depth (to be used in encoder
+// decision making) as well as the literal quantizer that is used in
+// actual quantization/dequantization.  That is centralized here.
+
+// Right now, the existing quantization code and setup are not
+// particularly well suited to Daala TX.  The scale range used by, eg,
+// the 12 bit lookups is intentionally larger in order to provide more
+// fine control at the top end of the quality range, as 12-bit input
+// would be assumed to offer a lower noise floor than an 8-bit input.
+// However, the 12-bit lookups assume an effective 15-bit TX depth,
+// while we intend to run Daala TX somewhere between 12 and 14.  We
+// can't simply scale it down, because this would violate the minimum
+// allowable quantizer in the current code (4).
+
+// As such, we do the simplest thing for the time being: Always use
+// the 8-bit scale range for all inputs and scale the QTX and Q3
+// returns accordingly, which will always be no-ops or upshifts.  This
+// might well work well enough; if not, we'll need to patch quantizer
+// scaling to extend the high-bitdepth quality range upward at some
+// later date.
+
+int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  assert(bit_depth >= 8);
+  return qindex == 0 ? dc_qlookup_Q3[0]
+                     :  // Do not scale lossless
+             dc_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)] *
+                 (1 << (bit_depth - 8));
+}
+
+int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  assert(bit_depth >= 8);
+  return qindex == 0 ? ac_qlookup_Q3[0]
+                     :  // Do not scale lossless
+             ac_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)] *
+                 (1 << (bit_depth - 8));
+}
+
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  (void)bit_depth;
+  return qindex == 0 ? dc_qlookup_Q3[0]
+                     :  // Do not scale lossless
+             dc_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)] *
+                 (1 << (TX_COEFF_DEPTH - 11));
+}
+
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  (void)bit_depth;
+  return qindex == 0 ? ac_qlookup_Q3[0]
+                     :  // Do not scale lossless
+             ac_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)] *
+                 (1 << (TX_COEFF_DEPTH - 11));
+}
+
+int16_t av1_qindex_from_ac_Q3(int ac_QTX, aom_bit_depth_t bit_depth) {
+  int i;
+  const int16_t *tab = ac_qlookup_Q3;
+  int scale = (1 << (TX_COEFF_DEPTH - 11));
+  (void)bit_depth;
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    if (ac_QTX <= tab[i] * scale) return i;
+  }
+  return QINDEX_RANGE - 1;
+}
+#endif  // !CONFIG_DAALA_TX
+
 int av1_get_qindex(const struct segmentation *seg, int segment_id,
 #if CONFIG_Q_SEGMENTATION
                    int q_segment_id, int base_qindex)
diff --git a/av1/common/quant_common.h b/av1/common/quant_common.h
index dee105c..8261a71 100644
--- a/av1/common/quant_common.h
+++ b/av1/common/quant_common.h
@@ -37,9 +37,11 @@
 
 struct AV1Common;
 
-int16_t av1_dc_quant(int qindex, int delta, aom_bit_depth_t bit_depth);
-int16_t av1_ac_quant(int qindex, int delta, aom_bit_depth_t bit_depth);
-int16_t av1_qindex_from_ac(int ac, aom_bit_depth_t bit_depth);
+int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_qindex_from_ac_Q3(int ac_Q3, aom_bit_depth_t bit_depth);
 
 int av1_get_qindex(const struct segmentation *seg, int segment_id,
 #if CONFIG_Q_SEGMENTATION
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 91b14ad..f2baf22 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -394,14 +394,6 @@
   aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag);
 }
 
-// Converts a Q3 quantizer lookup from static configuration to the
-// actual TX scaling in use
-static int dequant_Q3_to_QTX(int q3, int bd) {
-  // Right now, TX scale in use is still Q3
-  (void)bd;
-  return q3;
-}
-
 static void decode_token_and_recon_block(AV1Decoder *const pbi,
                                          MACROBLOCKD *const xd, int mi_row,
                                          int mi_col, aom_reader *r,
@@ -439,12 +431,10 @@
                    : (j == 1 ? cm->u_dc_delta_q : cm->v_dc_delta_q);
         const int ac_delta_q =
             j == 0 ? 0 : (j == 1 ? cm->u_ac_delta_q : cm->v_ac_delta_q);
-        xd->plane[j].seg_dequant_QTX[i][0] = dequant_Q3_to_QTX(
-            av1_dc_quant(current_qindex, dc_delta_q, cm->bit_depth),
-            cm->bit_depth);
-        xd->plane[j].seg_dequant_QTX[i][1] = dequant_Q3_to_QTX(
-            av1_ac_quant(current_qindex, ac_delta_q, cm->bit_depth),
-            cm->bit_depth);
+        xd->plane[j].seg_dequant_QTX[i][0] =
+            av1_dc_quant_QTX(current_qindex, dc_delta_q, cm->bit_depth);
+        xd->plane[j].seg_dequant_QTX[i][1] =
+            av1_ac_quant_QTX(current_qindex, ac_delta_q, cm->bit_depth);
       }
     }
   }
@@ -1329,18 +1319,17 @@
 #else
     const int qindex = av1_get_qindex(&cm->seg, i, cm->base_qindex);
 #endif
-    cm->y_dequant_QTX[i][0] = dequant_Q3_to_QTX(
-        av1_dc_quant(qindex, cm->y_dc_delta_q, cm->bit_depth), cm->bit_depth);
-    cm->y_dequant_QTX[i][1] = dequant_Q3_to_QTX(
-        av1_ac_quant(qindex, 0, cm->bit_depth), cm->bit_depth);
-    cm->u_dequant_QTX[i][0] = dequant_Q3_to_QTX(
-        av1_dc_quant(qindex, cm->u_dc_delta_q, cm->bit_depth), cm->bit_depth);
-    cm->u_dequant_QTX[i][1] = dequant_Q3_to_QTX(
-        av1_ac_quant(qindex, cm->u_ac_delta_q, cm->bit_depth), cm->bit_depth);
-    cm->v_dequant_QTX[i][0] = dequant_Q3_to_QTX(
-        av1_dc_quant(qindex, cm->v_dc_delta_q, cm->bit_depth), cm->bit_depth);
-    cm->v_dequant_QTX[i][1] = dequant_Q3_to_QTX(
-        av1_ac_quant(qindex, cm->v_ac_delta_q, cm->bit_depth), cm->bit_depth);
+    cm->y_dequant_QTX[i][0] =
+        av1_dc_quant_QTX(qindex, cm->y_dc_delta_q, cm->bit_depth);
+    cm->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, cm->bit_depth);
+    cm->u_dequant_QTX[i][0] =
+        av1_dc_quant_QTX(qindex, cm->u_dc_delta_q, cm->bit_depth);
+    cm->u_dequant_QTX[i][1] =
+        av1_ac_quant_QTX(qindex, cm->u_ac_delta_q, cm->bit_depth);
+    cm->v_dequant_QTX[i][0] =
+        av1_dc_quant_QTX(qindex, cm->v_dc_delta_q, cm->bit_depth);
+    cm->v_dequant_QTX[i][1] =
+        av1_ac_quant_QTX(qindex, cm->v_ac_delta_q, cm->bit_depth);
 #if CONFIG_AOM_QM
     const int lossless = qindex == 0 && cm->y_dc_delta_q == 0 &&
                          cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 &&
diff --git a/av1/encoder/aq_complexity.c b/av1/encoder/aq_complexity.c
index 33e53bc..935638e 100644
--- a/av1/encoder/aq_complexity.c
+++ b/av1/encoder/aq_complexity.c
@@ -43,7 +43,7 @@
 
 static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) {
   // Approximate base quatizer (truncated to int)
-  const int base_quant = av1_ac_quant(q_index, 0, bit_depth) / 4;
+  const int base_quant = av1_ac_quant_Q3(q_index, 0, bit_depth) / 4;
   return (base_quant > 10) + (base_quant > 25);
 }
 
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c
index 8c4a5db..de43f4e 100644
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c
@@ -1516,7 +1516,7 @@
 }
 
 static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) {
-  const int quant = av1_dc_quant(q, 0, bit_depth);
+  const int quant = av1_dc_quant_Q3(q, 0, bit_depth);
 #if CONFIG_HIGHBITDEPTH
   switch (bit_depth) {
     case AOM_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80);
@@ -1545,10 +1545,11 @@
     for (i = 0; i < 2; ++i) {
       int qrounding_factor_fp = 64;
       // y quantizer setup with original coeff shift of Q3
-      quant_Q3 = i == 0 ? av1_dc_quant(q, y_dc_delta_q, bit_depth)
-                        : av1_ac_quant(q, 0, bit_depth);
-      // y quantizer with TX scale; right now, it's still Q3 as above;
-      quant_QTX = quant_Q3;
+      quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, y_dc_delta_q, bit_depth)
+                        : av1_ac_quant_Q3(q, 0, bit_depth);
+      // y quantizer with TX scale
+      quant_QTX = i == 0 ? av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth)
+                         : av1_ac_quant_QTX(q, 0, bit_depth);
       invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i],
                    quant_QTX);
       quants->y_quant_fp[q][i] = (1 << 16) / quant_QTX;
@@ -1559,10 +1560,11 @@
       deq->y_dequant_Q3[q][i] = quant_Q3;
 
       // u quantizer setup with original coeff shift of Q3
-      quant_Q3 = i == 0 ? av1_dc_quant(q, u_dc_delta_q, bit_depth)
-                        : av1_ac_quant(q, u_ac_delta_q, bit_depth);
-      // u quantizer with TX scale; right now, it's still Q3 as above;
-      quant_QTX = quant_Q3;
+      quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, u_dc_delta_q, bit_depth)
+                        : av1_ac_quant_Q3(q, u_ac_delta_q, bit_depth);
+      // u quantizer with TX scale
+      quant_QTX = i == 0 ? av1_dc_quant_QTX(q, u_dc_delta_q, bit_depth)
+                         : av1_ac_quant_QTX(q, u_ac_delta_q, bit_depth);
       invert_quant(&quants->u_quant[q][i], &quants->u_quant_shift[q][i],
                    quant_QTX);
       quants->u_quant_fp[q][i] = (1 << 16) / quant_QTX;
@@ -1573,10 +1575,11 @@
       deq->u_dequant_Q3[q][i] = quant_Q3;
 
       // v quantizer setup with original coeff shift of Q3
-      quant_Q3 = i == 0 ? av1_dc_quant(q, v_dc_delta_q, bit_depth)
-                        : av1_ac_quant(q, v_ac_delta_q, bit_depth);
-      // v quantizer with TX scale; right now, it's still Q3 as above;
-      quant_QTX = quant_Q3;
+      quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, v_dc_delta_q, bit_depth)
+                        : av1_ac_quant_Q3(q, v_ac_delta_q, bit_depth);
+      // v quantizer with TX scale
+      quant_QTX = i == 0 ? av1_dc_quant_QTX(q, v_dc_delta_q, bit_depth)
+                         : av1_ac_quant_QTX(q, v_ac_delta_q, bit_depth);
       invert_quant(&quants->v_quant[q][i], &quants->v_quant_shift[q][i],
                    quant_QTX);
       quants->v_quant_fp[q][i] = (1 << 16) / quant_QTX;
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index cf6ccca..c9c8ca9 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -329,7 +329,7 @@
   int chroma_cdef = xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
                     xd->plane[2].subsampling_x == xd->plane[2].subsampling_y;
   quantizer =
-      av1_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8);
+      av1_ac_quant_Q3(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8);
   lambda = .12 * quantizer * quantizer / 256.;
 
   av1_setup_dst_planes(xd->plane, cm->sb_size, frame, 0, 0);
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c
index 35276d7..0fcda6c 100644
--- a/av1/encoder/picklpf.c
+++ b/av1/encoder/picklpf.c
@@ -414,7 +414,7 @@
   } else if (method >= LPF_PICK_FROM_Q) {
     const int min_filter_level = 0;
     const int max_filter_level = av1_get_max_filter_level(cpi);
-    const int q = av1_ac_quant(cm->base_qindex, 0, cm->bit_depth);
+    const int q = av1_ac_quant_Q3(cm->base_qindex, 0, cm->bit_depth);
 // These values were determined by linear fitting the result of the
 // searched level for 8 bit depth:
 // Keyframes: filt_guess = q * 0.06699 - 1.60817
diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index 4bde2fa..0e5b4a4 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -157,15 +157,15 @@
 // Convert the index to a real Q value (scaled down to match old Q values)
 #if CONFIG_HIGHBITDEPTH
   switch (bit_depth) {
-    case AOM_BITS_8: return av1_ac_quant(qindex, 0, bit_depth) / 4.0;
-    case AOM_BITS_10: return av1_ac_quant(qindex, 0, bit_depth) / 16.0;
-    case AOM_BITS_12: return av1_ac_quant(qindex, 0, bit_depth) / 64.0;
+    case AOM_BITS_8: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 4.0;
+    case AOM_BITS_10: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 16.0;
+    case AOM_BITS_12: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 64.0;
     default:
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
       return -1.0;
   }
 #else
-  return av1_ac_quant(qindex, 0, bit_depth) / 4.0;
+  return av1_ac_quant_Q3(qindex, 0, bit_depth) / 4.0;
 #endif
 }
 
diff --git a/av1/encoder/ratectrl_xiph.c b/av1/encoder/ratectrl_xiph.c
index b9f8275..94d2022 100644
--- a/av1/encoder/ratectrl_xiph.c
+++ b/av1/encoder/ratectrl_xiph.c
@@ -367,25 +367,47 @@
   /*All of these initial scale/exp values are from Theora, and have not yet
      been adapted to Daala, so they're certainly wrong.
     The B-frame values especially are simply copies of the P-frame values.*/
+
+  /*XXXXX: This constant initialization, apart from tuning, is very
+    likely also scaled incorrectly.
+
+    In Theora, where these constants come from, (bits/pixel) ==
+    scale*((q_Q2/4)^-(exp/64))
+
+    This can be derived from looking at the update formula in
+    od_enc_rc_update_state().
+
+    I.e., we have a quantizer normalized to Q0 for 8-bit pixel values,
+    which we exponentiate. To get the same behavior here, we need to
+    continue normalizing the quantizer the same way. Otherwise we'll have
+    to exponentiate any scaling baked into the quantizer as well (which
+    seems complicated and unnecessary).
+
+    If we have normalized the quantizer before exponentiation, then
+    the initializers for log_scale[] should not depend on bit depth or
+    coefficient depth in any way.
+
+    This is now restored to the initialization as it was in Theora,
+    and should be revisited/rederived/corrected for AV1.*/
   if (ibpp < 1) {
     rc->exp[OD_I_FRAME] = 59;
-    rc->log_scale[OD_I_FRAME] = od_blog64(1997) - OD_Q57(OD_COEFF_SHIFT);
+    rc->log_scale[OD_I_FRAME] = od_blog64(1997) - OD_Q57(8);
   } else if (ibpp < 2) {
     rc->exp[OD_I_FRAME] = 55;
-    rc->log_scale[OD_I_FRAME] = od_blog64(1604) - OD_Q57(OD_COEFF_SHIFT);
+    rc->log_scale[OD_I_FRAME] = od_blog64(1604) - OD_Q57(8);
   } else {
     rc->exp[OD_I_FRAME] = 48;
-    rc->log_scale[OD_I_FRAME] = od_blog64(834) - OD_Q57(OD_COEFF_SHIFT);
+    rc->log_scale[OD_I_FRAME] = od_blog64(834) - OD_Q57(8);
   }
   if (ibpp < 4) {
     rc->exp[OD_P_FRAME] = 100;
-    rc->log_scale[OD_P_FRAME] = od_blog64(2249) - OD_Q57(OD_COEFF_SHIFT);
+    rc->log_scale[OD_P_FRAME] = od_blog64(2249) - OD_Q57(8);
   } else if (ibpp < 8) {
     rc->exp[OD_P_FRAME] = 95;
-    rc->log_scale[OD_P_FRAME] = od_blog64(1751) - OD_Q57(OD_COEFF_SHIFT);
+    rc->log_scale[OD_P_FRAME] = od_blog64(1751) - OD_Q57(8);
   } else {
     rc->exp[OD_P_FRAME] = 73;
-    rc->log_scale[OD_P_FRAME] = od_blog64(1260) - OD_Q57(OD_COEFF_SHIFT);
+    rc->log_scale[OD_P_FRAME] = od_blog64(1260) - OD_Q57(8);
   }
   /*Golden P-frames both use the same log_scale and exp modeling
      values as regular P-frames and the same scale follower.
@@ -810,7 +832,11 @@
         calculation, that needs to be modulated as well.
         Calculate what is, effectively, a fractional coded quantizer. */
       /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/
-      log_quantizer = od_blog64(rc->base_quantizer) - OD_Q57(OD_COEFF_SHIFT);
+      /*XXXXX: See the above XXXXX comment in rate control
+        initialization; the scaling on the log-quantizer calculation
+        should be the same as in quantizer scale initialization, but
+        OD_Q57(8) is possibly the incorrect value. */
+      log_quantizer = od_blog64(rc->base_quantizer) - OD_Q57(8);
       /*log_quantizer to Q21.*/
       log_quantizer >>= 36;
       /*scale log quantizer, result is Q33.*/
@@ -824,7 +850,7 @@
       /*Back to log2 quantizer in Q57.*/
       log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) *
                           OD_LOG_QUANTIZER_EXP_Q12 +
-                      OD_Q57(OD_COEFF_SHIFT);
+                      OD_Q57(8);
       /*Convert Q57 log2 quantizer to unclamped linear target quantizer value.*/
       rc->target_quantizer = od_bexp64(log_quantizer);
     }
@@ -881,7 +907,11 @@
       for (i = 0; i < OD_FRAME_NSUBTYPES; i++) {
         /*Modulate base quantizer by frame type.*/
         /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/
-        log_quantizer = log_base_quantizer - OD_Q57(OD_COEFF_SHIFT);
+        /*XXXXX: See the above XXXXX comment in rate control
+          initialization; the scaling on the log-quantizer calculation
+          should be the same as in quantizer scale initialization, but
+          OD_Q57(8) is possibly the incorrect value. */
+        log_quantizer = log_base_quantizer - OD_Q57(8);
         /*log_quantizer to Q21.*/
         log_quantizer >>= 36;
         /*scale log quantizer, result is Q33.*/
@@ -895,7 +925,7 @@
         /*Back to log2 quantizer in Q57.*/
         log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) *
                             OD_LOG_QUANTIZER_EXP_Q12 +
-                        OD_Q57(OD_COEFF_SHIFT);
+                        OD_Q57(8);
         /*Clamp modulated quantizer values.*/
         log_quantizer = OD_CLAMPI(od_blog64(lossy_quantizer_min), log_quantizer,
                                   od_blog64(lossy_quantizer_max));
@@ -924,7 +954,11 @@
     /*Modulate chosen base quantizer to produce target quantizer.*/
     log_quantizer = od_blog64(base_quantizer);
     /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/
-    log_quantizer -= OD_Q57(OD_COEFF_SHIFT);
+    /*XXXXX: See the above XXXXX comment in rate control
+      initialization; the scaling on the log-quantizer calculation
+      should be the same as in quantizer scale initialization, but
+      OD_Q57(8) is possibly the incorrect value. */
+    log_quantizer -= OD_Q57(8);
     /*log_quantizer to Q21.*/
     log_quantizer >>= 36;
     /*scale log quantizer, result is Q33.*/
@@ -938,7 +972,7 @@
     /*Back to log2 quantizer in Q57.*/
     log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) *
                         OD_LOG_QUANTIZER_EXP_Q12 +
-                    OD_Q57(OD_COEFF_SHIFT);
+                    OD_Q57(8);
     /*Clamp modulated quantizer values.*/
     log_quantizer = OD_CLAMPI(od_blog64(lossy_quantizer_min), log_quantizer,
                               od_blog64(lossy_quantizer_max));
@@ -1023,7 +1057,18 @@
   }
   *bottom_idx = lossy_quantizer_min;
   *top_idx = lossy_quantizer_max;
-  rc->target_quantizer = av1_qindex_from_ac(
+  /*XXXXXX: the store back to rc->target_quantizer just seems
+    wrong. target_quantizer is used as an actual linear quantizer
+    (like base_quantizer, I think it should be scaled the same way as
+    a Q0 quantizer for 8-bit inputs). But av1_qindex_from_ac*()
+    returns a quantizer index, which is completely incomparable.
+
+    Passing rc->target_quantizer directly to av1_qindex_from_ac_Q3 is
+    also probably incorrect. If we move to storing a value scaled the
+    same way as a Q0 quantizer for 8-bit inputs, then it should just
+    be rc->target_quantizer << (TX_COEFF_DEPTH - 8) for DAALA_TX, and
+    something depending on the bit depth for !DAALA_TX. */
+  rc->target_quantizer = av1_qindex_from_ac_Q3(
       OD_CLAMPI(lossy_quantizer_min, rc->target_quantizer, lossy_quantizer_max),
       rc->bit_depth);
   return rc->target_quantizer;
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index 800d7fb..5571602 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -338,7 +338,7 @@
 };
 
 int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) {
-  const int64_t q = av1_dc_quant(qindex, 0, cpi->common.bit_depth);
+  const int64_t q = av1_dc_quant_Q3(qindex, 0, cpi->common.bit_depth);
 #if CONFIG_HIGHBITDEPTH
   int64_t rdmult = 0;
   switch (cpi->common.bit_depth) {
@@ -368,16 +368,16 @@
   double q;
 #if CONFIG_HIGHBITDEPTH
   switch (bit_depth) {
-    case AOM_BITS_8: q = av1_dc_quant(qindex, 0, AOM_BITS_8) / 4.0; break;
-    case AOM_BITS_10: q = av1_dc_quant(qindex, 0, AOM_BITS_10) / 16.0; break;
-    case AOM_BITS_12: q = av1_dc_quant(qindex, 0, AOM_BITS_12) / 64.0; break;
+    case AOM_BITS_8: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_8) / 4.0; break;
+    case AOM_BITS_10: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_10) / 16.0; break;
+    case AOM_BITS_12: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_12) / 64.0; break;
     default:
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
       return -1;
   }
 #else
   (void)bit_depth;
-  q = av1_dc_quant(qindex, 0, AOM_BITS_8) / 4.0;
+  q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_8) / 4.0;
 #endif  // CONFIG_HIGHBITDEPTH
   // TODO(debargha): Adjust the function below.
   return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
@@ -1295,7 +1295,7 @@
 
 int av1_get_intra_cost_penalty(int qindex, int qdelta,
                                aom_bit_depth_t bit_depth) {
-  const int q = av1_dc_quant(qindex, qdelta, bit_depth);
+  const int q = av1_dc_quant_Q3(qindex, qdelta, bit_depth);
 #if CONFIG_HIGHBITDEPTH
   switch (bit_depth) {
     case AOM_BITS_8: return 20 * q;
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 63601c6..a381098 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -4674,8 +4674,9 @@
     av1_fwd_txfm(p->src_diff, DCT_coefs, bw, &param);
 #endif  // CONFIG_TXMG
 
-  uint32_t dc = (uint32_t)av1_dc_quant(x->qindex, 0, xd->bd);
-  uint32_t ac = (uint32_t)av1_ac_quant(x->qindex, 0, xd->bd);
+  // Operating on TX domain, not pixels; we want the QTX quantizers
+  uint32_t dc = (uint32_t)av1_dc_quant_QTX(x->qindex, 0, xd->bd);
+  uint32_t ac = (uint32_t)av1_ac_quant_QTX(x->qindex, 0, xd->bd);
   uint32_t max_quantized_coef = (100 * (uint32_t)abs(DCT_coefs[0])) / dc;
   for (int i = 1; i < bw * bh; i++) {
     uint32_t cur_quantized_coef = (100 * (uint32_t)abs(DCT_coefs[i])) / ac;