Modify RDO for use with Daala TX constant-depth coeffs

Modify the portions of RDO using TX-domain coeff calculations to deal
with TX_COEFF_DEPTH and constant-depth coefficient scaling.  At
present, this represents no functional change.

subset-1:
monty-rest-of-stack-quant-s1@2017-11-13T14:38:43.774Z ->
 monty-rest-of-stack-RDO-s1@2017-11-13T14:39:17.093Z

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0000 |  0.0000 |  0.0000 |   0.0000 | 0.0000 |  0.0000 |     0.0000

objective-1-fast --limit=4:
monty-rest-of-stack-quant-o1f4@2017-11-13T14:38:28.828Z ->
 monty-rest-of-stack-RDO-o1f4@2017-11-13T14:38:57.951Z

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0000 |  0.0000 |  0.0000 |   0.0000 | 0.0000 |  0.0000 |     0.0000

Change-Id: I0fbc45e018f565f48e1fc8fdeabfcd6cb6fa62fe
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 910c1df..aef4fda 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -445,7 +445,9 @@
     }
 
     add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
-    specialize qw/av1_highbd_block_error sse2/;
+    if (aom_config("CONFIG_DAALA_TX") ne "yes") {
+      specialize qw/av1_highbd_block_error sse2/;
+    }
 
     add_proto qw/void av1_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
 
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index dadea37..ca650a2 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -147,7 +147,14 @@
       get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi);
   const int16_t *const scan = scan_order->scan;
   const int16_t *const nb = scan_order->neighbors;
+#if CONFIG_DAALA_TX
+  // This is one of the few places where RDO is done on coeffs; it
+  // expects the coeffs to be in Q3/D11, so we need to scale them.
+  int depth_shift = (TX_COEFF_DEPTH - 11) * 2;
+  int depth_round = depth_shift > 1 ? (1 << depth_shift >> 1) : 0;
+#else
   const int shift = av1_get_tx_scale(tx_size);
+#endif
 #if CONFIG_AOM_QM
   int seg_id = xd->mi[0]->mbmi.segment_id;
   // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
@@ -212,14 +219,19 @@
           tail_token_costs[band_cur][ctx_cur]);
       // accu_error does not change when x==0
     } else {
-      /*  Computing distortion
-       */
-      // compute the distortion for the first candidate
-      // and the distortion for quantizing to 0.
+/*  Computing distortion
+ */
+// compute the distortion for the first candidate
+// and the distortion for quantizing to 0.
+#if CONFIG_DAALA_TX
+      int dx0 = coeff[rc];
+      const int64_t d0 = ((int64_t)dx0 * dx0 + depth_round) >> depth_shift;
+#else
       int dx0 = abs(coeff[rc]) * (1 << shift);
       dx0 >>= xd->bd - 8;
 
       const int64_t d0 = (int64_t)dx0 * dx0;
+#endif
       const int x_a = x - 2 * sz - 1;
       int dqv;
 #if CONFIG_AOM_QM
@@ -233,15 +245,29 @@
       dqv = dequant_ptr[rc != 0];
 #endif
 
+#if CONFIG_DAALA_TX
+      int dx = dqcoeff[rc] - coeff[rc];
+      const int64_t d2 = ((int64_t)dx * dx + depth_round) >> depth_shift;
+#else
       int dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
       dx = signed_shift_right(dx, xd->bd - 8);
       const int64_t d2 = (int64_t)dx * dx;
+#endif
 
       /* compute the distortion for the second candidate
        * x_a = x - 2 * sz + 1;
        */
       int64_t d2_a;
       if (x_a != 0) {
+#if CONFIG_DAALA_TX
+#if CONFIG_NEW_QUANT
+        dx = av1_dequant_coeff_nuq(x, dqv, dequant_val[band_translate[i]]) -
+             coeff[rc];
+#else   // CONFIG_NEW_QUANT
+        dx -= (dqv + sz) ^ sz;
+#endif  // CONFIG_NEW_QUANT
+        d2_a = ((int64_t)dx * dx + depth_round) >> depth_shift;
+#else  // CONFIG_DAALA_TX
 #if CONFIG_NEW_QUANT
         dx = av1_dequant_coeff_nuq(x, dqv, dequant_val[band_translate[i]]) -
              (coeff[rc] * (1 << shift));
@@ -250,9 +276,11 @@
         dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz;
 #endif  // CONFIG_NEW_QUANT
         d2_a = (int64_t)dx * dx;
+#endif  // CONFIG_DAALA_TX
       } else {
         d2_a = d0;
       }
+
       // Computing RD cost
       int64_t base_bits;
       // rate cost of x
@@ -321,6 +349,15 @@
       int dqc_a = 0;
       if (best_x || best_eob_x) {
         if (x_a != 0) {
+#if CONFIG_DAALA_TX
+#if CONFIG_NEW_QUANT
+          dqc_a = av1_dequant_abscoeff_nuq(abs(x_a), dqv,
+                                           dequant_val[band_translate[i]]);
+          if (sz) dqc_a = -dqc_a;
+#else
+          dqc_a = x_a * dqv;
+#endif  // CONFIG_NEW_QUANT
+#else   // CONFIG_DAALA_TX
 #if CONFIG_NEW_QUANT
           dqc_a = av1_dequant_abscoeff_nuq(abs(x_a), dqv,
                                            dequant_val[band_translate[i]]);
@@ -332,9 +369,10 @@
           else
             dqc_a = (x_a * dqv) >> shift;
 #endif  // CONFIG_NEW_QUANT
+#endif  // CONFIG_DAALA_TX
         } else {
           dqc_a = 0;
-        }  // if (x_a != 0)
+        }
       }
 
       // record the better quantized value
diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c
index 2f7481a..6c5f344 100644
--- a/av1/encoder/encodetxb.c
+++ b/av1/encoder/encodetxb.c
@@ -126,8 +126,16 @@
 
 static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
                                      int shift) {
+#if CONFIG_DAALA_TX
+  int depth_shift = (TX_COEFF_DEPTH - 11) * 2;
+  int depth_round = depth_shift > 1 ? (1 << (depth_shift - 1)) : 0;
+  const int64_t diff = tcoeff - dqcoeff;
+  const int64_t error = diff * diff + depth_round >> depth_shift;
+  (void)shift;
+#else
   const int64_t diff = (tcoeff - dqcoeff) * (1 << shift);
   const int64_t error = diff * diff;
+#endif
   return error;
 }
 
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 1a56a93..416171e 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1588,7 +1588,12 @@
                                  int64_t *ssz, int bd) {
   int i;
   int64_t error = 0, sqcoeff = 0;
+#if CONFIG_DAALA_TX
+  (void)bd;
+  int shift = 2 * (TX_COEFF_DEPTH - 11);
+#else
   int shift = 2 * (bd - 8);
+#endif
   int rounding = shift > 0 ? 1 << (shift - 1) : 0;
 
   for (i = 0; i < block_size; i++) {
@@ -1926,7 +1931,13 @@
     // not involve an inverse transform, but it is less accurate.
     const int buffer_length = tx_size_2d[tx_size];
     int64_t this_sse;
+// TX-domain results need to shift down to Q2/D10 to match pixel
+// domain distortion values which are in Q2^2
+#if CONFIG_DAALA_TX
+    int shift = (TX_COEFF_DEPTH - 10) * 2;
+#else
     int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
+#endif
     tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
     tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 
@@ -2106,7 +2117,13 @@
   av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                   coeff_ctx, AV1_XFORM_QUANT_FP);
 
+// TX-domain results need to shift down to Q2/D10 to match pixel
+// domain distortion values which are in Q2^2
+#if CONFIG_DAALA_TX
+  const int shift = (TX_COEFF_DEPTH - 10) * 2;
+#else
   const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
+#endif
   tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
   const int buffer_length = tx_size_2d[tx_size];
@@ -3658,6 +3675,7 @@
   const int coeff_ctx_one_byte = combine_entropy_contexts(*a, *l);
   const uint8_t cur_joint_ctx = (coeff_ctx << 2) + coeff_ctx_one_byte;
 
+  // Note: tmp below is pixel distortion, not TX domain
   tmp = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col,
                         plane_bsize, txm_bsize);
 
@@ -3714,7 +3732,13 @@
   av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                   coeff_ctx, AV1_XFORM_QUANT_FP);
 
+// TX-domain results need to shift down to Q2/D10 to match pixel
+// domain distortion values which are in Q2^2
+#if CONFIG_DAALA_TX
+  const int shift = (TX_COEFF_DEPTH - 10) * 2;
+#else
   const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
+#endif
   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   const int buffer_length = tx_size_2d[tx_size];
   int64_t tmp_dist, tmp_sse;
diff --git a/test/error_block_test.cc b/test/error_block_test.cc
index 227065f..d1eac05 100644
--- a/test/error_block_test.cc
+++ b/test/error_block_test.cc
@@ -156,7 +156,7 @@
       << "First failed at test case " << first_failure;
 }
 
-#if HAVE_SSE2 || HAVE_AVX
+#if (HAVE_SSE2 || HAVE_AVX) && !CONFIG_DAALA_TX
 using std::tr1::make_tuple;
 
 INSTANTIATE_TEST_CASE_P(