Add a new experiment, DAALA_DIST

This commit adds a new experiment, DAALA_DIST, which uses Daala's
distortion function to better approximate perceptual distortion
in 8x8 pixel blocks.

This experiment is expected to work best with PVQ.

It measures the variance of overlapping 4x4 regions in the 8x8 area,
then uses these variances to scale the frequency-weighted MSE of the
8x8 block.
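
As a rough sketch of that scaling step (simplified from od_compute_dist_8x8()
in the diff below; the helper name and the split into precomputed inputs are
illustrative only, while the constants are the ones used there):

    #include <math.h>

    /* Hypothetical helper: combines a frequency-weighted SSE of the 8x8
       error with a variance-mismatch term, and scales the sum by an
       activity factor derived from the minimum 4x4 variance. */
    static double sketch_daala_dist_8x8(double weighted_freq_sse,
                                        double variance_mismatch,
                                        double min_var_4x4) {
      /* 1.62 is a calibration constant, 0.25 a noise floor and 1/6 the
         activity masking exponent (see od_compute_dist_8x8()). */
      const double activity = 1.62 * pow(0.25 + min_var_4x4, -1.0 / 6);
      return activity * activity * (weighted_freq_sse + variance_mismatch);
    }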

Since AV1 calculates distortion in blocks as small as 4x4, it is not
possible to directly replace AV1's existing distortion functions,
such as dist_block() and block_rd_txfm().
Hence, substantial changes were required in order to apply
Daala's 8x8 distortion function.
The Daala distortion function is applied
after all 4x4 tx blocks in an 8x8 block are encoded (during RDO),
in the following two cases (a short sketch of the shared bookkeeping
follows the list):
1) intra/inter sub8x8 predictions, and
2) 4x4 transforms with prediction size >= 8.
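
A minimal sketch of that bookkeeping (hypothetical helper; the real logic
lives in block_rd_txfm(), x->rate_4x4[] and block_8x8_rd_txfm_daala_dist()
below, with 4x4 blocks indexed in raster order within the plane block):

    /* Once the bottom-right 4x4 block of an 8x8 area has been coded, its
       raster index identifies the other three 4x4 blocks; their stored
       rates are summed, and the Daala distortion is then computed once
       for the whole reconstructed 8x8 area. */
    static int rate_of_8x8_area(const int *rate_4x4, int bottom_right_blk,
                                int blocks_wide) {
      return rate_4x4[bottom_right_blk - blocks_wide - 1] +
             rate_4x4[bottom_right_blk - blocks_wide] +
             rate_4x4[bottom_right_blk - 1] + rate_4x4[bottom_right_blk];
    }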

To enable this experiment, pass '--enable-daala-dist' to configure.
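
For example (illustrative invocation; '--enable-experimental' is assumed to
be required for experiments in this tree, and '--enable-pvq' is optional):

    ./configure --enable-experimental --enable-daala-dist --enable-pvq
    make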

TODO: Significant tuning of parameters is required since the function
originally came from Daala, so most parameters may not work correctly
outside Daala.
It is also critical that chroma distortion is still added to the overall
distortion in AV1's RDO, since Daala's distortion function is applied to
luma only while chroma continues to use MSE.

Change-Id: If35fdd3aec7efe401f351ba1c99891ad57a3d957
diff --git a/av1/common/blockd.c b/av1/common/blockd.c
index 5011e31..b9e5073 100644
--- a/av1/common/blockd.c
+++ b/av1/common/blockd.c
@@ -153,6 +153,44 @@
   }
 }
 
+#if CONFIG_DAALA_DIST
+void av1_foreach_8x8_transformed_block_in_plane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+    foreach_transformed_block_visitor visit,
+    foreach_transformed_block_visitor mi_visit, void *arg) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  // txw_unit/txh_unit give the transform width/height in units of 4x4
+  // blocks, so "step" is the number of 4x4 blocks covered per transform.
+  // The transform size varies per plane; look it up in a common way.
+  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+  const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+  const uint8_t txh_unit = tx_size_high_unit[tx_size];
+  const int step = txw_unit * txh_unit;
+  int i = 0, r, c;
+
+  // If mb_to_right_edge is < 0 we are in a situation in which
+  // the current block size extends into the UMV and we won't
+  // visit the sub blocks that are wholly within the UMV.
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+
+  // Keep track of the row and column of the blocks we use so that we know
+  // if we are in the unrestricted motion border.
+  for (r = 0; r < max_blocks_high; r += txh_unit) {
+    // Skip visiting the sub blocks that are wholly within the UMV.
+    for (c = 0; c < max_blocks_wide; c += txw_unit) {
+      visit(plane, i, r, c, plane_bsize, tx_size, arg);
+      // Call the 8x8 visitor once the last 4x4 block of an 8x8 area is done
+      if ((r & 1) && (c & 1))
+        mi_visit(plane, i, r - 1, c - 1, plane_bsize, TX_8X8, arg);
+      i += step;
+    }
+  }
+}
+#endif
+
 void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
                                    BLOCK_SIZE bsize,
                                    foreach_transformed_block_visitor visit,
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 7642b63..c30786e 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -456,7 +456,7 @@
   const qm_val_t *seg_qmatrix[MAX_SEGMENTS][2][TX_SIZES];
 #endif
 
-#if CONFIG_PVQ
+#if CONFIG_PVQ || CONFIG_DAALA_DIST
   DECLARE_ALIGNED(16, int16_t, pred[MAX_SB_SQUARE]);
   // PVQ: forward transformed predicted image, a reference for PVQ.
   tran_low_t *pvq_ref_coeff;
@@ -972,6 +972,13 @@
     const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
     foreach_transformed_block_visitor visit, void *arg);
 
+#if CONFIG_DAALA_DIST
+void av1_foreach_8x8_transformed_block_in_plane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+    foreach_transformed_block_visitor visit,
+    foreach_transformed_block_visitor mi_visit, void *arg);
+#endif
+
 void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
                                    BLOCK_SIZE bsize,
                                    foreach_transformed_block_visitor visit,
diff --git a/av1/common/pvq.c b/av1/common/pvq.c
index 8f6512d..28de41a 100644
--- a/av1/common/pvq.c
+++ b/av1/common/pvq.c
@@ -23,71 +23,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-/* Quantization matrices for 8x8. For other block sizes, we currently just do
-   resampling. */
-/* Flat quantization, i.e. optimize for PSNR. */
-const int OD_QM8_Q4_FLAT[] = {
-  16, 16, 16, 16, 16, 16, 16, 16,
-  16, 16, 16, 16, 16, 16, 16, 16,
-  16, 16, 16, 16, 16, 16, 16, 16,
-  16, 16, 16, 16, 16, 16, 16, 16,
-  16, 16, 16, 16, 16, 16, 16, 16,
-  16, 16, 16, 16, 16, 16, 16, 16,
-  16, 16, 16, 16, 16, 16, 16, 16,
-  16, 16, 16, 16, 16, 16, 16, 16
-};
-# if 0
-/* M1: MPEG2 matrix for inter (which has a dead zone). */
-const int OD_QM8_Q4[] = {
-  16, 17, 18, 19, 20, 21, 22, 23,
-  17, 18, 19, 20, 21, 22, 23, 24,
-  18, 19, 20, 21, 22, 23, 24, 25,
-  19, 20, 21, 22, 23, 24, 26, 27,
-  20, 21, 22, 23, 25, 26, 27, 28,
-  21, 22, 23, 24, 26, 27, 28, 30,
-  22, 23, 24, 26, 27, 28, 30, 31,
-  23, 24, 25, 27, 28, 30, 31, 33};
-# endif
-# if 0
-/* M2: MPEG2 matrix for intra (no dead zone). */
-const int OD_QM8_Q4[] = {
-  16, 16, 19, 22, 22, 26, 26, 27,
-  16, 16, 22, 22, 26, 27, 27, 29,
-  19, 22, 26, 26, 27, 29, 29, 35,
-  22, 24, 27, 27, 29, 32, 34, 38,
-  26, 27, 29, 29, 32, 35, 38, 46,
-  27, 29, 34, 34, 35, 40, 46, 56,
-  29, 34, 34, 37, 40, 48, 56, 69,
-  34, 37, 38, 40, 48, 58, 69, 83
-};
-# endif
-# if 0
-/* M3: Taken from dump_psnrhvs. */
-const int OD_QM8_Q4[] = {
-  16, 16, 17, 20, 24, 29, 36, 42,
-  16, 17, 17, 19, 22, 26, 31, 37,
-  17, 17, 21, 23, 26, 30, 34, 40,
-  20, 19, 23, 28, 31, 35, 39, 45,
-  24, 22, 26, 31, 36, 41, 46, 51,
-  29, 26, 30, 35, 41, 47, 52, 58,
-  36, 31, 34, 39, 46, 52, 59, 66,
-  42, 37, 40, 45, 51, 58, 66, 73
-};
-# endif
-# if 1
-/* M4: a compromise equal to .5*(M3 + .5*(M2+transpose(M2))) */
-const int OD_QM8_Q4_HVS[] = {
-  16, 16, 18, 21, 24, 28, 32, 36,
-  16, 17, 20, 21, 24, 27, 31, 35,
-  18, 20, 24, 25, 27, 31, 33, 38,
-  21, 21, 25, 28, 30, 34, 37, 42,
-  24, 24, 27, 30, 34, 38, 43, 49,
-  28, 27, 31, 34, 38, 44, 50, 58,
-  32, 31, 33, 37, 43, 50, 58, 68,
-  36, 35, 38, 42, 49, 58, 68, 78
-};
-#endif
-
 /* Imported from encode.c in daala */
 /* These are the PVQ equivalent of quantization matrices, except that
    the values are per-band. */
diff --git a/av1/common/pvq.h b/av1/common/pvq.h
index 2836263..8f48743 100644
--- a/av1/common/pvq.h
+++ b/av1/common/pvq.h
@@ -16,14 +16,15 @@
 # include "generic_code.h"
 # include "odintrin.h"
 
-extern const int OD_QM8_Q4_FLAT[];
-extern const int OD_QM8_Q4_HVS[];
-
 extern const uint16_t EXP_CDF_TABLE[][16];
 extern const uint16_t LAPLACE_OFFSET[];
 
 #define AV1_PVQ_ENABLE_ACTIVITY_MASKING (0)
 
+#if !CONFIG_DAALA_DIST
+#define AV1_PVQ_ENABLE_ACTIVITY_MASKING (0)
+#endif
+
 # define PVQ_MAX_PARTITIONS (1 + 3*(OD_TXSIZES-1))
 
 # define OD_NOREF_ADAPT_SPEED (4)
diff --git a/av1/common/quant_common.c b/av1/common/quant_common.c
index 69d0cc0..6e1b0e9 100644
--- a/av1/common/quant_common.c
+++ b/av1/common/quant_common.c
@@ -11280,3 +11280,62 @@
 };
 
 #endif
+
+#if CONFIG_PVQ || CONFIG_DAALA_DIST
+/* Quantization matrices for 8x8. For other block sizes, we currently just do
+   resampling. */
+/* Flat quantization, i.e. optimize for PSNR. */
+const int OD_QM8_Q4_FLAT[] = { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+                               16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+                               16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+                               16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+                               16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+                               16, 16, 16, 16, 16, 16, 16, 16, 16 };
+#if 0
+/* M1: MPEG2 matrix for inter (which has a dead zone). */
+const int OD_QM8_Q4[] = {
+  16, 17, 18, 19, 20, 21, 22, 23,
+  17, 18, 19, 20, 21, 22, 23, 24,
+  18, 19, 20, 21, 22, 23, 24, 25,
+  19, 20, 21, 22, 23, 24, 26, 27,
+  20, 21, 22, 23, 25, 26, 27, 28,
+  21, 22, 23, 24, 26, 27, 28, 30,
+  22, 23, 24, 26, 27, 28, 30, 31,
+  23, 24, 25, 27, 28, 30, 31, 33};
+#endif
+#if 0
+/* M2: MPEG2 matrix for intra (no dead zone). */
+const int OD_QM8_Q4[] = {
+  16, 16, 19, 22, 22, 26, 26, 27,
+  16, 16, 22, 22, 26, 27, 27, 29,
+  19, 22, 26, 26, 27, 29, 29, 35,
+  22, 24, 27, 27, 29, 32, 34, 38,
+  26, 27, 29, 29, 32, 35, 38, 46,
+  27, 29, 34, 34, 35, 40, 46, 56,
+  29, 34, 34, 37, 40, 48, 56, 69,
+  34, 37, 38, 40, 48, 58, 69, 83
+};
+#endif
+#if 0
+/* M3: Taken from dump_psnrhvs. */
+const int OD_QM8_Q4[] = {
+  16, 16, 17, 20, 24, 29, 36, 42,
+  16, 17, 17, 19, 22, 26, 31, 37,
+  17, 17, 21, 23, 26, 30, 34, 40,
+  20, 19, 23, 28, 31, 35, 39, 45,
+  24, 22, 26, 31, 36, 41, 46, 51,
+  29, 26, 30, 35, 41, 47, 52, 58,
+  36, 31, 34, 39, 46, 52, 59, 66,
+  42, 37, 40, 45, 51, 58, 66, 73
+};
+#endif
+#if 1
+/* M4: a compromise equal to .5*(M3 + .5*(M2+transpose(M2))) */
+const int OD_QM8_Q4_HVS[] = { 16, 16, 18, 21, 24, 28, 32, 36, 16, 17, 20,
+                              21, 24, 27, 31, 35, 18, 20, 24, 25, 27, 31,
+                              33, 38, 21, 21, 25, 28, 30, 34, 37, 42, 24,
+                              24, 27, 30, 34, 38, 43, 49, 28, 27, 31, 34,
+                              38, 44, 50, 58, 32, 31, 33, 37, 43, 50, 58,
+                              68, 36, 35, 38, 42, 49, 58, 68, 78 };
+#endif
+#endif
diff --git a/av1/common/quant_common.h b/av1/common/quant_common.h
index 43833c6..0b7f97d 100644
--- a/av1/common/quant_common.h
+++ b/av1/common/quant_common.h
@@ -98,6 +98,11 @@
 }
 #endif  // CONFIG_NEW_QUANT
 
+#if CONFIG_PVQ || CONFIG_DAALA_DIST
+extern const int OD_QM8_Q4_FLAT[];
+extern const int OD_QM8_Q4_HVS[];
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 69b084f..faade89 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -3285,6 +3285,10 @@
   // TODO(yushin) : activity masking info needs be signaled by a bitstream
   daala_dec->use_activity_masking = AV1_PVQ_ENABLE_ACTIVITY_MASKING;
 
+#if !CONFIG_DAALA_DIST
+  daala_dec->use_activity_masking = 0;
+#endif
+
   if (daala_dec->use_activity_masking)
     daala_dec->qm = OD_HVS_QM;
   else
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 5e29d9a..4678b03 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -202,6 +202,13 @@
   int pvq_speed;
   int pvq_coded;  // Indicates whether pvq_info needs be stored to tokenize
 #endif
+#if CONFIG_DAALA_DIST
+  // Keep rate of each 4x4 block in the current macroblock during RDO
+  // This is needed when using the 8x8 Daala distortion metric during RDO,
+  // because it evaluates distortion in a different order than the underlying
+  // 4x4 blocks are coded.
+  int rate_4x4[256];
+#endif
 };
 
 // Converts block_index for given transform size to index of the block in raster
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 8fdb487..07ab2ee 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -479,7 +479,7 @@
                      TX_SIZE tx_size, int ctx,
                      AV1_XFORM_QUANT xform_quant_idx) {
   MACROBLOCKD *const xd = &x->e_mbd;
-#if !CONFIG_PVQ
+#if !(CONFIG_PVQ || CONFIG_DAALA_DIST)
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
 #else
@@ -504,6 +504,14 @@
 
   FWD_TXFM_PARAM fwd_txfm_param;
 
+#if CONFIG_PVQ || CONFIG_DAALA_DIST
+  uint8_t *dst;
+  int16_t *pred;
+  const int dst_stride = pd->dst.stride;
+  int tx_blk_size;
+  int i, j;
+#endif
+
 #if !CONFIG_PVQ
   const int tx2d_size = tx_size_2d[tx_size];
   QUANT_PARAM qparam;
@@ -523,14 +531,11 @@
 #else
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
-  uint8_t *src, *dst;
-  int16_t *src_int16, *pred;
-  const int src_stride = p->src.stride;
-  const int dst_stride = pd->dst.stride;
-  int tx_blk_size;
-  int i, j;
   int skip = 1;
   PVQ_INFO *pvq_info = NULL;
+  uint8_t *src;
+  int16_t *src_int16;
+  const int src_stride = p->src.stride;
 
   (void)scan_order;
   (void)qcoeff;
@@ -539,10 +544,20 @@
     assert(block < MAX_PVQ_BLOCKS_IN_SB);
     pvq_info = &x->pvq[block][plane];
   }
-  dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
   src = &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
   src_int16 =
       &p->src_int16[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+
+  // transform block size in pixels
+  tx_blk_size = tx_size_wide[tx_size];
+
+  for (j = 0; j < tx_blk_size; j++)
+    for (i = 0; i < tx_blk_size; i++)
+      src_int16[diff_stride * j + i] = src[src_stride * j + i];
+#endif
+
+#if CONFIG_PVQ || CONFIG_DAALA_DIST
+  dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
   pred = &pd->pred[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
 
   // transform block size in pixels
@@ -552,10 +567,10 @@
   // in order to use existing VP10 transform functions
   for (j = 0; j < tx_blk_size; j++)
     for (i = 0; i < tx_blk_size; i++) {
-      src_int16[diff_stride * j + i] = src[src_stride * j + i];
       pred[diff_stride * j + i] = dst[dst_stride * j + i];
     }
 #endif
+
   (void)ctx;
 
   fwd_txfm_param.tx_type = tx_type;
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 3d96700..da412fb 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -55,6 +55,9 @@
 #if CONFIG_PVQ
 #include "av1/encoder/pvq_encoder.h"
 #endif
+#if CONFIG_PVQ || CONFIG_DAALA_DIST
+#include "av1/common/pvq.h"
+#endif
 #if CONFIG_DUAL_FILTER
 #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
 static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = {
@@ -448,6 +451,170 @@
 #endif  // CONFIG_EXT_TX
 };
 
+#if CONFIG_DAALA_DIST
+static int od_compute_var_4x4(od_coeff *x, int stride) {
+  int sum;
+  int s2;
+  int i;
+  sum = 0;
+  s2 = 0;
+  for (i = 0; i < 4; i++) {
+    int j;
+    for (j = 0; j < 4; j++) {
+      int t;
+
+      t = x[i * stride + j];
+      sum += t;
+      s2 += t * t;
+    }
+  }
+  // TODO(yushin) : Check whether any changes are required for high bit depth.
+  return (s2 - (sum * sum >> 4)) >> 4;
+}
+
+static double od_compute_dist_8x8(int qm, int use_activity_masking, od_coeff *x,
+                                  od_coeff *y, int stride) {
+  od_coeff e[8 * 8];
+  od_coeff et[8 * 8];
+  int16_t src[8 * 8];
+  tran_low_t coeff[8 * 8];
+  double sum;
+  int min_var;
+  double mean_var;
+  double var_stat;
+  double activity;
+  double calibration;
+  int i;
+  int j;
+  double vardist;
+  FWD_TXFM_PARAM fwd_txfm_param;
+
+  vardist = 0;
+  OD_ASSERT(qm != OD_FLAT_QM);
+#if 1
+  min_var = INT_MAX;
+  mean_var = 0;
+  for (i = 0; i < 3; i++) {
+    for (j = 0; j < 3; j++) {
+      int varx;
+      int vary;
+      varx = od_compute_var_4x4(x + 2 * i * stride + 2 * j, stride);
+      vary = od_compute_var_4x4(y + 2 * i * stride + 2 * j, stride);
+      min_var = OD_MINI(min_var, varx);
+      mean_var += 1. / (1 + varx);
+      /* The cast to (double) is to avoid an overflow before the sqrt.*/
+      vardist += varx - 2 * sqrt(varx * (double)vary) + vary;
+    }
+  }
+  /* We use a different variance statistic depending on whether activity
+     masking is used, since the harmonic mean appeared slightly worse with
+     masking off. The calibration constant just ensures that we preserve the
+     rate compared to activity=1. */
+  if (use_activity_masking) {
+    calibration = 1.95;
+    var_stat = 9. / mean_var;
+  } else {
+    calibration = 1.62;
+    var_stat = min_var;
+  }
+  /* 1.62 is a calibration constant, 0.25 is a noise floor and 1/6 is the
+     activity masking constant. */
+  activity = calibration * pow(.25 + var_stat, -1. / 6);
+#else
+  activity = 1;
+#endif
+  sum = 0;
+  for (i = 0; i < 8; i++) {
+    for (j = 0; j < 8; j++)
+      e[8 * i + j] = x[i * stride + j] - y[i * stride + j];
+  }
+
+  for (i = 0; i < 8; i++)
+    for (j = 0; j < 8; j++) src[8 * i + j] = e[8 * i + j];
+
+  fwd_txfm_param.tx_type = DCT_DCT;
+  fwd_txfm_param.tx_size = TX_8X8;
+  fwd_txfm_param.lossless = 0;
+
+  fwd_txfm(&src[0], &coeff[0], 8, &fwd_txfm_param);
+
+  for (i = 0; i < 8; i++)
+    for (j = 0; j < 8; j++) et[8 * i + j] = coeff[8 * i + j] >> 3;
+
+  sum = 0;
+  for (i = 0; i < 8; i++)
+    for (j = 0; j < 8; j++)
+      sum += et[8 * i + j] * (double)et[8 * i + j] * 16. /
+             OD_QM8_Q4_HVS[i * 8 + j];
+
+  return activity * activity * (sum + vardist);
+}
+
+// Note: Inputs x and y are in the pixel domain.
+static double od_compute_dist(int qm, int activity_masking, od_coeff *x,
+                              od_coeff *y, int bsize_w, int bsize_h,
+                              int qindex) {
+  int i;
+  double sum;
+  sum = 0;
+
+  (void)qindex;
+
+  assert(bsize_w >= 8 && bsize_h >= 8);
+
+  if (qm == OD_FLAT_QM) {
+    for (i = 0; i < bsize_w * bsize_h; i++) {
+      double tmp;
+      tmp = x[i] - y[i];
+      sum += tmp * tmp;
+    }
+  } else {
+    for (i = 0; i < bsize_h; i += 8) {
+      int j;
+      for (j = 0; j < bsize_w; j += 8) {
+        sum += od_compute_dist_8x8(qm, activity_masking, &x[i * bsize_w + j],
+                                   &y[i * bsize_w + j], bsize_w);
+      }
+    }
+    /* Compensate for the fact that the quantization matrix lowers the
+       distortion value. We tried a half-dozen values and picked the one where
+       we liked the ntt-short1 curves best. The tuning is approximate since
+       the different metrics go in different directions. */
+    /*Start interpolation at coded_quantizer 1.7=f(36) and end it at 1.2=f(47)*/
+    // TODO(yushin): Check whether AV1's qindex works here, replacing daala's
+    // coded_quantizer.
+    /*sum *= qindex >= 47 ? 1.2 :
+        qindex <= 36 ? 1.7 :
+     1.7 + (1.2 - 1.7)*(qindex - 36)/(47 - 36);*/
+  }
+  return sum;
+}
+
+static int64_t av1_daala_dist(const uint8_t *src, int src_stride,
+                              const uint8_t *dst, int dst_stride, int tx_size,
+                              int qm, int use_activity_masking, int qindex) {
+  int i, j;
+  int64_t d;
+  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+  const int bsw = block_size_wide[tx_bsize];
+  const int bsh = block_size_high[tx_bsize];
+  DECLARE_ALIGNED(16, od_coeff, orig[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, rec[MAX_TX_SQUARE]);
+
+  assert(qm == OD_HVS_QM);
+
+  for (j = 0; j < bsh; j++)
+    for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
+
+  for (j = 0; j < bsh; j++)
+    for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i];
+
+  d = (int64_t)od_compute_dist(qm, use_activity_masking, orig, rec, bsw, bsh,
+                               qindex);
+  return d;
+}
+#endif  // #if CONFIG_DAALA_DIST
+
 static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
                                          uint8_t *src, int src_stride,
                                          uint8_t *dst, int dst_stride,
@@ -829,9 +996,10 @@
 
 // TODO(yushin) : Since 4x4 case does not need ssz, better to refactor into
 // a separate function that does not do the extra computations for ssz.
-int64_t av1_block_error2_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
-                           const tran_low_t *ref, intptr_t block_size,
-                           int64_t *ssz) {
+static int64_t av1_block_error2_c(const tran_low_t *coeff,
+                                  const tran_low_t *dqcoeff,
+                                  const tran_low_t *ref, intptr_t block_size,
+                                  int64_t *ssz) {
   int64_t error;
 
   // Use the existing sse codes for calculating distortion of decoded signal:
@@ -1015,7 +1183,15 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  if (cpi->sf.use_transform_domain_distortion) {
+#if CONFIG_DAALA_DIST
+  int qm = OD_HVS_QM;
+  int use_activity_masking = 0;
+#if CONFIG_PVQ
+  use_activity_masking = x->daala_enc.use_activity_masking;
+#endif
+#endif
+
+  if (cpi->sf.use_transform_domain_distortion && !CONFIG_DAALA_DIST) {
     // Transform domain distortion computation is more efficient as it does
     // not involve an inverse transform, but it is less accurate.
     const int buffer_length = tx_size_2d[tx_size];
@@ -1061,7 +1237,17 @@
     assert(cpi != NULL);
     assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
 
-    cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
+#if CONFIG_DAALA_DIST
+    if (plane == 0) {
+      if (bsw >= 8 && bsh >= 8)
+        tmp = av1_daala_dist(src, src_stride, dst, dst_stride, tx_size, qm,
+                             use_activity_masking, x->qindex);
+      else
+        tmp = 0;
+    } else
+#endif
+      cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
+
     *out_sse = (int64_t)tmp * 16;
 
     if (eob) {
@@ -1106,10 +1292,17 @@
 #endif
         inv_txfm_add(dqcoeff, recon, MAX_TX_SIZE, &inv_txfm_param);
       }
-
-      cpi->fn_ptr[tx_bsize].vf(src, src_stride, recon, MAX_TX_SIZE, &tmp);
+#if CONFIG_DAALA_DIST
+      if (plane == 0) {
+        if (bsw >= 8 && bsh >= 8)
+          tmp = av1_daala_dist(src, src_stride, recon, MAX_TX_SIZE, tx_size, qm,
+                               use_activity_masking, x->qindex);
+        else
+          tmp = 0;
+      } else
+#endif
+        cpi->fn_ptr[tx_bsize].vf(src, src_stride, recon, MAX_TX_SIZE, &tmp);
     }
-
     *out_dist = (int64_t)tmp * 16;
   }
 }
@@ -1176,6 +1369,14 @@
   int coeff_ctx = combine_entropy_contexts(*(args->t_above + blk_col),
                                            *(args->t_left + blk_row));
   RD_STATS this_rd_stats;
+#if CONFIG_DAALA_DIST
+  int qm = OD_HVS_QM;
+  int use_activity_masking = 0;
+#if CONFIG_PVQ
+  use_activity_masking = x->daala_enc.use_activity_masking;
+#endif
+#endif
+
   av1_init_rd_stats(&this_rd_stats);
 
   if (args->exit_early) return;
@@ -1186,8 +1387,7 @@
     };
     av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
                            &b_args);
-
-    if (args->cpi->sf.use_transform_domain_distortion) {
+    if (args->cpi->sf.use_transform_domain_distortion && !CONFIG_DAALA_DIST) {
       dist_block(args->cpi, x, plane, block, blk_row, blk_col, tx_size,
                  &this_rd_stats.dist, &this_rd_stats.sse);
     } else {
@@ -1195,7 +1395,6 @@
       // inv_txfm_add, so we can't just call dist_block here.
       const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
       const aom_variance_fn_t variance = args->cpi->fn_ptr[tx_bsize].vf;
-
       const struct macroblock_plane *const p = &x->plane[plane];
       const struct macroblockd_plane *const pd = &xd->plane[plane];
 
@@ -1210,18 +1409,56 @@
                .buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
       const int16_t *diff = &p->src_diff[(blk_row * diff_stride + blk_col)
                                          << tx_size_wide_log2[0]];
-
       unsigned int tmp;
-      this_rd_stats.sse = sum_squares_2d(diff, diff_stride, tx_size);
+
+#if CONFIG_DAALA_DIST
+      if (plane == 0) {
+        const int bsw = block_size_wide[tx_bsize];
+        const int bsh = block_size_high[tx_bsize];
+
+        if (bsw >= 8 && bsh >= 8) {
+          const int16_t *pred = &pd->pred[(blk_row * diff_stride + blk_col)
+                                          << tx_size_wide_log2[0]];
+          int i, j;
+          DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]);
+
+          for (j = 0; j < bsh; j++)
+            for (i = 0; i < bsw; i++)
+              pred8[j * bsw + i] = pred[j * diff_stride + i];
+
+          this_rd_stats.sse =
+              av1_daala_dist(src, src_stride, pred8, bsw, tx_size, qm,
+                             use_activity_masking, x->qindex);
+        } else {
+          this_rd_stats.sse = 0;
+        }
+      } else
+#endif
+      {
+        this_rd_stats.sse = sum_squares_2d(diff, diff_stride, tx_size);
 
 #if CONFIG_AOM_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-        this_rd_stats.sse =
-            ROUND_POWER_OF_TWO(this_rd_stats.sse, (xd->bd - 8) * 2);
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+          this_rd_stats.sse =
+              ROUND_POWER_OF_TWO(this_rd_stats.sse, (xd->bd - 8) * 2);
 #endif  // CONFIG_AOM_HIGHBITDEPTH
+      }
+
       this_rd_stats.sse = this_rd_stats.sse * 16;
 
-      variance(src, src_stride, dst, dst_stride, &tmp);
+#if CONFIG_DAALA_DIST
+      if (plane == 0) {
+        const int bsw = block_size_wide[tx_bsize];
+        const int bsh = block_size_high[tx_bsize];
+
+        if (bsw >= 8 && bsh >= 8)
+          tmp = av1_daala_dist(src, src_stride, dst, dst_stride, tx_size, qm,
+                               use_activity_masking, x->qindex);
+        else
+          tmp = 0;
+      } else
+#endif
+        variance(src, src_stride, dst, dst_stride, &tmp);
       this_rd_stats.dist = (int64_t)tmp * 16;
     }
   } else {
@@ -1269,12 +1506,20 @@
   // TODO(jingning): temporarily enabled only for luma component
   rd = AOMMIN(rd1, rd2);
 
+#if CONFIG_DAALA_DIST
+  if (plane == 0 && tx_size <= TX_4X4) {
+    rd = 0;
+    x->rate_4x4[block] = this_rd_stats.rate;
+  }
+#endif
+
 #if !CONFIG_PVQ
   this_rd_stats.skip &= !x->plane[plane].eobs[block];
 #else
   this_rd_stats.skip &= x->pvq_skip[plane];
 #endif
   av1_merge_rd_stats(&args->rd_stats, &this_rd_stats);
+
   args->this_rd += rd;
 
   if (args->this_rd > args->best_rd) {
@@ -1283,6 +1528,96 @@
   }
 }
 
+#if CONFIG_DAALA_DIST
+static void block_8x8_rd_txfm_daala_dist(int plane, int block, int blk_row,
+                                         int blk_col, BLOCK_SIZE plane_bsize,
+                                         TX_SIZE tx_size, void *arg) {
+  struct rdcost_block_args *args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  // MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  // const AV1_COMMON *cm = &args->cpi->common;
+  int64_t rd1, rd2, rd;
+  RD_STATS this_rd_stats;
+  int qm = OD_HVS_QM;
+  int use_activity_masking = 0;
+
+#if CONFIG_PVQ
+  use_activity_masking = x->daala_enc.use_activity_masking;
+#endif
+  av1_init_rd_stats(&this_rd_stats);
+
+  if (args->exit_early) return;
+
+  {
+    const struct macroblock_plane *const p = &x->plane[plane];
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+    const int src_stride = p->src.stride;
+    const int dst_stride = pd->dst.stride;
+    const int diff_stride = block_size_wide[plane_bsize];
+
+    const uint8_t *src =
+        &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+    const uint8_t *dst =
+        &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+
+    unsigned int tmp;
+
+    int qindex = x->qindex;
+
+    const int16_t *pred =
+        &pd->pred[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+    int i, j;
+    const int tx_blk_size = 1 << (tx_size + 2);
+    DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]);
+
+    for (j = 0; j < tx_blk_size; j++)
+      for (i = 0; i < tx_blk_size; i++)
+        pred8[j * tx_blk_size + i] = pred[j * diff_stride + i];
+
+    this_rd_stats.sse =
+        av1_daala_dist(src, src_stride, pred8, tx_blk_size, tx_size, qm,
+                       use_activity_masking, qindex);
+
+    this_rd_stats.sse = this_rd_stats.sse * 16;
+
+    tmp = av1_daala_dist(src, src_stride, dst, dst_stride, tx_size, qm,
+                         use_activity_masking, qindex);
+
+    this_rd_stats.dist = (int64_t)tmp * 16;
+  }
+
+  rd = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.dist);
+  if (args->this_rd + rd > args->best_rd) {
+    args->exit_early = 1;
+    return;
+  }
+
+  {
+    const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+    // The 8x8 block's rate is the sum of the rates of its four 4x4 blocks.
+    this_rd_stats.rate = x->rate_4x4[block - max_blocks_wide - 1] +
+                         x->rate_4x4[block - max_blocks_wide] +
+                         x->rate_4x4[block - 1] + x->rate_4x4[block];
+  }
+  rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist);
+  rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse);
+
+  rd = AOMMIN(rd1, rd2);
+
+  args->rd_stats.dist += this_rd_stats.dist;
+  args->rd_stats.sse += this_rd_stats.sse;
+
+  args->this_rd += rd;
+
+  if (args->this_rd > args->best_rd) {
+    args->exit_early = 1;
+    return;
+  }
+}
+#endif  // #if CONFIG_DAALA_DIST
+
 static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
                              RD_STATS *rd_stats, int64_t ref_best_rd, int plane,
                              BLOCK_SIZE bsize, TX_SIZE tx_size,
@@ -1307,8 +1642,16 @@
   args.scan_order =
       get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
 
-  av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
-                                         &args);
+#if CONFIG_DAALA_DIST
+  if (plane == 0 &&
+      (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))
+    av1_foreach_8x8_transformed_block_in_plane(
+        xd, bsize, plane, block_rd_txfm, block_8x8_rd_txfm_daala_dist, &args);
+  else
+#endif
+    av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
+                                           &args);
+
   if (args.exit_early) {
     av1_invalid_rd_stats(rd_stats);
   } else {
@@ -2633,8 +2976,9 @@
           cpi, mb, idy, idx, &best_mode, bmode_costs,
           xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r,
           &ry, &d, bsize, tx_size, y_skip, best_rd - total_rd);
+#if !CONFIG_DAALA_DIST
       if (this_rd >= best_rd - total_rd) return INT64_MAX;
-
+#endif
       total_rd += this_rd;
       cost += r;
       total_distortion += d;
@@ -2651,6 +2995,26 @@
   }
   mbmi->mode = mic->bmi[3].as_mode;
 
+#if CONFIG_DAALA_DIST
+  {
+    const struct macroblock_plane *p = &mb->plane[0];
+    const struct macroblockd_plane *pd = &xd->plane[0];
+    const int src_stride = p->src.stride;
+    const int dst_stride = pd->dst.stride;
+    uint8_t *src = p->src.buf;
+    uint8_t *dst = pd->dst.buf;
+    int use_activity_masking = 0;
+    int qm = OD_HVS_QM;
+
+#if CONFIG_PVQ
+    use_activity_masking = mb->daala_enc.use_activity_masking;
+#endif
+    // Daala-defined distortion computed for the block of 8x8 pixels
+    total_distortion = av1_daala_dist(src, src_stride, dst, dst_stride, TX_8X8,
+                                      qm, use_activity_masking, mb->qindex)
+                       << 4;
+  }
+#endif
   // Add in the cost of the transform type
   if (!is_lossless) {
     int rate_tx_type = 0;
@@ -6015,6 +6379,88 @@
   // update the coding decisions
   for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode;
 
+#if CONFIG_DAALA_DIST
+  {
+    const int src_stride = p->src.stride;
+    const int dst_stride = pd->dst.stride;
+    uint8_t *src = p->src.buf;
+    uint8_t pred[8 * 8];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
+    int use_activity_masking = 0;
+    int qm = OD_HVS_QM;
+#if CONFIG_PVQ
+    use_activity_masking = x->daala_enc.use_activity_masking;
+#endif
+
+    for (idy = 0; idy < 2; idy += num_4x4_blocks_high)
+      for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+        int i = idy * 2 + idx;
+        int j, m;
+        uint8_t *dst_init = &pd->dst.buf[(idy * dst_stride + idx) * 4];
+
+        av1_build_inter_predictor_sub8x8(xd, 0, i, idy, idx, mi_row, mi_col);
+        // Save predicted pixels for use later.
+        for (j = 0; j < num_4x4_blocks_high * 4; j++)
+          for (m = 0; m < num_4x4_blocks_wide * 4; m++)
+            pred[(idy * 4 + j) * 8 + idx * 4 + m] =
+                dst_init[j * dst_stride + m];
+
+        // Do xform and quant to get decoded pixels.
+        {
+          const int txb_width = max_block_wide(xd, plane_bsize, 0);
+          const int txb_height = max_block_high(xd, plane_bsize, 0);
+          int idx_, idy_;
+
+          for (idy_ = 0; idy_ < txb_height; idy_++) {
+            for (idx_ = 0; idx_ < txb_width; idx_++) {
+              int block;
+              int coeff_ctx = 0;
+              const tran_low_t *dqcoeff;
+              uint16_t eob;
+              const PLANE_TYPE plane_type = PLANE_TYPE_Y;
+              INV_TXFM_PARAM inv_txfm_param;
+              uint8_t *dst = dst_init + (idy_ * dst_stride + idx_) * 4;
+
+              block = i + (idy_ * 2 + idx_);
+
+              dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+              eob = p->eobs[block];
+
+              av1_xform_quant(cm, x, 0, block, idy + (i >> 1), idx + (i & 0x01),
+                              BLOCK_8X8, TX_4X4, coeff_ctx, AV1_XFORM_QUANT_FP);
+
+              inv_txfm_param.tx_type =
+                  get_tx_type(plane_type, xd, block, TX_4X4);
+              inv_txfm_param.tx_size = TX_4X4;
+              inv_txfm_param.eob = eob;
+              inv_txfm_param.lossless = xd->lossless[mbmi->segment_id];
+
+#if CONFIG_PVQ
+              {
+                int i2, j2;
+
+                for (j2 = 0; j2 < 4; j2++)
+                  for (i2 = 0; i2 < 4; i2++) dst[j2 * dst_stride + i2] = 0;
+              }
+#endif
+              inv_txfm_add(dqcoeff, dst, dst_stride, &inv_txfm_param);
+            }
+          }
+        }
+      }
+
+    // Daala-defined distortion computed for 1) predicted pixels and
+    // 2) decoded pixels of the block of 8x8 pixels
+    bsi->sse = av1_daala_dist(src, src_stride, pred, 8, TX_8X8, qm,
+                              use_activity_masking, x->qindex)
+               << 4;
+
+    bsi->d = av1_daala_dist(src, src_stride, pd->dst.buf, dst_stride, TX_8X8,
+                            qm, use_activity_masking, x->qindex)
+             << 4;
+  }
+#endif  // CONFIG_DAALA_DIST
+
   if (bsi->segment_rd > best_rd) return INT64_MAX;
   /* set it to the best */
   for (idx = 0; idx < 4; idx++) {
diff --git a/configure b/configure
index 41a984a..4cd99f6 100755
--- a/configure
+++ b/configure
@@ -296,6 +296,7 @@
     coef_interleave
     entropy_stats
     masked_tx
+    daala_dist
 "
 CONFIG_LIST="
     dependency_tracking