Fix daala-dist for cb4x4

For sub8x8 partitions, av1_daala_dist() is now applied in
rd_pick_partition() instead of in the sub8x8 mode decision functions.

BD-rate change due to daala-dist, with '--disable-var-tx'
(AWCY, objective-1-fast, high delay mode):

   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
15.1558 | 12.9585 | 14.4662 |  -3.8651 | -1.7102 | -9.2956 |    10.8686

In MSE probe mode:

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0429 |  0.0435 |  0.1651 |  -0.0415 | 0.0850 |  0.0122 |     0.0546

Change-Id: I3b2ea916d41c48e433eb641adf44552e4725c198
diff --git a/av1/common/blockd.c b/av1/common/blockd.c
index 602cef4..5dcd499 100644
--- a/av1/common/blockd.c
+++ b/av1/common/blockd.c
@@ -179,15 +179,15 @@
 #endif
 
 #if CONFIG_DAALA_DIST
-void av1_foreach_8x8_transformed_block_in_plane(
-    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+void av1_foreach_8x8_transformed_block_in_yplane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
     foreach_transformed_block_visitor visit,
     foreach_transformed_block_visitor mi_visit, void *arg) {
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[0];
   // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
   // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
   // transform size varies per plane, look it up in a common way.
-  const TX_SIZE tx_size = get_tx_size(plane, xd);
+  const TX_SIZE tx_size = get_tx_size(0, xd);
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
   const uint8_t txw_unit = tx_size_wide_unit[tx_size];
   const uint8_t txh_unit = tx_size_high_unit[tx_size];
@@ -197,18 +197,24 @@
   // If mb_to_right_edge is < 0 we are in a situation in which
   // the current block size extends into the UMV and we won't
   // visit the sub blocks that are wholly within the UMV.
-  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
-  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+  const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+  const int skip_check_r = tx_size_high[tx_size] == 8 ? 1 : 0;
+  const int skip_check_c = tx_size_wide[tx_size] == 8 ? 1 : 0;
+
+  assert(plane_bsize >= BLOCK_8X8);
+  assert(tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4);
 
   // Keep track of the row and column of the blocks we use so that we know
   // if we are in the unrestricted motion border.
   for (r = 0; r < max_blocks_high; r += txh_unit) {
     // Skip visiting the sub blocks that are wholly within the UMV.
     for (c = 0; c < max_blocks_wide; c += txw_unit) {
-      visit(plane, i, r, c, plane_bsize, tx_size, arg);
-      // Call whenever each 8x8 block is done
-      if ((r & 1) && (c & 1))
-        mi_visit(plane, i, r - 1, c - 1, plane_bsize, TX_8X8, arg);
+      visit(0, i, r, c, plane_bsize, tx_size, arg);
+      // Call whenever each 8x8 tx block is done
+      if (((r & txh_unit) || skip_check_r) && ((c & txw_unit) || skip_check_c))
+        mi_visit(0, i, r - (1 - skip_check_r) * txh_unit,
+                 c - (1 - skip_check_c) * txw_unit, plane_bsize, tx_size, arg);
       i += step;
     }
   }
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 0022907..f679a60 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -310,6 +310,9 @@
   int64_t rdcost;
   int64_t sse;
   int skip;  // sse should equal to dist when skip == 1
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+  int64_t dist_y;
+#endif
 #if CONFIG_RD_DEBUG
   int txb_coeff_cost[MAX_MB_PLANE];
 #if CONFIG_VAR_TX
@@ -1164,8 +1167,8 @@
 #endif
 
 #if CONFIG_DAALA_DIST
-void av1_foreach_8x8_transformed_block_in_plane(
-    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+void av1_foreach_8x8_transformed_block_in_yplane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
     foreach_transformed_block_visitor visit,
     foreach_transformed_block_visitor mi_visit, void *arg);
 #endif
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 12dbef9..47bf4f7 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -209,8 +209,11 @@
   // This is needed when using the 8x8 Daala distortion metric during RDO,
   // because it evaluates distortion in a different order than the underlying
   // 4x4 blocks are coded.
-  int rate_4x4[256];
-#endif
+  int rate_4x4[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+#if CONFIG_CB4X4
+  DECLARE_ALIGNED(16, uint8_t, decoded_8x8[8 * 8]);
+#endif  // CONFIG_CB4X4
+#endif  // CONFIG_DAALA_DIST
 #if CONFIG_CFL
   // Whether luma needs to be stored during RDO.
   int cfl_store_y;
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 0465ba8..0c46d03 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -1281,6 +1281,29 @@
   return av1_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
 }
 
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+// Copies the bw x bh pixels of a sub8x8 block from the plane-0 dst buffer
+// into the matching quadrant of dst8x8, an 8x8 scratch block with stride 8.
+static void daala_dist_set_sub8x8_dst(MACROBLOCK *const x, uint8_t *dst8x8,
+                                      BLOCK_SIZE bsize, int bw, int bh,
+                                      int mi_row, int mi_col) {
+  const struct macroblockd_plane *const luma = &x->e_mbd.plane[0];
+  const int stride = luma->dst.stride;
+  const uint8_t *const recon = luma->dst.buf;
+  // Odd mi_row selects the lower half (4 rows down), odd mi_col the right
+  // half (4 columns over) of the 8x8 scratch block.
+  const int quad_offset = ((mi_row & 1) * 8 + (mi_col & 1)) << 2;
+  int row, col;
+
+  assert(bsize < BLOCK_8X8);
+  if (bsize >= BLOCK_8X8) return;
+
+  for (row = 0; row < bh; ++row)
+    for (col = 0; col < bw; ++col)
+      dst8x8[quad_offset + row * 8 + col] = recon[row * stride + col];
+}
+#endif
+
 static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
                              MACROBLOCK *const x, int mi_row, int mi_col,
                              RD_STATS *rd_cost,
@@ -3576,9 +3599,29 @@
 #if CONFIG_SUPERTX
           sum_rate_nocoef += this_rate_nocoef;
 #endif  // CONFIG_SUPERTX
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+          sum_rdc.dist_y += this_rdc.dist_y;
+#endif
         }
       }
       reached_last_index = (idx == 4);
+
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+      if (reached_last_index && sum_rdc.rdcost != INT64_MAX &&
+          bsize == BLOCK_8X8) {
+        int use_activity_masking = 0;
+        int64_t daala_dist;
+        const int src_stride = x->plane[0].src.stride;
+        daala_dist = av1_daala_dist(x->plane[0].src.buf - 4 * src_stride - 4,
+                                    src_stride, x->decoded_8x8, 8, 8, 8, 1,
+                                    use_activity_masking, x->qindex)
+                     << 4;
+        sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + daala_dist;
+        sum_rdc.rdcost =
+            RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+      }
+#endif  // CONFIG_DAALA_DIST && CONFIG_CB4X4
+
 #if CONFIG_SUPERTX
       if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && reached_last_index) {
         TX_SIZE supertx_size = max_txsize_lookup[bsize];
@@ -3716,6 +3759,16 @@
                        subsize, &pc_tree->horizontal[1],
                        best_rdc.rdcost - sum_rdc.rdcost);
 #endif  // CONFIG_SUPERTX
+
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+      if (this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
+        update_state(cpi, td, &pc_tree->horizontal[1], mi_row + mi_step, mi_col,
+                     subsize, DRY_RUN_NORMAL);
+        encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row + mi_step, mi_col,
+                          subsize, NULL);
+      }
+#endif  // CONFIG_DAALA_DIST && CONFIG_CB4X4
+
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
 #if CONFIG_SUPERTX
@@ -3728,7 +3781,24 @@
 #if CONFIG_SUPERTX
         sum_rate_nocoef += this_rate_nocoef;
 #endif  // CONFIG_SUPERTX
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+        sum_rdc.dist_y += this_rdc.dist_y;
+#endif
       }
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+      if (sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
+        int use_activity_masking = 0;
+        int64_t daala_dist;
+        const int src_stride = x->plane[0].src.stride;
+        daala_dist = av1_daala_dist(x->plane[0].src.buf - 4 * src_stride,
+                                    src_stride, x->decoded_8x8, 8, 8, 8, 1,
+                                    use_activity_masking, x->qindex)
+                     << 4;
+        sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + daala_dist;
+        sum_rdc.rdcost =
+            RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+      }
+#endif  // CONFIG_DAALA_DIST && CONFIG_CB4X4
     }
 
 #if CONFIG_SUPERTX
@@ -3863,6 +3933,16 @@
                        subsize, &pc_tree->vertical[1],
                        best_rdc.rdcost - sum_rdc.rdcost);
 #endif  // CONFIG_SUPERTX
+
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+      if (this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
+        update_state(cpi, td, &pc_tree->vertical[1], mi_row, mi_col + mi_step,
+                     subsize, DRY_RUN_NORMAL);
+        encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col + mi_step,
+                          subsize, NULL);
+      }
+#endif  // CONFIG_DAALA_DIST && CONFIG_CB4X4
+
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
 #if CONFIG_SUPERTX
@@ -3875,7 +3955,24 @@
 #if CONFIG_SUPERTX
         sum_rate_nocoef += this_rate_nocoef;
 #endif  // CONFIG_SUPERTX
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+        sum_rdc.dist_y += this_rdc.dist_y;
+#endif
       }
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+      if (sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
+        int use_activity_masking = 0;
+        int64_t daala_dist;
+        const int src_stride = x->plane[0].src.stride;
+        daala_dist =
+            av1_daala_dist(x->plane[0].src.buf - 4, src_stride, x->decoded_8x8,
+                           8, 8, 8, 1, use_activity_masking, x->qindex)
+            << 4;
+        sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + daala_dist;
+        sum_rdc.rdcost =
+            RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+      }
+#endif  // CONFIG_DAALA_DIST && CONFIG_CB4X4
     }
 #if CONFIG_SUPERTX
     if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) {
@@ -4031,6 +4128,14 @@
   x->cfl_store_y = 0;
 #endif
 
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
+      bsize == BLOCK_4X4 && pc_tree->index == 3) {
+    encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+              pc_tree, NULL);
+  }
+#endif  // CONFIG_DAALA_DIST && CONFIG_CB4X4
+
   if (bsize == cm->sb_size) {
 #if !CONFIG_PVQ && !CONFIG_LV_MAP
     assert(tp_orig < *tp || (tp_orig == *tp && xd->mi[0]->mbmi.skip));
@@ -5517,6 +5622,13 @@
 #endif
   }
 
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+  if (bsize < BLOCK_8X8) {
+    daala_dist_set_sub8x8_dst(x, x->decoded_8x8, bsize, block_size_wide[bsize],
+                              block_size_high[bsize], mi_row, mi_col);
+  }
+#endif
+
   if (!dry_run) {
 #if CONFIG_VAR_TX
     TX_SIZE tx_size =
diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index d22789f..5c3eee4 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h
@@ -361,6 +361,9 @@
   rd_stats->rdcost = 0;
   rd_stats->sse = 0;
   rd_stats->skip = 1;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+  rd_stats->dist_y = 0;
+#endif
 #if CONFIG_RD_DEBUG
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     rd_stats->txb_coeff_cost[plane] = 0;
@@ -385,6 +388,9 @@
   rd_stats->rdcost = INT64_MAX;
   rd_stats->sse = INT64_MAX;
   rd_stats->skip = 0;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+  rd_stats->dist_y = INT64_MAX;
+#endif
 #if CONFIG_RD_DEBUG
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     rd_stats->txb_coeff_cost[plane] = INT_MAX;
@@ -409,6 +415,9 @@
   rd_stats_dst->dist += rd_stats_src->dist;
   rd_stats_dst->sse += rd_stats_src->sse;
   rd_stats_dst->skip &= rd_stats_src->skip;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+  rd_stats_dst->dist_y += rd_stats_src->dist_y;
+#endif
 #if CONFIG_RD_DEBUG
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane];
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 39cd27d..b23acce 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -575,10 +575,9 @@
   return sum;
 }
 
-static int64_t av1_daala_dist(const uint8_t *src, int src_stride,
-                              const uint8_t *dst, int dst_stride, int bsw,
-                              int bsh, int qm, int use_activity_masking,
-                              int qindex) {
+int64_t av1_daala_dist(const uint8_t *src, int src_stride, const uint8_t *dst,
+                       int dst_stride, int bsw, int bsh, int qm,
+                       int use_activity_masking, int qindex) {
   int i, j;
   int64_t d;
   DECLARE_ALIGNED(16, od_coeff, orig[MAX_TX_SQUARE]);
@@ -1577,7 +1576,7 @@
   rd = AOMMIN(rd1, rd2);
 
 #if CONFIG_DAALA_DIST
-  if (plane == 0 &&
+  if (plane == 0 && plane_bsize >= BLOCK_8X8 &&
       (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) {
     this_rd_stats.dist = 0;
     this_rd_stats.sse = 0;
@@ -1615,6 +1614,9 @@
   int use_activity_masking = 0;
 
   (void)tx_size;
+
+  assert(plane == 0);
+  assert(plane_bsize >= BLOCK_8X8);
 #if CONFIG_PVQ
   use_activity_masking = x->daala_enc.use_activity_masking;
 #endif  // CONFIG_PVQ
@@ -1674,10 +1676,15 @@
 
   {
     const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+    const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+    const uint8_t txh_unit = tx_size_high_unit[tx_size];
+    const int step = txw_unit * txh_unit;
+    int offset_h = tx_size_high_unit[TX_4X4];
     // The rate of the current 8x8 block is the sum of four 4x4 blocks in it.
-    this_rd_stats.rate = x->rate_4x4[block - max_blocks_wide - 1] +
-                         x->rate_4x4[block - max_blocks_wide] +
-                         x->rate_4x4[block - 1] + x->rate_4x4[block];
+    this_rd_stats.rate =
+        x->rate_4x4[block - max_blocks_wide * offset_h - step] +
+        x->rate_4x4[block - max_blocks_wide * offset_h] +
+        x->rate_4x4[block - step] + x->rate_4x4[block];
   }
   rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist);
   rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse);
@@ -1714,10 +1721,10 @@
   av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
 
 #if CONFIG_DAALA_DIST
-  if (plane == 0 &&
+  if (plane == 0 && bsize >= BLOCK_8X8 &&
       (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))
-    av1_foreach_8x8_transformed_block_in_plane(
-        xd, bsize, plane, block_rd_txfm, block_8x8_rd_txfm_daala_dist, &args);
+    av1_foreach_8x8_transformed_block_in_yplane(
+        xd, bsize, block_rd_txfm, block_8x8_rd_txfm_daala_dist, &args);
   else
 #endif  // CONFIG_DAALA_DIST
     av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
@@ -9498,6 +9505,9 @@
       rd_cost->dist = dist_y + dist_uv;
     }
     rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+    rd_cost->dist_y = dist_y;
+#endif
   } else {
     rd_cost->rate = INT_MAX;
   }
@@ -10234,6 +10244,10 @@
     int compmode_cost = 0;
     int rate2 = 0, rate_y = 0, rate_uv = 0;
     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+    int64_t distortion2_y = 0;
+    int64_t total_sse_y = INT64_MAX;
+#endif
     int skippable = 0;
     int this_skip2 = 0;
     int64_t total_sse = INT64_MAX;
@@ -10575,6 +10589,9 @@
       if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+      if (bsize < BLOCK_8X8) distortion2_y = distortion_y;
+#endif
     } else {
       int_mv backup_ref_mv[2];
 
@@ -10668,6 +10685,9 @@
         total_sse = rd_stats.sse;
         rate_y = rd_stats_y.rate;
         rate_uv = rd_stats_uv.rate;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+        if (bsize < BLOCK_8X8) distortion2_y = rd_stats_y.dist;
+#endif
       }
 
 // TODO(jingning): This needs some refactoring to improve code quality
@@ -10877,6 +10897,12 @@
             tmp_ref_rd = tmp_alt_rd;
             backup_mbmi = *mbmi;
             backup_skip = x->skip;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+            if (bsize < BLOCK_8X8) {
+              total_sse_y = tmp_rd_stats_y.sse;
+              distortion2_y = tmp_rd_stats_y.dist;
+            }
+#endif
 #if CONFIG_VAR_TX
             for (i = 0; i < MAX_MB_PLANE; ++i)
               memcpy(x->blk_skip_drl[i], x->blk_skip[i],
@@ -10950,6 +10976,9 @@
           this_skip2 = 1;
           rate_y = 0;
           rate_uv = 0;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+          if (bsize < BLOCK_8X8) distortion2_y = total_sse_y;
+#endif
         }
       } else {
         // Add in the cost of the no skip flag.
@@ -11039,7 +11068,9 @@
         best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd),
                                             this_skip2 || skippable);
         best_rate_uv = rate_uv;
-
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+        if (bsize < BLOCK_8X8) rd_cost->dist_y = distortion2_y;
+#endif
 #if CONFIG_VAR_TX
         for (i = 0; i < MAX_MB_PLANE; ++i)
           memcpy(ctx->blk_skip[i], x->blk_skip[i],
@@ -11167,6 +11198,9 @@
       rd_cost->rate +=
           (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv);
       rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+      if (bsize < BLOCK_8X8) rd_cost->dist_y = rd_stats_y.dist;
+#endif
       rd_cost->rdcost =
           RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
       best_skip2 = skip_blk;
@@ -11686,7 +11720,9 @@
   rd_cost->rate = rate2;
   rd_cost->dist = distortion2;
   rd_cost->rdcost = this_rd;
-
+#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+  if (bsize < BLOCK_8X8) rd_cost->dist_y = distortion2;
+#endif
   if (this_rd >= best_rd_so_far) {
     rd_cost->rate = INT_MAX;
     rd_cost->rdcost = INT64_MAX;
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index a7053b2..6563773 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -62,6 +62,12 @@
                     TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse,
                     OUTPUT_STATUS output_status);
 
+#if CONFIG_DAALA_DIST
+int64_t av1_daala_dist(const uint8_t *src, int src_stride, const uint8_t *dst,
+                       int dst_stride, int bsw, int bsh, int qm,
+                       int use_activity_masking, int qindex);
+#endif
+
 #if !CONFIG_PVQ || CONFIG_VAR_TX
 int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
                     int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order,