Fix daala-dist in sub8x8 inter rdo without cb4x4

Fixes several bugs in daala-dist, sub8x8 inter mode
with --disable-cb4x4.

Changes on BD-Rate are:
(objective-1-fast, high delay mode, --disable-cb4x4):

   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
14.0337 | 13.0132 | 14.6055 |  -3.2030 | -2.2092 | -9.3902 |    10.3399

Additionally, the BD-Rate change with daala-dist in enforced MSE mode,
which demonstrates the correctness of the implementation, is

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0048 |  0.0789 |  0.0209 |   0.0443 | 0.0137 |  0.0411 |     0.0441

Change-Id: I68ec90c6072aa3564522c2b8e87b62a998cec47c
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index d60976b..1091683 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -603,13 +603,11 @@
 }
 
 static int64_t av1_daala_dist(const uint8_t *src, int src_stride,
-                              const uint8_t *dst, int dst_stride, int tx_size,
-                              int qm, int use_activity_masking, int qindex) {
+                              const uint8_t *dst, int dst_stride, int bsw,
+                              int bsh, int qm, int use_activity_masking,
+                              int qindex) {
   int i, j;
   int64_t d;
-  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
-  const int bsw = block_size_wide[tx_bsize];
-  const int bsh = block_size_high[tx_bsize];
   DECLARE_ALIGNED(16, od_coeff, orig[MAX_TX_SQUARE]);
   DECLARE_ALIGNED(16, od_coeff, rec[MAX_TX_SQUARE]);
 
@@ -1366,13 +1364,15 @@
                     OUTPUT_STATUS output_status) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
 #if CONFIG_DAALA_DIST
   int qm = OD_HVS_QM;
   int use_activity_masking = 0;
 #if CONFIG_PVQ
   use_activity_masking = x->daala_enc.use_activity_masking;
 #endif  // CONFIG_PVQ
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+#else   // CONFIG_DAALA_DIST
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
 #endif  // CONFIG_DAALA_DIST
 
   if (cpi->sf.use_transform_domain_distortion && !CONFIG_DAALA_DIST) {
@@ -1430,26 +1430,23 @@
     assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
 
 #if CONFIG_DAALA_DIST
-    if (plane == 0) {
-      if (bsw >= 8 && bsh >= 8) {
-        if (output_status == OUTPUT_HAS_DECODED_PIXELS) {
-          const int pred_stride = block_size_wide[plane_bsize];
-          const int16_t *pred = &pd->pred[(blk_row * pred_stride + blk_col)
-                                          << tx_size_wide_log2[0]];
-          int i, j;
-          DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]);
+    if (plane == 0 && bsw >= 8 && bsh >= 8) {
+      if (output_status == OUTPUT_HAS_DECODED_PIXELS) {
+        const int pred_stride = block_size_wide[plane_bsize];
+        const int pred_idx = (blk_row * pred_stride + blk_col)
+                             << tx_size_wide_log2[0];
+        const int16_t *pred = &pd->pred[pred_idx];
+        int i, j;
+        DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]);
 
-          for (j = 0; j < bsh; j++)
-            for (i = 0; i < bsw; i++)
-              pred8[j * bsw + i] = pred[j * pred_stride + i];
-          tmp = av1_daala_dist(src, src_stride, pred8, bsw, tx_size, qm,
-                               use_activity_masking, x->qindex);
-        } else {
-          tmp = av1_daala_dist(src, src_stride, dst, dst_stride, tx_size, qm,
-                               use_activity_masking, x->qindex);
-        }
+        for (j = 0; j < bsh; j++)
+          for (i = 0; i < bsw; i++)
+            pred8[j * bsw + i] = pred[j * pred_stride + i];
+        tmp = av1_daala_dist(src, src_stride, pred8, bsw, bsw, bsh, qm,
+                             use_activity_masking, x->qindex);
       } else {
-        tmp = 0;
+        tmp = av1_daala_dist(src, src_stride, dst, dst_stride, bsw, bsh, qm,
+                             use_activity_masking, x->qindex);
       }
     } else
 #endif  // CONFIG_DAALA_DIST
@@ -1470,19 +1467,13 @@
     if (eob) {
       if (output_status == OUTPUT_HAS_DECODED_PIXELS) {
 #if CONFIG_DAALA_DIST
-        if (plane == 0) {
-          if (bsw >= 8 && bsh >= 8)
-            tmp = av1_daala_dist(src, src_stride, dst, dst_stride, tx_size, qm,
-                                 use_activity_masking, x->qindex);
-          else
-            tmp = 0;
-        } else {
+        if (plane == 0 && bsw >= 8 && bsh >= 8)
+          tmp = av1_daala_dist(src, src_stride, dst, dst_stride, bsw, bsh, qm,
+                               use_activity_masking, x->qindex);
+        else
 #endif  // CONFIG_DAALA_DIST
           tmp = pixel_sse(cpi, xd, plane, src, src_stride, dst, dst_stride,
                           blk_row, blk_col, plane_bsize, tx_bsize);
-#if CONFIG_DAALA_DIST
-        }
-#endif  // CONFIG_DAALA_DIST
       } else {
 #if CONFIG_HIGHBITDEPTH
         uint8_t *recon;
@@ -1519,13 +1510,24 @@
                                     MAX_TX_SIZE, eob);
 
 #if CONFIG_DAALA_DIST
-        if (plane == 0) {
-          if (bsw >= 8 && bsh >= 8)
-            tmp = av1_daala_dist(src, src_stride, recon, MAX_TX_SIZE, tx_size,
-                                 qm, use_activity_masking, x->qindex);
-          else
-            tmp = 0;
+        if (plane == 0 && bsw >= 8 && bsh >= 8) {
+          tmp = av1_daala_dist(src, src_stride, recon, MAX_TX_SIZE, bsw, bsh,
+                               qm, use_activity_masking, x->qindex);
         } else {
+          if (plane == 0) {
+            // Save decoded pixels for inter block in pd->pred to avoid
+            // block_8x8_rd_txfm_daala_dist() need to produce them
+            // by calling av1_inverse_transform_block() again.
+            const int pred_stride = block_size_wide[plane_bsize];
+            const int pred_idx = (blk_row * pred_stride + blk_col)
+                                 << tx_size_wide_log2[0];
+            int16_t *pred = &pd->pred[pred_idx];
+            int i, j;
+
+            for (j = 0; j < bsh; j++)
+              for (i = 0; i < bsw; i++)
+                pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i];
+          }
 #endif  // CONFIG_DAALA_DIST
           tmp = pixel_sse(cpi, xd, plane, src, src_stride, recon, MAX_TX_SIZE,
                           blk_row, blk_col, plane_bsize, tx_bsize);
@@ -1621,7 +1623,10 @@
   rd = AOMMIN(rd1, rd2);
 
 #if CONFIG_DAALA_DIST
-  if (plane == 0 && tx_size <= TX_4X4) {
+  if (plane == 0 &&
+      (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) {
+    this_rd_stats.dist = 0;
+    this_rd_stats.sse = 0;
     rd = 0;
     x->rate_4x4[block] = this_rd_stats.rate;
   }
@@ -1649,13 +1654,13 @@
   struct rdcost_block_args *args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  // MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  // const AV1_COMMON *cm = &args->cpi->common;
-  int64_t rd1, rd2, rd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  int64_t rd, rd1, rd2;
   RD_STATS this_rd_stats;
   int qm = OD_HVS_QM;
   int use_activity_masking = 0;
 
+  (void)tx_size;
 #if CONFIG_PVQ
   use_activity_masking = x->daala_enc.use_activity_masking;
 #endif  // CONFIG_PVQ
@@ -1665,7 +1670,7 @@
 
   {
     const struct macroblock_plane *const p = &x->plane[plane];
-    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    struct macroblockd_plane *const pd = &xd->plane[plane];
 
     const int src_stride = p->src.stride;
     const int dst_stride = pd->dst.stride;
@@ -1676,30 +1681,35 @@
     const uint8_t *dst =
         &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
 
-    unsigned int tmp;
-
+    unsigned int tmp1, tmp2;
     int qindex = x->qindex;
-
-    const int16_t *pred =
-        &pd->pred[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+    const int pred_stride = block_size_wide[plane_bsize];
+    const int pred_idx = (blk_row * pred_stride + blk_col)
+                         << tx_size_wide_log2[0];
+    int16_t *pred = &pd->pred[pred_idx];
     int i, j;
-    const int tx_blk_size = 1 << (tx_size + 2);
-    DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]);
+    const int tx_blk_size = 8;
+
+    DECLARE_ALIGNED(16, uint8_t, pred8[8 * 8]);
 
     for (j = 0; j < tx_blk_size; j++)
       for (i = 0; i < tx_blk_size; i++)
         pred8[j * tx_blk_size + i] = pred[j * diff_stride + i];
 
-    this_rd_stats.sse =
-        av1_daala_dist(src, src_stride, pred8, tx_blk_size, tx_size, qm,
-                       use_activity_masking, qindex);
+    tmp1 = av1_daala_dist(src, src_stride, pred8, tx_blk_size, 8, 8, qm,
+                          use_activity_masking, qindex);
+    tmp2 = av1_daala_dist(src, src_stride, dst, dst_stride, 8, 8, qm,
+                          use_activity_masking, qindex);
 
-    this_rd_stats.sse = this_rd_stats.sse * 16;
-
-    tmp = av1_daala_dist(src, src_stride, dst, dst_stride, tx_size, qm,
-                         use_activity_masking, qindex);
-
-    this_rd_stats.dist = (int64_t)tmp * 16;
+    if (!is_inter_block(mbmi)) {
+      this_rd_stats.sse = (int64_t)tmp1 * 16;
+      this_rd_stats.dist = (int64_t)tmp2 * 16;
+    } else {
+      // For inter mode, the decoded pixels are provided in pd->pred,
+      // while the predicted pixels are in dst.
+      this_rd_stats.sse = (int64_t)tmp2 * 16;
+      this_rd_stats.dist = (int64_t)tmp1 * 16;
+    }
   }
 
   rd = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.dist);
@@ -1717,7 +1727,6 @@
   }
   rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist);
   rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse);
-
   rd = AOMMIN(rd1, rd2);
 
   args->rd_stats.dist += this_rd_stats.dist;
@@ -3169,7 +3178,7 @@
     use_activity_masking = mb->daala_enc.use_activity_masking;
 #endif  // CONFIG_PVQ
     // Daala-defined distortion computed for the block of 8x8 pixels
-    total_distortion = av1_daala_dist(src, src_stride, dst, dst_stride, TX_8X8,
+    total_distortion = av1_daala_dist(src, src_stride, dst, dst_stride, 8, 8,
                                       qm, use_activity_masking, mb->qindex)
                        << 4;
   }
@@ -6558,84 +6567,125 @@
   for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode;
 
 #if CONFIG_DAALA_DIST
+  // Compute prediction (i.e. skip) and decoded distortion by daala-distortion.
   {
     const int src_stride = p->src.stride;
     const int dst_stride = pd->dst.stride;
     uint8_t *src = p->src.buf;
-    uint8_t pred[8 * 8];
+    uint8_t *dst = pd->dst.buf;
     const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
-    int use_activity_masking = 0;
-    int qm = OD_HVS_QM;
+    const int use_activity_masking = 0;
+    const int qm = OD_HVS_QM;
+    const int bsw = block_size_wide[plane_bsize];
+    const int bsh = block_size_high[plane_bsize];
+    int64_t rd1, rd2;
+    int64_t daala_sse, daala_dist;
+    TX_SIZE tx_size = mbmi->tx_size;
+
+#if CONFIG_HIGHBITDEPTH
+    uint8_t *recon_8x8;
+    DECLARE_ALIGNED(16, uint16_t, recon16[8 * 8]);
+
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      recon_8x8 = CONVERT_TO_BYTEPTR(recon16);
+    else
+      recon_8x8 = (uint8_t *)recon16;
+#else
+    DECLARE_ALIGNED(16, uint8_t, recon_8x8[8 * 8]);
+#endif  // CONFIG_HIGHBITDEPTH
+
 #if CONFIG_PVQ
     use_activity_masking = x->daala_enc.use_activity_masking;
 #endif  // CONFIG_PVQ
 
-    for (idy = 0; idy < 2; idy += num_4x4_blocks_high)
+    // For each of sub8x8 prediction block in a 8x8 block
+    for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
       for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
         int i = idy * 2 + idx;
-        int j, m;
-        uint8_t *dst_init = &pd->dst.buf[(idy * dst_stride + idx) * 4];
+        const uint8_t *const src_sub8x8 =
+            src + av1_raster_block_offset(BLOCK_8X8, i, p->src.stride);
+        uint8_t *const dst_sub8x8 =
+            dst + av1_raster_block_offset(BLOCK_8X8, i, pd->dst.stride);
+        uint8_t *recon_sub8x8 = recon_8x8 + (idy * 8 + idx) * 4;
+        const int txb_width = max_block_wide(xd, plane_bsize, 0);
+        const int txb_height = max_block_high(xd, plane_bsize, 0);
+        int idx_, idy_;
 
         av1_build_inter_predictor_sub8x8(xd, 0, i, idy, idx, mi_row, mi_col);
-        // Save predicted pixels for use later.
-        for (j = 0; j < num_4x4_blocks_high * 4; j++)
-          for (m = 0; m < num_4x4_blocks_wide * 4; m++)
-            pred[(idy * 4 + j) * 8 + idx * 4 + m] =
-                dst_init[j * dst_stride + m];
+#if CONFIG_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          aom_highbd_subtract_block(
+              height, width,
+              av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
+              src_sub8x8, p->src.stride, dst_sub8x8, pd->dst.stride, xd->bd);
+        } else {
+          aom_subtract_block(
+              height, width,
+              av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
+              src_sub8x8, p->src.stride, dst_sub8x8, pd->dst.stride);
+        }
+#else
+        aom_subtract_block(
+            bsh, bsw, av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+            8, src_sub8x8, p->src.stride, dst_sub8x8, pd->dst.stride);
+#endif  // CONFIG_HIGHBITDEPTH
 
-        // Do xform and quant to get decoded pixels.
-        {
-          const int txb_width = max_block_wide(xd, plane_bsize, 0);
-          const int txb_height = max_block_high(xd, plane_bsize, 0);
-          int idx_, idy_;
+#if CONFIG_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          aom_highbd_convolve_copy(dst_sub8x8, dst_stride, recon_sub8x8, 8,
+                                   NULL, 0, NULL, 0, bsw, bsh, xd->bd);
+        } else {
+#endif  // CONFIG_HIGHBITDEPTH
+          aom_convolve_copy(dst_sub8x8, dst_stride, recon_sub8x8, 8, NULL, 0,
+                            NULL, 0, bsw, bsh);
+#if CONFIG_HIGHBITDEPTH
+        }
+#endif  // CONFIG_HIGHBITDEPTH
 
-          for (idy_ = 0; idy_ < txb_height; idy_++) {
-            for (idx_ = 0; idx_ < txb_width; idx_++) {
-              int block;
-              int coeff_ctx = 0;
-              const tran_low_t *dqcoeff;
-              uint16_t eob;
-              const PLANE_TYPE plane_type = PLANE_TYPE_Y;
-              INV_TXFM_PARAM inv_txfm_param;
-              uint8_t *dst = dst_init + (idy_ * dst_stride + idx_) * 4;
+        // To get decoded pixels, do 4x4 xform and quant for each 4x4 block
+        // in a sub8x8 prediction block. In case remaining parts of
+        // sub8x8 inter mode rdo assume pd->dst stores predicted pixels,
+        // use local buffer to store decoded pixels.
+        for (idy_ = 0; idy_ < txb_height; idy_++) {
+          for (idx_ = 0; idx_ < txb_width; idx_++) {
+            int coeff_ctx = 0;
+            const tran_low_t *dqcoeff;
+            uint16_t eob;
+            const PLANE_TYPE plane_type = PLANE_TYPE_Y;
+            uint8_t *recon_4x4 = recon_sub8x8 + (idy_ * 8 + idx_) * 4;
+            const int block_raster_idx = (idy + idy_) * 2 + (idx + idx_);
+            const int block =
+                av1_raster_order_to_block_index(tx_size, block_raster_idx);
+            TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
 
-              block = i + (idy_ * 2 + idx_);
+            dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+            av1_xform_quant(cm, x, 0, block, idy + idy_, idx + idx_, BLOCK_8X8,
+                            tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
+            if (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)
+              av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx);
 
-              dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-              eob = p->eobs[block];
-
-              av1_xform_quant(cm, x, 0, block, idy + (i >> 1), idx + (i & 0x01),
-                              BLOCK_8X8, TX_4X4, coeff_ctx, AV1_XFORM_QUANT_FP);
-
-              inv_txfm_param.tx_type =
-                  get_tx_type(plane_type, xd, block, TX_4X4);
-              inv_txfm_param.tx_size = TX_4X4;
-              inv_txfm_param.eob = eob;
-              inv_txfm_param.lossless = xd->lossless[mbmi->segment_id];
-
-#if CONFIG_PVQ
-              {
-                int i2, j2;
-
-                for (j2 = 0; j2 < 4; j2++)
-                  for (i2 = 0; i2 < 4; i2++) dst[j2 * dst_stride + i2] = 0;
-              }
-#endif  // CONFIG_PVQ
-              av1_inv_txfm_add(dqcoeff, dst, dst_stride, &inv_txfm_param);
-            }
+            eob = p->eobs[block];
+            av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size,
+                                        recon_4x4, 8, eob);
           }
         }
       }
+    }
+    // Compute daala-distortion for a 8x8 block
+    daala_sse = av1_daala_dist(src, src_stride, pd->dst.buf, dst_stride, 8, 8,
+                               qm, use_activity_masking, x->qindex)
+                << 4;
 
-    // Daala-defined distortion computed for 1) predicted pixels and
-    // 2) decoded pixels of the block of 8x8 pixels
-    bsi->sse = av1_daala_dist(src, src_stride, pred, 8, TX_8X8, qm,
-                              use_activity_masking, x->qindex)
-               << 4;
+    daala_dist = av1_daala_dist(src, src_stride, recon_8x8, 8, 8, 8, qm,
+                                use_activity_masking, x->qindex)
+                 << 4;
 
-    bsi->d = av1_daala_dist(src, src_stride, pd->dst.buf, dst_stride, TX_8X8,
-                            qm, use_activity_masking, x->qindex)
-             << 4;
+    bsi->sse = daala_sse;
+    bsi->d = daala_dist;
+
+    rd1 = RDCOST(x->rdmult, x->rddiv, bsi->r, bsi->d);
+    rd2 = RDCOST(x->rdmult, x->rddiv, 0, bsi->sse);
+    bsi->segment_rd = AOMMIN(rd1, rd2);
   }
 #endif  // CONFIG_DAALA_DIST