Improve dist-8x8

Improve dist-8x8 when computing 8x8 yuv dist for sub8x8.

To apply dist-8x8 to sub8x8 partitions, dist-8x8 is computed on the 8x8
window once the mode decisions for all of its sub8x8 partitions are finished.
Since dist-8x8 applies only to luma, the chroma distortion must be identified.

Previously, it was hard to do this without risking bugs,
because the inter mode search code is complex.

The new method is less error-prone: it computes the uv distortion (in MSE)
after the mode decisions for all of the sub8x8 blocks in an 8x8 window are
finished, at the point where the dist-8x8 distortion for the 8x8 luma pixels
is computed with the new distortion metric.

All the code separating y and uv distortion in inter mode search has
been removed in this commit.

Change-Id: Ieaccb7915df5faeb5e89a7e70b2b7cbac65231af
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 64800a4..b0192d3 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -344,9 +344,6 @@
   int64_t ref_rdcost;
   int zero_rate;
   uint8_t invalid_rate;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  int64_t dist_y;
-#endif
 #if CONFIG_RD_DEBUG
   int txb_coeff_cost[MAX_MB_PLANE];
 #if CONFIG_VAR_TX
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 716ae85..0ff8cd3 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -3370,6 +3370,54 @@
 }
 #endif  // CONFIG_EXT_PARTITION_TYPES
 
+#if CONFIG_DIST_8X8 && CONFIG_CB4X4
+// Distortion of one 8x8: dist-8x8 on luma (y_src_8x8) plus sse on chroma.
+static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                            uint8_t *y_src_8x8) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int64_t dist_8x8, dist_8x8_uv, total_dist;
+  const int src_stride = x->plane[0].src.stride;
+  uint8_t *decoded_8x8;
+  int plane;
+
+#if CONFIG_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8);
+  else
+#endif
+    decoded_8x8 = (uint8_t *)x->decoded_8x8;
+
+  dist_8x8 = av1_dist_8x8(cpi, x, y_src_8x8, src_stride, decoded_8x8, 8,
+                          BLOCK_8X8, 8, 8, 8, 8, x->qindex)
+             << 4;
+
+  // Accumulate the chroma distortion that belongs to this luma 8x8 block.
+  dist_8x8_uv = 0;
+
+  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+    const int src_stride_uv = x->plane[plane].src.stride;
+    const int dst_stride_uv = xd->plane[plane].dst.stride;
+    // The uv buffer pointers are set only once, at the first sub8x8 block in
+    // an 8x8, so at the last sub8x8 block (i.e. now) they are still the same.
+    uint8_t *src_uv = x->plane[plane].src.buf;
+    uint8_t *dst_uv = xd->plane[plane].dst.buf;
+    unsigned sse;
+#if CONFIG_CHROMA_SUB8X8
+    const BLOCK_SIZE plane_bsize =
+        AOMMAX(BLOCK_4X4, get_plane_block_size(BLOCK_8X8, &xd->plane[plane]));
+#else
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(BLOCK_8X8, &xd->plane[plane]);
+#endif
+    cpi->fn_ptr[plane_bsize].vf(src_uv, src_stride_uv, dst_uv, dst_stride_uv,
+                                &sse);
+    dist_8x8_uv += (int64_t)sse << 4;  // same << 4 scaling as the luma term
+  }
+
+  return total_dist = dist_8x8 + dist_8x8_uv;
+}
+#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
+
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
 // results, for encoding speed-up.
@@ -3819,12 +3867,6 @@
                           temp_best_rdcost - sum_rdc.rdcost,
                           pc_tree->split[idx]);
 
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        if (x->using_dist_8x8 && bsize == BLOCK_8X8 &&
-            this_rdc.rate != INT_MAX) {
-          assert(this_rdc.dist_y < INT64_MAX);
-        }
-#endif
         if (this_rdc.rate == INT_MAX) {
           sum_rdc.rdcost = INT64_MAX;
 #if CONFIG_SUPERTX
@@ -3838,12 +3880,6 @@
 #if CONFIG_SUPERTX
           sum_rate_nocoef += this_rate_nocoef;
 #endif  // CONFIG_SUPERTX
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-          if (x->using_dist_8x8 && bsize == BLOCK_8X8) {
-            assert(this_rdc.dist_y < INT64_MAX);
-            sum_rdc.dist_y += this_rdc.dist_y;
-          }
-#endif
         }
       }
       reached_last_index = (idx == 4);
@@ -3851,24 +3887,11 @@
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
       if (x->using_dist_8x8 && reached_last_index &&
           sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
-        int64_t dist_8x8;
         const int src_stride = x->plane[0].src.stride;
-        uint8_t *decoded_8x8;
-
-#if CONFIG_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-          decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8);
-        else
-#endif
-          decoded_8x8 = (uint8_t *)x->decoded_8x8;
-
+        int64_t dist_8x8;
         dist_8x8 =
-            av1_dist_8x8(cpi, x, x->plane[0].src.buf - 4 * src_stride - 4,
-                         src_stride, decoded_8x8, 8, BLOCK_8X8, 8, 8, 8, 8,
-                         x->qindex)
-            << 4;
-        assert(sum_rdc.dist_y < INT64_MAX);
-        sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + dist_8x8;
+            dist_8x8_yuv(cpi, x, x->plane[0].src.buf - 4 * src_stride - 4);
+        sum_rdc.dist = dist_8x8;
         sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
       }
 #endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
@@ -4029,29 +4052,14 @@
 #if CONFIG_SUPERTX
         sum_rate_nocoef += this_rate_nocoef;
 #endif  // CONFIG_SUPERTX
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        if (x->using_dist_8x8) sum_rdc.dist_y += this_rdc.dist_y;
-#endif
       }
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
       if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX &&
           bsize == BLOCK_8X8) {
-        int64_t dist_8x8;
         const int src_stride = x->plane[0].src.stride;
-        uint8_t *decoded_8x8;
-
-#if CONFIG_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-          decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8);
-        else
-#endif
-          decoded_8x8 = (uint8_t *)x->decoded_8x8;
-
-        dist_8x8 = av1_dist_8x8(cpi, x, x->plane[0].src.buf - 4 * src_stride,
-                                src_stride, decoded_8x8, 8, BLOCK_8X8, 8, 8, 8,
-                                8, x->qindex)
-                   << 4;
-        sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + dist_8x8;
+        int64_t dist_8x8;
+        dist_8x8 = dist_8x8_yuv(cpi, x, x->plane[0].src.buf - 4 * src_stride);
+        sum_rdc.dist = dist_8x8;
         sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
       }
 #endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
@@ -4209,29 +4217,13 @@
 #if CONFIG_SUPERTX
         sum_rate_nocoef += this_rate_nocoef;
 #endif  // CONFIG_SUPERTX
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        if (x->using_dist_8x8) sum_rdc.dist_y += this_rdc.dist_y;
-#endif
       }
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
       if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX &&
           bsize == BLOCK_8X8) {
         int64_t dist_8x8;
-        const int src_stride = x->plane[0].src.stride;
-        uint8_t *decoded_8x8;
-
-#if CONFIG_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-          decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8);
-        else
-#endif
-          decoded_8x8 = (uint8_t *)x->decoded_8x8;
-
-        dist_8x8 =
-            av1_dist_8x8(cpi, x, x->plane[0].src.buf - 4, src_stride,
-                         decoded_8x8, 8, BLOCK_8X8, 8, 8, 8, 8, x->qindex)
-            << 4;
-        sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + dist_8x8;
+        dist_8x8 = dist_8x8_yuv(cpi, x, x->plane[0].src.buf - 4);
+        sum_rdc.dist = dist_8x8;
         sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
       }
 #endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
@@ -4457,11 +4449,6 @@
   (void)best_rd;
   *rd_cost = best_rdc;
 
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  if (x->using_dist_8x8 && bsize <= BLOCK_8X8 && rd_cost->rate != INT_MAX) {
-    assert(rd_cost->dist_y < INT64_MAX);
-  }
-#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
 #if CONFIG_SUPERTX
   *rate_nocoef = best_rate_nocoef;
 #endif  // CONFIG_SUPERTX
diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index 39806b2..0436cec 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h
@@ -521,9 +521,6 @@
   rd_stats->zero_rate = 0;
   rd_stats->invalid_rate = 0;
   rd_stats->ref_rdcost = INT64_MAX;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  rd_stats->dist_y = 0;
-#endif
 #if CONFIG_RD_DEBUG
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     rd_stats->txb_coeff_cost[plane] = 0;
@@ -551,9 +548,6 @@
   rd_stats->zero_rate = 0;
   rd_stats->invalid_rate = 1;
   rd_stats->ref_rdcost = INT64_MAX;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  rd_stats->dist_y = INT64_MAX;
-#endif
 #if CONFIG_RD_DEBUG
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     rd_stats->txb_coeff_cost[plane] = INT_MAX;
@@ -579,9 +573,6 @@
   rd_stats_dst->sse += rd_stats_src->sse;
   rd_stats_dst->skip &= rd_stats_src->skip;
   rd_stats_dst->invalid_rate &= rd_stats_src->invalid_rate;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  rd_stats_dst->dist_y += rd_stats_src->dist_y;
-#endif
 #if CONFIG_RD_DEBUG
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane];
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 42198bb..b8c7339 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -9978,9 +9978,6 @@
       rd_cost->dist = dist_y + dist_uv;
     }
     rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-    if (x->using_dist_8x8) rd_cost->dist_y = dist_y;
-#endif
   } else {
     rd_cost->rate = INT_MAX;
   }
@@ -10715,10 +10712,6 @@
     int compmode_cost = 0;
     int rate2 = 0, rate_y = 0, rate_uv = 0;
     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-    int64_t distortion2_y = 0;
-    int64_t total_sse_y = INT64_MAX;
-#endif
     int skippable = 0;
     int this_skip2 = 0;
     int64_t total_sse = INT64_MAX;
@@ -11103,9 +11096,6 @@
       if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (x->using_dist_8x8 && bsize < BLOCK_8X8) distortion2_y = distortion_y;
-#endif
     } else {
       int_mv backup_ref_mv[2];
 
@@ -11201,20 +11191,6 @@
       {
         RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
         av1_init_rd_stats(&rd_stats);
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        // While av1 master uses rd_stats_y.rate through out the codebase,
-        // which is set when handle_inter_mode is called, the daala-dist code
-        // in rd_pick_partition() for cb4x4 and sub8x8 blocks need to know
-        // .dist_y which comes from rd_stats_y.dist and rd_stats_y.sse.
-        // The problem is rd_stats_y.dist and rd_stats_y.sse are sometimes not
-        // initialized when rd_stats.skip = 1,
-        // then instead rd_stats.dist and rd_stats.sse have the
-        // combined luma and chroma dist and sse.
-        // This can be seen inside motion_mode_rd(), which is called by
-        // handle_inter_mode().
-        if (x->using_dist_8x8 && bsize < BLOCK_8X8)
-          av1_init_rd_stats(&rd_stats_y);
-#endif
         rd_stats.rate = rate2;
 
         // Point to variables that are maintained between loop iterations
@@ -11236,16 +11212,6 @@
         total_sse = rd_stats.sse;
         rate_y = rd_stats_y.rate;
         rate_uv = rd_stats_uv.rate;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        if (x->using_dist_8x8 && bsize < BLOCK_8X8) {
-          if (rd_stats_y.rate != INT_MAX) {
-            assert(rd_stats_y.sse < INT64_MAX);
-            assert(rd_stats_y.dist < INT64_MAX);
-          }
-          total_sse_y = rd_stats_y.sse;
-          distortion2_y = rd_stats_y.dist;
-        }
-#endif
       }
 
 // TODO(jingning): This needs some refactoring to improve code quality
@@ -11419,16 +11385,7 @@
 
             frame_mv[NEARMV][ref_frame] = cur_mv;
             av1_init_rd_stats(&tmp_rd_stats);
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-            // With the same reason as 'rd_stats_y' passed to above
-            // handle_inter_mode(), tmp_rd_stats_y.dist and
-            // tmp_rd_stats_y.sse are sometimes not initialized, esp. when
-            // tmp_rd_stats.skip = 1 and tmp_rd_stats.dist and .sse
-            // represent combined luma and chroma .dist and .sse,
-            // we should initialized tmp_rd_stats_y.
-            if (x->using_dist_8x8 && bsize < BLOCK_8X8)
-              av1_init_rd_stats(&tmp_rd_stats_y);
-#endif
+
             // Point to variables that are not maintained between iterations
             args.single_newmv = dummy_single_newmv;
 #if CONFIG_EXT_INTER
@@ -11504,16 +11461,6 @@
             tmp_ref_rd = tmp_alt_rd;
             backup_mbmi = *mbmi;
             backup_skip = x->skip;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-            if (x->using_dist_8x8 && bsize < BLOCK_8X8) {
-              if (tmp_rd_stats_y.rate != INT_MAX) {
-                assert(tmp_rd_stats_y.sse < INT64_MAX);
-                assert(tmp_rd_stats_y.dist < INT64_MAX);
-              }
-              total_sse_y = tmp_rd_stats_y.sse;
-              distortion2_y = tmp_rd_stats_y.dist;
-            }
-#endif
 #if CONFIG_VAR_TX
             for (i = 0; i < MAX_MB_PLANE; ++i)
               memcpy(x->blk_skip_drl[i], x->blk_skip[i],
@@ -11596,12 +11543,6 @@
           this_skip2 = 1;
           rate_y = 0;
           rate_uv = 0;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-          if (x->using_dist_8x8 && bsize < BLOCK_8X8) {
-            assert(total_sse_y < INT64_MAX);
-            distortion2_y = total_sse_y;
-          }
-#endif
         }
       } else {
         // Add in the cost of the no skip flag.
@@ -11621,11 +11562,6 @@
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
     }
 
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-    if (x->using_dist_8x8 && bsize < BLOCK_8X8 && rate2 != INT_MAX)
-      assert(distortion2_y < INT64_MAX);
-#endif
-
     if (ref_frame == INTRA_FRAME) {
       // Keep record of best intra rd
       if (this_rd < best_intra_rd) {
@@ -11701,12 +11637,6 @@
         best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd),
                                             this_skip2 || skippable);
         best_rate_uv = rate_uv;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        if (x->using_dist_8x8 && bsize < BLOCK_8X8) {
-          assert(distortion2_y < INT64_MAX);
-          rd_cost->dist_y = distortion2_y;
-        }
-#endif
 #if CONFIG_VAR_TX
         for (i = 0; i < MAX_MB_PLANE; ++i)
           memcpy(ctx->blk_skip[i], x->blk_skip[i],
@@ -11714,10 +11644,7 @@
 #endif  // CONFIG_VAR_TX
       }
     }
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-    if (x->using_dist_8x8 && bsize < BLOCK_8X8 && rd_cost->rate != INT_MAX)
-      assert(rd_cost->dist_y < INT64_MAX);
-#endif
+
     /* keep record of best compound/single-only prediction */
     if (!disable_skip && ref_frame != INTRA_FRAME) {
       int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
@@ -11849,21 +11776,9 @@
       rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
       rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
       best_skip2 = skip_blk;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (x->using_dist_8x8 && bsize < BLOCK_8X8) {
-        assert(rd_cost->rate != INT_MAX);
-        assert(rd_cost->dist_y < INT64_MAX);
-        rd_cost->dist_y = rd_stats_y.dist;
-      }
-#endif
     }
   }
 
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  if (x->using_dist_8x8 && bsize < BLOCK_8X8 && rd_cost->rate != INT_MAX)
-    assert(rd_cost->dist_y < INT64_MAX);
-#endif
-
   // Only try palette mode when the best mode so far is an intra mode.
   if (try_palette && !is_inter_mode(best_mbmode.mode)) {
     int rate2 = 0;
@@ -12433,9 +12348,7 @@
   rd_cost->rate = rate2;
   rd_cost->dist = distortion2;
   rd_cost->rdcost = this_rd;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  if (x->using_dist_8x8 && bsize < BLOCK_8X8) rd_cost->dist_y = distortion2;
-#endif
+
   if (this_rd >= best_rd_so_far) {
     rd_cost->rate = INT_MAX;
     rd_cost->rdcost = INT64_MAX;