[CFL] Sub8x8 Validation Code Rewrite

The sub8x8 validation code is rewritten to be more robust. The scope of
the validation is narrowed to checking that all of the required content
in the storage buffer was stored between CfL predictions. The early
termination used in the current mode decision code does not allow
validating more than that.

This change does not change the encoder output.

BUG=aomedia:925

Change-Id: I7f1ed84da5037dcfaaf5da9cf33b4b8d664d2352
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 277ec46..0d8b108 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -647,10 +647,12 @@
 
   int is_chroma_reference;
 #if CONFIG_DEBUG
-  // The prediction used for sub8x8 blocks originates from multiple luma blocks,
-  // this array is used to validate that cfl_store() is called only once for
-  // each luma block
-  uint8_t sub8x8_val[CFL_SUB8X8_VAL_MI_SQUARE];
+  // Validation buffer is usually 2x2, except for 16x4 and 4x16, in which case
+  // it is 4x2 and 2x4 respectively. To simplify accessing the buffer we use a
+  // stride of CFL_SUB8X8_VAL_MI_SIZE, resulting in a square of 16 entries.
+  uint16_t sub8x8_val[CFL_SUB8X8_VAL_MI_SQUARE];
+  uint16_t store_counter;
+  uint16_t last_compute_counter;
 #endif  // CONFIG_DEBUG
 } CFL_CTX;
 #endif  // CONFIG_CFL
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index e286eb3..999befa 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -26,6 +26,8 @@
   cfl->store_y = 0;
 #if CONFIG_DEBUG
   cfl_clear_sub8x8_val(cfl);
+  cfl->store_counter = 0;
+  cfl->last_compute_counter = 0;
 #endif  // CONFIG_DEBUG
 }
 
@@ -463,16 +465,59 @@
   }
 }
 #if CONFIG_DEBUG
-static INLINE void sub8x8_set_val(CFL_CTX *cfl, int row, int col, int val_high,
-                                  int val_wide) {
-  for (int val_r = 0; val_r < val_high; val_r++) {
-    assert(row + val_r < CFL_SUB8X8_VAL_MI_SIZE);
-    int row_off = (row + val_r) * CFL_SUB8X8_VAL_MI_SIZE;
-    for (int val_c = 0; val_c < val_wide; val_c++) {
-      assert(col + val_c < CFL_SUB8X8_VAL_MI_SIZE);
-      assert(cfl->sub8x8_val[row_off + col + val_c] == 0);
-      cfl->sub8x8_val[row_off + col + val_c]++;
+// Since the chroma surface of a sub8x8 block spans multiple luma blocks,
+// this function validates that the reconstructed luma area required to predict
+// the chroma block using CfL has been stored during the previous luma encode.
+//
+//   Issue 1: Chroma intra prediction is not always performed after luma. One
+//   such example is when luma RD cost is really high and the mode decision
+//   algorithm decides to terminate instead of evaluating chroma.
+//
+//   Issue 2: When multiple CfL predictions are computed for a given sub8x8
+//   block, the reconstructed luma that belongs to the non-reference sub8x8
+//   blocks must remain in the buffer (we cannot clear the buffer when we
+//   compute the CfL prediction).
+//
+// To resolve these issues, we increment the store_counter on each store. If
+// other sub8x8 blocks have already been coded and the counter corresponds to
+// the previous value they are also set to the current value. If a sub8x8 block
+// is not stored the store_counter won't match which will be detected when the
+// CfL parameters are computed.
+static void sub8x8_set_val(CFL_CTX *cfl, int row, int col, TX_SIZE y_tx_size) {
+  const int y_tx_wide_unit = tx_size_wide_unit[y_tx_size];
+  const int y_tx_high_unit = tx_size_high_unit[y_tx_size];
+
+  // How many 4x4 are in tx_size
+  const int y_tx_unit_len = y_tx_wide_unit * y_tx_high_unit;
+  assert(y_tx_unit_len == 1 || y_tx_unit_len == 2 || y_tx_unit_len == 4);
+
+  // A store at (0,0) skips one counter value to invalidate the other counters
+  const int is_first = row + col == 0;
+  cfl->store_counter += is_first ? 2 : 1;
+
+  const int inc =
+      (y_tx_wide_unit >= y_tx_high_unit) ? 1 : CFL_SUB8X8_VAL_MI_SIZE;
+  uint16_t *sub8x8_val = cfl->sub8x8_val + (row * CFL_SUB8X8_VAL_MI_SIZE + col);
+  for (int i = 0; i < y_tx_unit_len; i++) {
+    *sub8x8_val = cfl->store_counter;
+    sub8x8_val += inc;
+  }
+
+  if (!is_first) {
+    const uint16_t prev_store_counter = cfl->store_counter - 1;
+    int found = 0;
+    sub8x8_val = cfl->sub8x8_val;
+    for (int y = 0; y < CFL_SUB8X8_VAL_MI_SIZE; y++) {
+      for (int x = 0; x < CFL_SUB8X8_VAL_MI_SIZE; x++) {
+        if (sub8x8_val[x] == prev_store_counter) {
+          sub8x8_val[x] = cfl->store_counter;
+          found = 1;
+        }
+      }
+      sub8x8_val += CFL_SUB8X8_VAL_MI_SIZE;
     }
+    // Something is wrong if (0,0) is missing
+    assert(found);
   }
 }
 #endif  // CONFIG_DEBUG
@@ -483,15 +528,13 @@
   struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
   uint8_t *dst =
       &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
-  (void)bsize;
   if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
     // Only dimensions of size 4 can have an odd offset.
     assert(!((col & 1) && tx_size_wide[tx_size] != 4));
     assert(!((row & 1) && tx_size_high[tx_size] != 4));
     sub8x8_adjust_offset(cfl, &row, &col);
 #if CONFIG_DEBUG
-    sub8x8_set_val(cfl, row, col, tx_size_high_unit[tx_size],
-                   tx_size_wide_unit[tx_size]);
+    sub8x8_set_val(cfl, row, col, tx_size);
 #endif  // CONFIG_DEBUG
   }
   cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size_wide[tx_size],
@@ -507,7 +550,12 @@
   if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
     sub8x8_adjust_offset(cfl, &row, &col);
 #if CONFIG_DEBUG
-    sub8x8_set_val(cfl, row, col, mi_size_high[bsize], mi_size_wide[bsize]);
+    // Point to the last transform block inside the partition.
+    const int off_row =
+        row + (mi_size_high[bsize] - tx_size_high_unit[tx_size]);
+    const int off_col =
+        col + (mi_size_wide[bsize] - tx_size_wide_unit[tx_size]);
+    sub8x8_set_val(cfl, off_row, off_col, tx_size);
 #endif  // CONFIG_DEBUG
   }
   const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size);
@@ -526,13 +574,24 @@
   const BLOCK_SIZE plane_bsize = AOMMAX(
       BLOCK_4X4, get_plane_block_size(mbmi->sb_type, &xd->plane[AOM_PLANE_U]));
 #if CONFIG_DEBUG
-  if (mbmi->sb_type < BLOCK_8X8) {
-    for (int val_r = 0; val_r < mi_size_high[mbmi->sb_type]; val_r++) {
-      for (int val_c = 0; val_c < mi_size_wide[mbmi->sb_type]; val_c++) {
-        assert(cfl->sub8x8_val[val_r * CFL_SUB8X8_VAL_MI_SIZE + val_c] == 1);
+  BLOCK_SIZE bsize = mbmi->sb_type;
+  if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
+    const uint16_t compute_counter = cfl->sub8x8_val[0];
+    assert(compute_counter != cfl->last_compute_counter);
+    bsize = scale_chroma_bsize(bsize, cfl->subsampling_x, cfl->subsampling_y);
+    const int val_wide = mi_size_wide[bsize];
+    const int val_high = mi_size_high[bsize];
+    assert(val_wide <= CFL_SUB8X8_VAL_MI_SIZE);
+    assert(val_high <= CFL_SUB8X8_VAL_MI_SIZE);
+    for (int val_r = 0; val_r < val_high; val_r++) {
+      for (int val_c = 0; val_c < val_wide; val_c++) {
+        // If all counters in the validation buffer are equal then they are all
+        // related to the same chroma reference block.
+        assert(cfl->sub8x8_val[val_r * CFL_SUB8X8_VAL_MI_SIZE + val_c] ==
+               compute_counter);
       }
     }
-    cfl_clear_sub8x8_val(cfl);
+    cfl->last_compute_counter = compute_counter;
   }
 #endif  // CONFIG_DEBUG
   // AOM_PLANE_U is used, but both planes will have the same sizes.
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 7071e8c..4d762a8 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -911,11 +911,6 @@
   }
 #if CONFIG_CFL
   if (mbmi->uv_mode != UV_CFL_PRED) {
-#if CONFIG_DEBUG
-    if (cfl->is_chroma_reference) {
-      cfl_clear_sub8x8_val(cfl);
-    }
-#endif
     if (!cfl->is_chroma_reference && is_inter_block(mbmi)) {
       cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
     }
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index ce466ba..2978ee8 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -2805,12 +2805,6 @@
     }
 
     restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-
-#if CONFIG_CFL && CONFIG_DEBUG
-    if (!x->skip_chroma_rd) {
-      cfl_clear_sub8x8_val(xd->cfl);
-    }
-#endif  // CONFIG_CFL && CONFIG_DEBUG
   }
 
   // store estimated motion vector
@@ -2862,11 +2856,6 @@
     }
 #endif  // CONFIG_DIST_8X8
 
-#if CONFIG_CFL && CONFIG_DEBUG
-    if (!reached_last_index && sum_rdc.rdcost >= best_rdc.rdcost)
-      cfl_clear_sub8x8_val(xd->cfl);
-#endif  // CONFIG_CFL && CONFIG_DEBUG
-
     if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_SPLIT];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -2950,9 +2939,6 @@
 #endif  // CONFIG_DIST_8X8
     }
 
-#if CONFIG_CFL && CONFIG_DEBUG
-    cfl_clear_sub8x8_val(xd->cfl);
-#endif  // CONFIG_CFL && CONFIG_DEBUG
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_HORZ];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -3030,10 +3016,6 @@
 #endif  // CONFIG_DIST_8X8
     }
 
-#if CONFIG_CFL && CONFIG_DEBUG
-    cfl_clear_sub8x8_val(xd->cfl);
-#endif  // CONFIG_CFL && CONFIG_DEBUG
-
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_VERT];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -4650,13 +4632,6 @@
     }
 #if CONFIG_CFL
     xd->cfl->store_y = 0;
-#if CONFIG_DEBUG
-    if (is_chroma_reference(mi_row, mi_col, bsize, xd->cfl->subsampling_x,
-                            xd->cfl->subsampling_y) &&
-        !xd->cfl->are_parameters_computed) {
-      cfl_clear_sub8x8_val(xd->cfl);
-    }
-#endif  // CONFIG_DEBUG
 #endif  // CONFIG_CFL
     if (!dry_run) {
       sum_intra_stats(td->counts, xd, mi, xd->above_mi, xd->left_mi,
@@ -4836,13 +4811,6 @@
   }
 #if CONFIG_CFL
   CFL_CTX *const cfl = xd->cfl;
-#if CONFIG_DEBUG
-  if (is_chroma_reference(mi_row, mi_col, bsize, cfl->subsampling_x,
-                          cfl->subsampling_y) &&
-      !cfl->are_parameters_computed) {
-    cfl_clear_sub8x8_val(cfl);
-  }
-#endif  // CONFIG_DEBUG
   if (is_inter_block(mbmi) &&
       !is_chroma_reference(mi_row, mi_col, bsize, cfl->subsampling_x,
                            cfl->subsampling_y)) {
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index f45f108..3f3169d 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -2110,18 +2110,17 @@
                    tx_size, &this_rd_stats.dist, &this_rd_stats.sse,
                    OUTPUT_HAS_PREDICTED_PIXELS);
   }
+  rd = RDCOST(x->rdmult, 0, this_rd_stats.dist);
+  if (args->this_rd + rd > args->best_rd) {
+    args->exit_early = 1;
+    return;
+  }
 #if CONFIG_CFL
   if (plane == AOM_PLANE_Y && xd->cfl->store_y) {
     assert(!is_inter_block(mbmi) || plane_bsize < BLOCK_8X8);
     cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
   }
 #endif  // CONFIG_CFL
-  rd = RDCOST(x->rdmult, 0, this_rd_stats.dist);
-  if (args->this_rd + rd > args->best_rd) {
-    args->exit_early = 1;
-    return;
-  }
-
   const PLANE_TYPE plane_type = get_plane_type(plane);
   const TX_TYPE tx_type =
       av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);