[CFL] Adjust Pixel Buffer for Chroma Sub8x8

Adjust row and col offset for sub8x8 blocks to allow the CfL prediction
to use all available reconstructed luma pixels.

Results on Subset 1 (Compared to b03c2f44 with CfL)

   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.1355 | -0.8517 | -0.4481 |  -0.0579 | -0.0237 | -0.0203 |    -0.2765

Change-Id: Ia91f0a078f0ff4f28bb2d272b096f579e0d04dac
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index 093ceb5..135b308 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -177,6 +177,8 @@
     sum_v += height * 129;
   }
 
+  // TODO(ltrudeau) Because of max_block_wide and max_block_high, num_pel will
+  // not be a power of two. So these divisions will have to use a lookup table.
   cfl->dc_pred[CFL_PRED_U] = sum_u / num_pel;
   cfl->dc_pred[CFL_PRED_V] = sum_v / num_pel;
 }
@@ -245,26 +247,48 @@
 }
 
 void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, int row,
-               int col, TX_SIZE tx_size) {
+               int col, TX_SIZE tx_size, BLOCK_SIZE bsize) {
   const int tx_width = tx_size_wide[tx_size];
   const int tx_height = tx_size_high[tx_size];
   const int tx_off_log2 = tx_size_wide_log2[0];
 
-  // Store the input into the CfL pixel buffer
-  uint8_t *y_pix = &cfl->y_pix[(row * MAX_SB_SIZE + col) << tx_off_log2];
+#if CONFIG_CHROMA_SUB8X8
+  if (bsize < BLOCK_8X8) {
+#if CONFIG_DEBUG
+    // Transform cannot be smaller than 4X4
+    assert(tx_width >= 4);
+    assert(tx_height >= 4);
+#endif
 
-  // Check that we remain inside the pixel buffer.
-  assert(MAX_SB_SIZE * (row + tx_height - 1) + col + tx_width - 1 <
-         MAX_SB_SQUARE);
+    const int bw = block_size_wide[bsize];
+    const int bh = block_size_high[bsize];
 
-  // TODO(ltrudeau) Speedup possible by moving the downsampling to cfl_store
-  for (int j = 0; j < tx_height; j++) {
-    for (int i = 0; i < tx_width; i++) {
-      y_pix[i] = input[i];
+    // For chroma_sub8x8, the CfL prediction for prediction blocks smaller than
+    // 8X8 uses non-chroma-reference reconstructed luma pixels. To do so, we
+    // combine the 4X4 non-chroma-reference blocks into the CfL pixel buffer
+    // based on their row and column index.
+
+    // The following code is adapted from the is_chroma_reference() function.
+    if ((cfl->mi_row &
+         0x01)        // Increment the row index for odd indexed 4X4 blocks
+        && (bh == 4)  // But not for 4X8 blocks
+        && cfl->subsampling_y) {  // And only when chroma is subsampled
+      assert(row == 0);
+      row++;
     }
-    y_pix += MAX_SB_SIZE;
-    input += input_stride;
+
+    if ((cfl->mi_col &
+         0x01)        // Increment the col index for odd indexed 4X4 blocks
+        && (bw == 4)  // But not for 8X4 blocks
+        && cfl->subsampling_x) {  // And only when chroma is subsampled
+      assert(col == 0);
+      col++;
+    }
   }
+#endif
+
+  // Invalidate current parameters
+  cfl->are_parameters_computed = 0;
 
   // Store the surface of the pixel buffer that was written to, this way we
   // can manage chroma overrun (e.g. when the chroma surfaces goes beyond the
@@ -277,8 +301,21 @@
     cfl->y_height = OD_MAXI((row << tx_off_log2) + tx_height, cfl->y_height);
   }
 
-  // Invalidate current parameters
-  cfl->are_parameters_computed = 0;
+  // Check that we will remain inside the pixel buffer.
+  assert((row << tx_off_log2) + tx_height <= MAX_SB_SIZE);
+  assert((col << tx_off_log2) + tx_width <= MAX_SB_SIZE);
+
+  // Store the input into the CfL pixel buffer
+  uint8_t *y_pix = &cfl->y_pix[(row * MAX_SB_SIZE + col) << tx_off_log2];
+
+  // TODO(ltrudeau) Speedup possible by moving the downsampling to cfl_store
+  for (int j = 0; j < tx_height; j++) {
+    for (int i = 0; i < tx_width; i++) {
+      y_pix[i] = input[i];
+    }
+    y_pix += MAX_SB_SIZE;
+    input += input_stride;
+  }
 }
 
 void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index 7c11c4b..cbdf969 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -54,6 +54,8 @@
 
   // The rate associated with each alpha codeword
   int costs[CFL_ALPHABET_SIZE];
+
+  int mi_row, mi_col;
 } CFL_CTX;
 
 static const double cfl_alpha_mags[CFL_MAGS_SIZE] = {
@@ -73,7 +75,7 @@
                        int row, int col, TX_SIZE tx_size, int plane);
 
 void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, int row,
-               int col, TX_SIZE tx_size);
+               int col, TX_SIZE tx_size, BLOCK_SIZE bsize);
 
 void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size);
 
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index d12eb54..a1c419b 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -741,9 +741,17 @@
 #if CONFIG_CFL
   if (plane == AOM_PLANE_Y) {
     struct macroblockd_plane *const pd = &xd->plane[plane];
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+    const BLOCK_SIZE plane_bsize =
+        AOMMAX(BLOCK_4X4, get_plane_block_size(mbmi->sb_type, pd));
+#else
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, pd);
+#endif
     uint8_t *dst =
         &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
-    cfl_store(xd->cfl, dst, pd->dst.stride, row, col, tx_size);
+    // TODO(ltrudeau) Store sub-8x8 inter blocks when the bottom-right block is
+    // intra predicted.
+    cfl_store(xd->cfl, dst, pd->dst.stride, row, col, tx_size, plane_bsize);
   }
 #endif
 }
@@ -876,6 +884,10 @@
   xd->mi[0]->mbmi.mi_row = mi_row;
   xd->mi[0]->mbmi.mi_col = mi_col;
 #endif
+#if CONFIG_CFL
+  xd->cfl->mi_row = mi_row;
+  xd->cfl->mi_col = mi_col;
+#endif
   for (y = 0; y < y_mis; ++y)
     for (x = !y; x < x_mis; ++x) xd->mi[y * cm->mi_stride + x] = xd->mi[0];
 
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index e3c6036..49497fd 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -325,6 +325,10 @@
   set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
 
   mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_CFL
+  xd->cfl->mi_row = mi_row;
+  xd->cfl->mi_col = mi_col;
+#endif
 
   // Setup segment ID.
   if (seg->enabled) {
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 6df156c..e0f4516 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -1420,7 +1420,9 @@
 #endif
 #if CONFIG_CFL
   if (plane == AOM_PLANE_Y && x->cfl_store_y) {
-    cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size);
+    // TODO(ltrudeau) Store sub-8x8 inter blocks when the bottom-right block is
+    // intra predicted.
+    cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size, plane_bsize);
   }
 #endif
 }
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 0729dbe..75c640d 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1822,7 +1822,9 @@
     const int dst_stride = pd->dst.stride;
     uint8_t *dst =
         &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-    cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size);
+    // TODO(ltrudeau) Store sub-8x8 inter blocks when the bottom-right block is
+    // intra predicted.
+    cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size, plane_bsize);
   }
 #endif
 #if CONFIG_DPCM_INTRA
@@ -9107,7 +9109,14 @@
     // so we can store reconstructed luma values
     RD_STATS this_rd_stats;
 
+#if CONFIG_CB4X4
+    // Don't store the luma value if no chroma is associated with it.
+    // Don't worry, we will store this reconstructed luma in the following
+    // encode dry-run; the chroma plane will never know.
+    x->cfl_store_y = !x->skip_chroma_rd;
+#else
     x->cfl_store_y = 1;
+#endif
 
     txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y,
                      mbmi->sb_type, mbmi->tx_size,