[CFL] Adjust Pixel Buffer for Chroma Sub8x8

Adjust the row and col offsets for sub8x8 blocks to allow the CfL
prediction to use all available reconstructed luma pixels.
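
For example, the buffer offset of each 4X4 block inside a subsampled
8X8 area is recovered from the parity of mi_row/mi_col. A sketch of the
logic added to cfl_store() below (bw, bh, subsampling_x and
subsampling_y as in the patch):

  if ((mi_row & 1) && bh == 4 && subsampling_y) row++;
  if ((mi_col & 1) && bw == 4 && subsampling_x) col++;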

Results on Subset 1 (Compared to b03c2f44 with CfL)

   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.1355 | -0.8517 | -0.4481 |  -0.0579 | -0.0237 | -0.0203 |    -0.2765

Change-Id: Ia91f0a078f0ff4f28bb2d272b096f579e0d04dac
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index 093ceb5..135b308 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -177,6 +177,8 @@
     sum_v += height * 129;
   }
 
+  // TODO(ltrudeau) Because of max_block_wide and max_block_high, num_pel will
+  // not be a power of two. So these divisions will have to use a lookup table.
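+  // One possible shape for that change (sketch only; cfl_div_lut and
+  // CFL_DIV_SHIFT are hypothetical names): replace each division with a
+  // multiply by a precomputed reciprocal and a shift, e.g.
+  //   dc_pred = (sum * cfl_div_lut[num_pel]) >> CFL_DIV_SHIFT;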
   cfl->dc_pred[CFL_PRED_U] = sum_u / num_pel;
   cfl->dc_pred[CFL_PRED_V] = sum_v / num_pel;
 }
@@ -245,26 +247,48 @@
 }
 
 void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, int row,
-               int col, TX_SIZE tx_size) {
+               int col, TX_SIZE tx_size, BLOCK_SIZE bsize) {
   const int tx_width = tx_size_wide[tx_size];
   const int tx_height = tx_size_high[tx_size];
   const int tx_off_log2 = tx_size_wide_log2[0];
 
-  // Store the input into the CfL pixel buffer
-  uint8_t *y_pix = &cfl->y_pix[(row * MAX_SB_SIZE + col) << tx_off_log2];
+#if CONFIG_CHROMA_SUB8X8
+  if (bsize < BLOCK_8X8) {
+#if CONFIG_DEBUG
+    // Transforms cannot be smaller than 4X4.
+    assert(tx_width >= 4);
+    assert(tx_height >= 4);
+#endif
 
-  // Check that we remain inside the pixel buffer.
-  assert(MAX_SB_SIZE * (row + tx_height - 1) + col + tx_width - 1 <
-         MAX_SB_SQUARE);
+    const int bw = block_size_wide[bsize];
+    const int bh = block_size_high[bsize];
 
-  // TODO(ltrudeau) Speedup possible by moving the downsampling to cfl_store
-  for (int j = 0; j < tx_height; j++) {
-    for (int i = 0; i < tx_width; i++) {
-      y_pix[i] = input[i];
+    // For chroma_sub8x8, the CfL prediction for prediction blocks smaller
+    // than 8X8 uses the reconstructed luma pixels of non-chroma-reference
+    // blocks. To do so, we combine the 4X4 non-chroma-reference blocks into
+    // the CfL pixel buffer based on their row and column index.
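+    // For example, with 4:2:0 subsampling, the four 4X4 luma blocks covering
+    // an 8X8 area end up at offsets (0,0), (0,4), (4,0) and (4,4) of the
+    // pixel buffer once row and col are shifted by tx_off_log2 below.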
+
+    // The following code is adapted from the is_chroma_reference() function.
+    if ((cfl->mi_row &
+         0x01)        // Increment the row index for odd-indexed 4X4 blocks
+        && (bh == 4)  // But not for 4X8 blocks
+        && cfl->subsampling_y) {  // And only when chroma is subsampled
+      assert(row == 0);
+      row++;
     }
-    y_pix += MAX_SB_SIZE;
-    input += input_stride;
+
+    if ((cfl->mi_col &
+         0x01)        // Increment the col index for odd-indexed 4X4 blocks
+        && (bw == 4)  // But not for 8X4 blocks
+        && cfl->subsampling_x) {  // And only when chroma is subsampled
+      assert(col == 0);
+      col++;
+    }
   }
+#endif
+
+  // Invalidate current parameters
+  cfl->are_parameters_computed = 0;
 
   // Store the surface of the pixel buffer that was written to; this way we
   // can manage chroma overrun (e.g. when the chroma surface goes beyond the
@@ -277,8 +301,21 @@
     cfl->y_height = OD_MAXI((row << tx_off_log2) + tx_height, cfl->y_height);
   }
 
-  // Invalidate current parameters
-  cfl->are_parameters_computed = 0;
+  // Check that we will remain inside the pixel buffer.
+  assert((row << tx_off_log2) + tx_height <= MAX_SB_SIZE);
+  assert((col << tx_off_log2) + tx_width <= MAX_SB_SIZE);
+
+  // Store the input into the CfL pixel buffer
+  uint8_t *y_pix = &cfl->y_pix[(row * MAX_SB_SIZE + col) << tx_off_log2];
+
+  // TODO(ltrudeau) Speedup possible by moving the downsampling to cfl_store
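+  // (One way to do that for 4:2:0 would be to average each 2x2 group of luma
+  // pixels here instead of copying them verbatim and downsampling at
+  // prediction time; sketch of the idea only, not part of this change.)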
+  for (int j = 0; j < tx_height; j++) {
+    for (int i = 0; i < tx_width; i++) {
+      y_pix[i] = input[i];
+    }
+    y_pix += MAX_SB_SIZE;
+    input += input_stride;
+  }
 }
 
 void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {