[CFL] Store luma as prediction for chroma

Stores the reconstructed luma pixels for each transform block inside a
prediction block. Rectangular transform blocks are supported.

As for RDO, after all the modes have been tested for luma, an extra
encoding is performed in order to store the reconstructed pixel values of
the best mode. These values are then used for RDO on the chromatic
planes.

Change-Id: I354d9827e32fd41065f1b2ce02832d943a6fa156
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index f0c30e3..fc1958b 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -531,6 +531,12 @@
 
 #if CONFIG_CFL
 typedef struct {
+  // Pixel buffer containing the luma pixels used as prediction for chroma
+  uint8_t y_pix[MAX_SB_SQUARE];
+
+  // Height and width of the luma prediction block currently in the pixel buffer
+  int y_height, y_width;
+
   // CfL Performs its own block level DC_PRED for each chromatic plane
   int dc_pred[CFL_PRED_PLANES];
 } CFL_CTX;
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index 466349e..53e117e 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -84,3 +84,49 @@
     dst += dst_stride;
   }
 }
+
+// Stores the reconstructed luma pixels of one transform block in the CfL pixel
+// buffer, where they later serve as prediction for chroma.
+//
+// cfl:          CfL context owning the pixel buffer (rows of MAX_SB_SIZE).
+// input:        top-left reconstructed luma pixel of the transform block.
+// input_stride: distance in pixels between two rows of input.
+// row, col:     position of the transform block, in units of
+//               (1 << tx_size_wide_log2[0]) pixels.
+// tx_size:      size of the transform block, which may be rectangular.
+void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, int row,
+               int col, TX_SIZE tx_size) {
+  const int tx_width = tx_size_wide[tx_size];
+  const int tx_height = tx_size_high[tx_size];
+  const int tx_off_log2 = tx_size_wide_log2[0];
+
+  // row and col count minimum size transform blocks, convert them to pixels.
+  const int pix_row = row << tx_off_log2;
+  const int pix_col = col << tx_off_log2;
+
+  // Check that we remain inside the pixel buffer. The check is done in pixel
+  // units, matching the offset used to address the buffer below.
+  assert(MAX_SB_SIZE * (pix_row + tx_height - 1) + pix_col + tx_width - 1 <
+         MAX_SB_SQUARE);
+
+  // Store the input into the CfL pixel buffer
+  uint8_t *y_pix = &cfl->y_pix[pix_row * MAX_SB_SIZE + pix_col];
+  for (int j = 0; j < tx_height; j++) {
+    for (int i = 0; i < tx_width; i++) {
+      y_pix[i] = input[i];
+    }
+    y_pix += MAX_SB_SIZE;
+    input += input_stride;
+  }
+
+  // Store the surface of the pixel buffer that was written to, this way we
+  // can manage chroma overrun (e.g. when the chroma surface goes beyond the
+  // frame boundary)
+  if (col == 0 && row == 0) {
+    cfl->y_width = tx_width;
+    cfl->y_height = tx_height;
+  } else {
+    cfl->y_width = OD_MAXI(pix_col + tx_width, cfl->y_width);
+    cfl->y_height = OD_MAXI(pix_row + tx_height, cfl->y_height);
+  }
+}
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index 959011d..3edcb0f 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -20,4 +20,7 @@
 void cfl_predict_block(uint8_t *dst, int dst_stride, TX_SIZE tx_size,
                        int dc_pred);
 
+void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, int row,
+               int col, TX_SIZE tx_size);
+
 #endif  // AV1_COMMON_CFL_H_
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index 3eb51d4..194daf0 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -535,6 +535,7 @@
 #endif
 #if CONFIG_CFL
     xd->cfl = cfl;
+    memset(&cfl->y_pix, 0, sizeof(uint8_t) * MAX_SB_SQUARE);
 #endif
     xd->above_context[i] = cm->above_context[i];
     if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 0b72aaf..4353bb6 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -79,6 +79,10 @@
 #include "av1/encoder/hybrid_fwd_txfm.h"
 #endif
 
+#if CONFIG_CFL
+#include "av1/common/cfl.h"
+#endif
+
 static struct aom_read_bit_buffer *init_read_bit_buffer(
     AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
     const uint8_t *data_end, uint8_t clear_data[MAX_AV1_HEADER_SIZE]);
@@ -564,6 +568,14 @@
     av1_pvq_decode_helper2(cm, xd, mbmi, plane, row, col, tx_size, tx_type);
 #endif
   }
+#if CONFIG_CFL
+  if (plane == AOM_PLANE_Y) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    uint8_t *dst =
+        &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
+    cfl_store(xd->cfl, dst, pd->dst.stride, row, col, tx_size);
+  }
+#endif
 }
 
 #if CONFIG_VAR_TX && !CONFIG_COEF_INTERLEAVE
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 28bbaf4..39e08d5 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -228,6 +228,10 @@
   // 4x4 blocks are coded.
   int rate_4x4[256];
 #endif
+#if CONFIG_CFL
+  // Whether luma needs to be stored during RDO.
+  int cfl_store_y;
+#endif
 };
 
 #ifdef __cplusplus
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index b943b64..6fb7c85 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -1869,6 +1869,10 @@
   x->pvq_speed = 1;
   x->pvq_coded = 0;
 #endif
+#if CONFIG_CFL
+  // Don't store luma during RDO (we will store the best mode later).
+  x->cfl_store_y = 0;
+#endif
 
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
   mbmi = &xd->mi[0]->mbmi;
@@ -4574,6 +4578,10 @@
   *rate_nocoef = best_rate_nocoef;
 #endif  // CONFIG_SUPERTX
 
+#if CONFIG_CFL
+  // Store the luma for the best mode
+  x->cfl_store_y = 1;
+#endif
   if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
       pc_tree->index != 3) {
     if (bsize == cm->sb_size) {
@@ -4587,6 +4595,9 @@
                 pc_tree, NULL);
     }
   }
+#if CONFIG_CFL
+  x->cfl_store_y = 0;
+#endif
 
   if (bsize == cm->sb_size) {
 #if !CONFIG_PVQ && !CONFIG_LV_MAP
@@ -5036,6 +5047,7 @@
 
 #if CONFIG_CFL
   td->mb.e_mbd.cfl = &this_tile->cfl;
+  memset(&this_tile->cfl.y_pix, 0, sizeof(uint8_t) * MAX_SB_SQUARE);
 #endif
 
 #if CONFIG_PVQ
@@ -5921,6 +5933,9 @@
   x->pvq_speed = 0;
   x->pvq_coded = (dry_run == OUTPUT_ENABLED) ? 1 : 0;
 #endif
+#if CONFIG_CFL
+  x->cfl_store_y = (dry_run == OUTPUT_ENABLED) ? 1 : 0;
+#endif
 
   if (!is_inter) {
     int plane;
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index f9208f5..dbb7525 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -38,6 +38,10 @@
 #include "av1/encoder/pvq_encoder.h"
 #endif
 
+#if CONFIG_CFL
+#include "av1/common/cfl.h"
+#endif
+
 // Check if one needs to use c version subtraction.
 static int check_subtract_block_size(int w, int h) { return w < 4 || h < 4; }
 
@@ -1475,6 +1479,11 @@
 #else
 // Note : *(args->skip) == mbmi->skip
 #endif
+#if CONFIG_CFL
+  if (plane == AOM_PLANE_Y && x->cfl_store_y) {
+    cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size);
+  }
+#endif
 }
 
 void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x,
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 5528dc3..13837b3 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -537,6 +537,10 @@
   xd->mi = cm->mi_grid_visible;
   xd->mi[0] = cm->mi;
 
+#if CONFIG_CFL
+  // Don't store luma on the first pass since chroma is not computed
+  x->cfl_store_y = 0;
+#endif
   av1_frame_init_quantizer(cpi);
 
 #if CONFIG_PVQ
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 458f2b4..f13c585 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -3714,6 +3714,16 @@
   od_encode_rollback(&x->daala_enc, &post_buf);
 #endif  // CONFIG_PVQ
 
+#if CONFIG_CFL
+  // Perform one extra txfm_rd_in_plane() call, this time with the best value so
+  // we can store reconstructed luma values
+  RD_STATS this_rd_stats;
+  x->cfl_store_y = 1;
+  txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, 0, bsize,
+                   mic->mbmi.tx_size, cpi->sf.use_fast_coef_costing);
+  x->cfl_store_y = 0;
+#endif
+
 #if CONFIG_PALETTE
   if (try_palette) {
     rd_pick_palette_intra_sby(cpi, x, bsize, palette_y_mode_ctx,