[CFL] Store DC_PRED as a block instead of as a single value

This change does not alter the bitstream. It simplifies a subsequent commit
that removes the custom DC_PRED used by CfL. To use the DC_PRED in AV1, CfL
must treat DC_PRED as a block of values instead of a single value.

Results on Subset1 (compared to the previous commit with CfL enabled)
  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0000 |  0.0000 |  0.0000 |   0.0000 | 0.0000 |  0.0000 |     0.0000

https://arewecompressedyet.com/?job=master%402017-11-03T15%3A57%3A30.643Z&job=cfl-pixel-DC_PRED%402017-11-03T15%3A59%3A03.304Z
Change-Id: I75f981ab93ab1808450f8280bfbabde76ea5b7fe
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index 9593b4b..fc15a7a 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -159,7 +159,8 @@
 
 // CfL computes its own block-level DC_PRED. This is required to compute both
 // alpha_cb and alpha_cr before the prediction are computed.
-static void cfl_dc_pred(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize) {
+static void cfl_dc_pred(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize,
+                        TX_SIZE tx_size) {
   CFL_CTX *const cfl = xd->cfl;
 
   // Compute DC_PRED until block boundary. We can't assume the neighbor will use
@@ -201,8 +202,22 @@
 
   // TODO(ltrudeau) Because of max_block_wide and max_block_high, num_pel will
   // not be a power of two. So these divisions will have to use a lookup table.
-  cfl->dc_pred[CFL_PRED_U] = (sum_u + (num_pel >> 1)) / num_pel;
-  cfl->dc_pred[CFL_PRED_V] = (sum_v + (num_pel >> 1)) / num_pel;
+  const int16_t dc_pred_u = (sum_u + (num_pel >> 1)) / num_pel;
+  const int16_t dc_pred_v = (sum_v + (num_pel >> 1)) / num_pel;
+  const int blk_width =
+      max_intra_block_width(xd, plane_bsize, AOM_PLANE_U, tx_size);
+  const int blk_height =
+      max_intra_block_height(xd, plane_bsize, AOM_PLANE_U, tx_size);
+  int16_t *p_dc_pred_u = cfl->dc_pred[CFL_PRED_U];
+  int16_t *p_dc_pred_v = cfl->dc_pred[CFL_PRED_V];
+  for (int j = 0; j < blk_height; j++) {
+    for (int i = 0; i < blk_width; i++) {
+      p_dc_pred_u[i] = dc_pred_u;
+      p_dc_pred_v[i] = dc_pred_v;
+    }
+    p_dc_pred_u += MAX_SB_SIZE;
+    p_dc_pred_v += MAX_SB_SIZE;
+  }
 }
 
 static void cfl_subtract_averages(CFL_CTX *cfl, TX_SIZE tx_size) {
@@ -257,29 +272,31 @@
 
 static void cfl_build_prediction_lbd(const int16_t *pred_buf_q3, uint8_t *dst,
                                      int dst_stride, int width, int height,
-                                     int alpha_q3, int16_t dc_pred) {
+                                     int alpha_q3, const int16_t *dc_pred) {
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i++) {
       dst[i] =
-          clip_pixel(get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]) + dc_pred);
+          clip_pixel(get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]) + dc_pred[i]);
     }
     dst += dst_stride;
     pred_buf_q3 += MAX_SB_SIZE;
+    dc_pred += MAX_SB_SIZE;
   }
 }
 
 #if CONFIG_HIGHBITDEPTH
 static void cfl_build_prediction_hbd(const int16_t *pred_buf_q3, uint16_t *dst,
                                      int dst_stride, int width, int height,
-                                     int alpha_q3, int16_t dc_pred,
+                                     int alpha_q3, const int16_t *dc_pred,
                                      int bit_depth) {
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i++) {
       dst[i] = clip_pixel_highbd(
-          get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]) + dc_pred, bit_depth);
+          get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]) + dc_pred[i], bit_depth);
     }
     dst += dst_stride;
     pred_buf_q3 += MAX_SB_SIZE;
+    dc_pred += MAX_SB_SIZE;
   }
 }
 #endif  // CONFIG_HIGHBITDEPTH
@@ -294,6 +311,8 @@
 
   const int16_t *pred_buf_q3 =
       cfl->pred_buf_q3 + ((row * MAX_SB_SIZE + col) << tx_size_wide_log2[0]);
+  const int16_t *dc_pred = cfl->dc_pred[plane - 1] +
+                           ((row * MAX_SB_SIZE + col) << tx_size_wide_log2[0]);
   const int alpha_q3 =
       cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
 
@@ -302,13 +321,12 @@
     uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
     cfl_build_prediction_hbd(pred_buf_q3, dst_16, dst_stride,
                              tx_size_wide[tx_size], tx_size_high[tx_size],
-                             alpha_q3, cfl->dc_pred[plane - 1], xd->bd);
+                             alpha_q3, dc_pred, xd->bd);
     return;
   }
 #endif  // CONFIG_HIGHBITDEPTH
   cfl_build_prediction_lbd(pred_buf_q3, dst, dst_stride, tx_size_wide[tx_size],
-                           tx_size_high[tx_size], alpha_q3,
-                           cfl->dc_pred[plane - 1]);
+                           tx_size_high[tx_size], alpha_q3, dc_pred);
 }
 
 static void cfl_luma_subsampling_420_lbd(const uint8_t *input, int input_stride,
@@ -637,7 +655,7 @@
   cfl->uv_height =
       max_intra_block_height(xd, plane_bsize, AOM_PLANE_U, tx_size);
 
-  cfl_dc_pred(xd, plane_bsize);
+  cfl_dc_pred(xd, plane_bsize, tx_size);
   cfl_subtract_averages(cfl, tx_size);
   cfl->are_parameters_computed = 1;
 }