[CFL] Compute Luma Average Over Partition Unit

Extract the computation of the luma reconstructed average out of cfl_load
and into cfl_compute_average. The reconstructed luma average is stored
in the CFL_CONTEXT to avoid computing it for each transform block and
for each plane.

Results on subset1 (compared to 803bea2 with CfL)
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.0474 | -0.1486 | -0.2931 |  -0.0358 | -0.0397 | -0.0127 |    -0.1162

Change-Id: I9e34af0fe5961ce8dbe70cb80aea2a16221d0d92
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index 004bbeb..bce44ea 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -28,7 +28,7 @@
 
 // CfL computes its own block-level DC_PRED. This is required to compute both
 // alpha_cb and alpha_cr before the prediction are computed.
-void cfl_dc_pred(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize) {
+void cfl_dc_pred(MACROBLOCKD *xd, int width, int height) {
   const struct macroblockd_plane *const pd_u = &xd->plane[AOM_PLANE_U];
   const struct macroblockd_plane *const pd_v = &xd->plane[AOM_PLANE_V];
 
@@ -38,12 +38,8 @@
   const int dst_u_stride = pd_u->dst.stride;
   const int dst_v_stride = pd_v->dst.stride;
 
-  assert(plane_bsize != BLOCK_INVALID);
-  const int block_width = block_size_wide[plane_bsize];
-  const int block_height = block_size_high[plane_bsize];
-
   // Number of pixel on the top and left borders.
-  const double num_pel = block_width + block_height;
+  const double num_pel = width + height;
 
   int sum_u = 0;
   int sum_v = 0;
@@ -64,13 +60,13 @@
   if (xd->up_available && xd->mb_to_right_edge >= 0) {
 #endif
     // TODO(ltrudeau) replace this with DC_PRED assembly
-    for (int i = 0; i < block_width; i++) {
+    for (int i = 0; i < width; i++) {
       sum_u += dst_u[-dst_u_stride + i];
       sum_v += dst_v[-dst_v_stride + i];
     }
   } else {
-    sum_u = block_width * 127;
-    sum_v = block_width * 127;
+    sum_u = width * 127;
+    sum_v = width * 127;
   }
 
 #if CONFIG_CHROMA_SUB8X8
@@ -78,27 +74,40 @@
 #else
   if (xd->left_available && xd->mb_to_bottom_edge >= 0) {
 #endif
-    for (int i = 0; i < block_height; i++) {
+    for (int i = 0; i < height; i++) {
       sum_u += dst_u[i * dst_u_stride - 1];
       sum_v += dst_v[i * dst_v_stride - 1];
     }
   } else {
-    sum_u += block_height * 129;
-    sum_v += block_height * 129;
+    sum_u += height * 129;
+    sum_v += height * 129;
   }
 
   xd->cfl->dc_pred[CFL_PRED_U] = sum_u / num_pel;
   xd->cfl->dc_pred[CFL_PRED_V] = sum_v / num_pel;
 }
 
+double cfl_compute_average(uint8_t *y_pix, int y_stride, int width,
+                           int height) {
+  int sum = 0;
+  for (int j = 0; j < height; j++) {
+    for (int i = 0; i < width; i++) {
+      sum += y_pix[i];
+    }
+    y_pix += y_stride;
+  }
+  return sum / (double)(width * height);
+}
+
 // Predict the current transform block using CfL.
 void cfl_predict_block(const CFL_CTX *cfl, uint8_t *dst, int dst_stride,
                        int row, int col, TX_SIZE tx_size, double dc_pred,
                        double alpha) {
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
+  const double y_avg = cfl->y_avg;
 
-  const double y_avg = cfl_load(cfl, dst, dst_stride, row, col, width, height);
+  cfl_load(cfl, dst, dst_stride, row, col, width, height);
 
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i++) {
@@ -142,8 +151,8 @@
 }
 
 // Load from the CfL pixel buffer into output
-double cfl_load(const CFL_CTX *cfl, uint8_t *output, int output_stride, int row,
-                int col, int width, int height) {
+void cfl_load(const CFL_CTX *cfl, uint8_t *output, int output_stride, int row,
+              int col, int width, int height) {
   const int sub_x = cfl->subsampling_x;
   const int sub_y = cfl->subsampling_y;
   const int tx_off_log2 = tx_size_wide_log2[0];
@@ -226,14 +235,4 @@
       output_row_offset += output_stride;
     }
   }
-
-  int avg = 0;
-  output_row_offset = 0;
-  for (int j = 0; j < height; j++) {
-    for (int i = 0; i < width; i++) {
-      avg += output[output_row_offset + i];
-    }
-    output_row_offset += output_stride;
-  }
-  return avg / (double)(width * height);
 }
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index b5b77cc..239647d 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -31,6 +31,9 @@
   // Height and width of the luma prediction block currently in the pixel buffer
   int y_height, y_width;
 
+  // Average of the luma reconstructed values over the entire prediction unit
+  double y_avg;
+
   // Chroma subsampling
   int subsampling_x, subsampling_y;
 
@@ -59,7 +62,9 @@
 
 void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm);
 
-void cfl_dc_pred(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize);
+void cfl_dc_pred(MACROBLOCKD *xd, int width, int height);
+
+double cfl_compute_average(uint8_t *y_pix, int y_stride, int width, int height);
 
 static INLINE double cfl_idx_to_alpha(int alpha_idx, CFL_SIGN_TYPE alpha_sign,
                                       CFL_PRED_TYPE pred_type) {
@@ -81,6 +86,6 @@
 void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, int row,
                int col, TX_SIZE tx_size);
 
-double cfl_load(const CFL_CTX *cfl, uint8_t *output, int output_stride, int row,
-                int col, int width, int height);
+void cfl_load(const CFL_CTX *cfl, uint8_t *output, int output_stride, int row,
+              int col, int width, int height);
 #endif  // AV1_COMMON_CFL_H_
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index 874508b..dabeb42 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -898,6 +898,24 @@
   return max_blocks_high >> tx_size_wide_log2[0];
 }
 
+#if CONFIG_CFL
+static INLINE int max_intra_block_width(const MACROBLOCKD *xd,
+                                        BLOCK_SIZE plane_bsize, int plane,
+                                        TX_SIZE tx_size) {
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane)
+                              << tx_size_wide_log2[0];
+  return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]);
+}
+
+static INLINE int max_intra_block_height(const MACROBLOCKD *xd,
+                                         BLOCK_SIZE plane_bsize, int plane,
+                                         TX_SIZE tx_size) {
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane)
+                              << tx_size_high_log2[0];
+  return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]);
+}
+#endif  // CONFIG_CFL
+
 static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
                                           int mi_col_start, int mi_col_end) {
   const int width = mi_col_end - mi_col_start;
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index f336ccb..0d56f9f 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -2526,22 +2526,35 @@
                           mode, dst, dst_stride, dst, dst_stride, blk_col,
                           blk_row, plane);
 #if CONFIG_CFL
+  CFL_CTX *const cfl = xd->cfl;
   if (plane != AOM_PLANE_Y && mbmi->uv_mode == DC_PRED) {
     if (plane == AOM_PLANE_U && blk_col == 0 && blk_row == 0) {
-// Compute the block-level DC_PRED for both chromatic planes. DC_PRED replaces
-// beta in the linear model.
 #if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
       const BLOCK_SIZE plane_bsize =
           AOMMAX(BLOCK_4X4, get_plane_block_size(mbmi->sb_type, pd));
 #else
       const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, pd);
 #endif
-      cfl_dc_pred(xd, plane_bsize);
+      const int width =
+          max_intra_block_width(xd, plane_bsize, AOM_PLANE_U, tx_size);
+      const int height =
+          max_intra_block_height(xd, plane_bsize, AOM_PLANE_U, tx_size);
+
+      // Temporary pixel buffer used to store the CfL prediction when we compute
+      // the average over the reconstructed and downsampled luma pixels
+      // TODO(ltrudeau) Convert to uint16 when adding HBD support
+      uint8_t tmp_pix[MAX_SB_SQUARE];
+
+      // Compute the block-level DC_PRED for both chromatic planes. DC_PRED
+      // replaces beta in the linear model.
+      cfl_dc_pred(xd, width, height);
+      cfl_load(cfl, tmp_pix, MAX_SB_SIZE, 0, 0, width, height);
+      cfl->y_avg = cfl_compute_average(tmp_pix, MAX_SB_SIZE, width, height);
     }
 
     cfl_predict_block(
-        xd->cfl, dst, pd->dst.stride, blk_row, blk_col, tx_size,
-        xd->cfl->dc_pred[plane - 1],
+        cfl, dst, pd->dst.stride, blk_row, blk_col, tx_size,
+        cfl->dc_pred[plane - 1],
         cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs[plane - 1],
                          plane - 1));
   }
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 2798959..8c9168c 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -1537,7 +1537,8 @@
 }
 
 static int cfl_compute_alpha_ind(MACROBLOCK *const x, const CFL_CTX *const cfl,
-                                 BLOCK_SIZE bsize,
+                                 int width, int height,
+                                 uint8_t y_pix[MAX_SB_SQUARE],
                                  CFL_SIGN_TYPE signs_out[CFL_SIGNS]) {
   const struct macroblock_plane *const p_u = &x->plane[AOM_PLANE_U];
   const struct macroblock_plane *const p_v = &x->plane[AOM_PLANE_V];
@@ -1545,33 +1546,25 @@
   const uint8_t *const src_v = p_v->src.buf;
   const int src_stride_u = p_u->src.stride;
   const int src_stride_v = p_v->src.stride;
-  const int block_width = block_size_wide[bsize];
-  const int block_height = block_size_high[bsize];
   const double dc_pred_u = cfl->dc_pred[CFL_PRED_U];
   const double dc_pred_v = cfl->dc_pred[CFL_PRED_V];
-
-  // Temporary pixel buffer used to store the CfL prediction when we compute the
-  // alpha index.
-  uint8_t tmp_pix[MAX_SB_SQUARE];
-  // Load CfL Prediction over the entire block
-  const double y_avg =
-      cfl_load(cfl, tmp_pix, MAX_SB_SIZE, 0, 0, block_width, block_height);
+  const double y_avg = cfl->y_avg;
 
   int sse[CFL_PRED_PLANES][CFL_MAGS_SIZE];
   sse[CFL_PRED_U][0] =
-      cfl_alpha_dist(tmp_pix, MAX_SB_SIZE, y_avg, src_u, src_stride_u,
-                     block_width, block_height, dc_pred_u, 0, NULL);
+      cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_avg, src_u, src_stride_u, width,
+                     height, dc_pred_u, 0, NULL);
   sse[CFL_PRED_V][0] =
-      cfl_alpha_dist(tmp_pix, MAX_SB_SIZE, y_avg, src_v, src_stride_v,
-                     block_width, block_height, dc_pred_v, 0, NULL);
+      cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_avg, src_v, src_stride_v, width,
+                     height, dc_pred_v, 0, NULL);
   for (int m = 1; m < CFL_MAGS_SIZE; m += 2) {
     assert(cfl_alpha_mags[m + 1] == -cfl_alpha_mags[m]);
     sse[CFL_PRED_U][m] = cfl_alpha_dist(
-        tmp_pix, MAX_SB_SIZE, y_avg, src_u, src_stride_u, block_width,
-        block_height, dc_pred_u, cfl_alpha_mags[m], &sse[CFL_PRED_U][m + 1]);
+        y_pix, MAX_SB_SIZE, y_avg, src_u, src_stride_u, width, height,
+        dc_pred_u, cfl_alpha_mags[m], &sse[CFL_PRED_U][m + 1]);
     sse[CFL_PRED_V][m] = cfl_alpha_dist(
-        tmp_pix, MAX_SB_SIZE, y_avg, src_v, src_stride_v, block_width,
-        block_height, dc_pred_v, cfl_alpha_mags[m], &sse[CFL_PRED_V][m + 1]);
+        y_pix, MAX_SB_SIZE, y_avg, src_v, src_stride_v, width, height,
+        dc_pred_v, cfl_alpha_mags[m], &sse[CFL_PRED_V][m + 1]);
   }
 
   int dist;
@@ -1637,11 +1630,21 @@
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   if (plane != AOM_PLANE_Y && mbmi->uv_mode == DC_PRED) {
     if (blk_col == 0 && blk_row == 0 && plane == AOM_PLANE_U) {
+      const int width =
+          max_intra_block_width(xd, plane_bsize, AOM_PLANE_U, tx_size);
+      const int height =
+          max_intra_block_height(xd, plane_bsize, AOM_PLANE_U, tx_size);
+
+      uint8_t tmp_pix[MAX_SB_SQUARE];
       CFL_CTX *const cfl = xd->cfl;
+
       cfl_update_costs(cfl, ec_ctx);
-      cfl_dc_pred(xd, plane_bsize);
-      mbmi->cfl_alpha_idx =
-          cfl_compute_alpha_ind(x, cfl, plane_bsize, mbmi->cfl_alpha_signs);
+      cfl_dc_pred(xd, width, height);
+      // Load CfL Prediction over the entire block
+      cfl_load(cfl, tmp_pix, MAX_SB_SIZE, 0, 0, width, height);
+      cfl->y_avg = cfl_compute_average(tmp_pix, MAX_SB_SIZE, width, height);
+      mbmi->cfl_alpha_idx = cfl_compute_alpha_ind(
+          x, cfl, width, height, tmp_pix, mbmi->cfl_alpha_signs);
     }
   }
 #if CONFIG_DEBUG