[CFL] Compute Luma Average Over Partition Unit

Extract the compution of the luma reconstructed average out of cfl_load
and into cfl_compute_average. The reconstructed luma average is stored
in the CFL_CONTEXT to avoid computing it for each transform block and
for each plane.

Results on subset1 (compared to 803bea2 with CfL)
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.0474 | -0.1486 | -0.2931 |  -0.0358 | -0.0397 | -0.0127 |    -0.1162

Change-Id: I9e34af0fe5961ce8dbe70cb80aea2a16221d0d92
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index 874508b..dabeb42 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -898,6 +898,24 @@
   return max_blocks_high >> tx_size_wide_log2[0];
 }
 
+#if CONFIG_CFL
+static INLINE int max_intra_block_width(const MACROBLOCKD *xd,
+                                        BLOCK_SIZE plane_bsize, int plane,
+                                        TX_SIZE tx_size) {
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane)
+                              << tx_size_wide_log2[0];
+  return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]);
+}
+
+static INLINE int max_intra_block_height(const MACROBLOCKD *xd,
+                                         BLOCK_SIZE plane_bsize, int plane,
+                                         TX_SIZE tx_size) {
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane)
+                              << tx_size_high_log2[0];
+  return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]);
+}
+#endif  // CONFIG_CFL
+
 static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
                                           int mi_col_start, int mi_col_end) {
   const int width = mi_col_end - mi_col_start;