[CFL] Cap Averaging Area

In order to address hardware concerns regarding luma averaging, this patch
caps the maximum length of the sides of the luma averaging area at 32.

This proposal was accepted by the hardware working group on
November 20th, 2017.

Regression on Subset1:

   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
 0.0078 |  0.0572 | -0.0705 |  -0.0272 | -0.0202 | -0.0391 |     0.0375

Change-Id: I875a6f2114df4d857ed66c4690ee08da2df426e4

diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index d930ec2..e3b6308 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c

@@ -70,38 +70,42 @@
 static void cfl_subtract_averages(CFL_CTX *cfl, TX_SIZE tx_size) {
   const int width = cfl->uv_width;
   const int height = cfl->uv_height;
-  const int tx_height = tx_size_high[tx_size];
-  const int tx_width = tx_size_wide[tx_size];
-  const int block_row_stride = MAX_SB_SIZE << tx_size_high_log2[tx_size];
-  const int num_pel_log2 =
-      (tx_size_high_log2[tx_size] + tx_size_wide_log2[tx_size]);
+
+  const int avg_height_log2 =
+      AOMMIN(tx_size_high_log2[tx_size], 1 - cfl->subsampling_y + 4);
+  const int avg_width_log2 =
+      AOMMIN(tx_size_wide_log2[tx_size], 1 - cfl->subsampling_x + 4);
+  const int avg_height = 1 << avg_height_log2;
+  const int avg_width = 1 << avg_width_log2;
+  const int num_pel_log2 = avg_height_log2 + avg_width_log2;
+  const int block_row_stride = MAX_SB_SIZE << avg_height_log2;
 
   int16_t *pred_buf_q3 = cfl->pred_buf_q3;
 
   cfl_pad(cfl, width, height);
 
-  for (int b_j = 0; b_j < height; b_j += tx_height) {
-    for (int b_i = 0; b_i < width; b_i += tx_width) {
+  for (int b_j = 0; b_j < height; b_j += avg_height) {
+    for (int b_i = 0; b_i < width; b_i += avg_width) {
       int sum_q3 = 0;
-      int16_t *tx_pred_buf_q3 = pred_buf_q3;
-      for (int t_j = 0; t_j < tx_height; t_j++) {
-        for (int t_i = b_i; t_i < b_i + tx_width; t_i++) {
-          sum_q3 += tx_pred_buf_q3[t_i];
+      int16_t *avg_pred_buf_q3 = pred_buf_q3;
+      for (int t_j = 0; t_j < avg_height; t_j++) {
+        for (int t_i = b_i; t_i < b_i + avg_width; t_i++) {
+          sum_q3 += avg_pred_buf_q3[t_i];
         }
-        tx_pred_buf_q3 += MAX_SB_SIZE;
+        avg_pred_buf_q3 += MAX_SB_SIZE;
       }
       int avg_q3 = (sum_q3 + (1 << (num_pel_log2 - 1))) >> num_pel_log2;
       // Loss is never more than 1/2 (in Q3)
       assert(abs((avg_q3 * (1 << num_pel_log2)) - sum_q3) <=
              1 << num_pel_log2 >> 1);
 
-      tx_pred_buf_q3 = pred_buf_q3;
-      for (int t_j = 0; t_j < tx_height; t_j++) {
-        for (int t_i = b_i; t_i < b_i + tx_width; t_i++) {
-          tx_pred_buf_q3[t_i] -= avg_q3;
+      avg_pred_buf_q3 = pred_buf_q3;
+      for (int t_j = 0; t_j < avg_height; t_j++) {
+        for (int t_i = b_i; t_i < b_i + avg_width; t_i++) {
+          avg_pred_buf_q3[t_i] -= avg_q3;
         }
 
-        tx_pred_buf_q3 += MAX_SB_SIZE;
+        avg_pred_buf_q3 += MAX_SB_SIZE;
       }
     }
     pred_buf_q3 += block_row_stride;