[CFL] Fewer bits for fixed point

Since alpha is Q3, we reduce y_average from Q10 to Q3. As such, the
prediction is reduced from Q13 to Q6. Chroma dc_pred is reduced from Q7
to Q6 in order to match with the prediction.

Results on Subset1 (compared to 209de2e5b with CfL enabled)

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0010 |  0.0176 | -0.0538 |  -0.0043 | 0.0027 | -0.0097 |    -0.0018

Change-Id: Ib7dd3968a764e0380ddc0ad2333ebacf1e9699cd
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index b9e356b..1ba6dda 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -179,8 +179,14 @@
 
   // TODO(ltrudeau) Because of max_block_wide and max_block_high, num_pel will
   // not be a power of two. So these divisions will have to use a lookup table.
-  cfl->dc_pred_q7[CFL_PRED_U] = (sum_u << 7) / num_pel;
-  cfl->dc_pred_q7[CFL_PRED_V] = (sum_v << 7) / num_pel;
+  cfl->dc_pred_q6[CFL_PRED_U] = ((sum_u << 6) + (num_pel >> 1)) / num_pel;
+  cfl->dc_pred_q6[CFL_PRED_V] = ((sum_v << 6) + (num_pel >> 1)) / num_pel;
+
+  // Loss is never more than 1/2 (in Q6)
+  assert(fabs(cfl->dc_pred_q6[CFL_PRED_U] - (sum_u / ((double)num_pel) * 64)) <=
+         0.5);
+  assert(fabs(cfl->dc_pred_q6[CFL_PRED_V] - (sum_v / ((double)num_pel) * 64)) <=
+         0.5);
 }
 
 static void cfl_compute_averages(CFL_CTX *cfl, TX_SIZE tx_size) {
@@ -197,7 +203,7 @@
   const uint8_t *y_pix = cfl->y_down_pix;
   // TODO(ltrudeau) Convert to uint16 for HBD support
   const uint8_t *t_y_pix;
-  int *averages_q10 = cfl->y_averages_q10;
+  int *averages_q3 = cfl->y_averages_q3;
 
   cfl_load(cfl, 0, 0, width, height);
 
@@ -212,11 +218,12 @@
         }
         t_y_pix += MAX_SB_SIZE;
       }
-      averages_q10[a++] = (sum << 10) >> num_pel_log2;
+      averages_q3[a++] =
+          ((sum << 3) + (1 << (num_pel_log2 - 1))) >> num_pel_log2;
 
-      // Assert no loss from fixed point
-      assert((double)averages_q10[a - 1] ==
-             (sum / ((double)(1 << num_pel_log2))) * (1 << 10));
+      // Loss is never more than 1/2 (in Q3)
+      assert(fabs((double)averages_q3[a - 1] -
+                  (sum / ((double)(1 << num_pel_log2))) * (1 << 3)) <= 0.5);
     }
     assert(a % stride == 0);
     y_pix += block_row_stride;
@@ -253,7 +260,7 @@
   // TODO(ltrudeau) Convert to uint16 to support HBD
   const uint8_t *y_pix = cfl->y_down_pix;
 
-  const int dc_pred_bias_q13 = (cfl->dc_pred_q7[plane - 1] << 6) + (1 << 12);
+  const int dc_pred_bias_q6 = cfl->dc_pred_q6[plane - 1] + 32;
   const double alpha = cfl_idx_to_alpha(
       mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs[plane - 1], plane - 1);
   // TODO(ltrudeau) Convert alpha to fixed point.
@@ -263,23 +270,23 @@
       (row << tx_size_wide_log2[0]) >> tx_size_wide_log2[tx_size];
   const int avg_col =
       (col << tx_size_high_log2[0]) >> tx_size_high_log2[tx_size];
-  const int avg_q10 =
-      cfl->y_averages_q10[cfl->y_averages_stride * avg_row + avg_col];
+  const int avg_q3 =
+      cfl->y_averages_q3[cfl->y_averages_stride * avg_row + avg_col];
 
   cfl_load(cfl, row, col, width, height);
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i++) {
-      const int pred_q13 =
-          get_scaled_luma_q13(alpha_q3, y_pix[i], avg_q10) + dc_pred_bias_q13;
+      const int pred_q6 =
+          get_scaled_luma_q6(alpha_q3, y_pix[i], avg_q3) + dc_pred_bias_q6;
       // TODO(ltrudeau) Manage HBD.
-      if (pred_q13 <= 0) {
+      if (pred_q6 <= 0) {
         dst[i] = 0;
-      } else if (pred_q13 > (255 << 13)) {
+      } else if (pred_q6 > (255 << 6)) {
         dst[i] = 255;
       } else {
-        dst[i] = (uint8_t)(pred_q13 >> 13);
-        assert(dst[i] == (int)(alpha * (y_pix[i] - (avg_q10 / 1024.0)) +
-                               (cfl->dc_pred_q7[plane - 1] / 128.0) + 0.5));
+        dst[i] = (uint8_t)(pred_q6 >> 6);
+        assert(dst[i] == (int)(alpha * (y_pix[i] - (avg_q3 / 8.0)) +
+                               (cfl->dc_pred_q6[plane - 1] / 64.0) + 0.5));
       }
     }
     dst += dst_stride;