[CFL] Convert dc_pred to fixed point

The dc_pred values stored in the CfL context are now in Q8.7 fixed point
(the worst-case division is 1/128, hence the 7 fractional bits).

Results on Subset1 (compared to f9684d222 with CfL enabled)

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0118 | -0.0181 | -0.0109 |   0.0086 | 0.0086 |  0.0196 |     0.0018

Change-Id: I0701e04fb76f03eff12ed01fd5fda675fbb15e32
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index 5f5e397..b9e356b 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -133,7 +133,7 @@
   const int height = max_block_high(xd, plane_bsize, AOM_PLANE_U)
                      << tx_size_high_log2[0];
   // Number of pixel on the top and left borders.
-  const double num_pel = width + height;
+  const int num_pel = width + height;
 
   int sum_u = 0;
   int sum_v = 0;
@@ -179,8 +179,8 @@
 
   // TODO(ltrudeau) Because of max_block_wide and max_block_high, num_pel will
   // not be a power of two. So these divisions will have to use a lookup table.
-  cfl->dc_pred[CFL_PRED_U] = sum_u / num_pel;
-  cfl->dc_pred[CFL_PRED_V] = sum_v / num_pel;
+  cfl->dc_pred_q7[CFL_PRED_U] = (sum_u << 7) / num_pel;
+  cfl->dc_pred_q7[CFL_PRED_V] = (sum_v << 7) / num_pel;
 }
 
 static void cfl_compute_averages(CFL_CTX *cfl, TX_SIZE tx_size) {
@@ -253,22 +253,34 @@
   // TODO(ltrudeau) Convert to uint16 to support HBD
   const uint8_t *y_pix = cfl->y_down_pix;
 
-  const double dc_pred = cfl->dc_pred[plane - 1];
+  const int dc_pred_bias_q13 = (cfl->dc_pred_q7[plane - 1] << 6) + (1 << 12);
   const double alpha = cfl_idx_to_alpha(
       mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs[plane - 1], plane - 1);
+  // TODO(ltrudeau) Convert alpha to fixed point.
+  const int alpha_q3 = (int)(alpha * 8);
 
   const int avg_row =
       (row << tx_size_wide_log2[0]) >> tx_size_wide_log2[tx_size];
   const int avg_col =
       (col << tx_size_high_log2[0]) >> tx_size_high_log2[tx_size];
-  const double avg =
-      cfl->y_averages_q10[cfl->y_averages_stride * avg_row + avg_col] / 1024.0;
+  const int avg_q10 =
+      cfl->y_averages_q10[cfl->y_averages_stride * avg_row + avg_col];
 
   cfl_load(cfl, row, col, width, height);
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i++) {
-      // TODO(ltrudeau) call clip_pixel_highbd when HBD is enabled.
-      dst[i] = clip_pixel((int)(alpha * (y_pix[i] - avg) + dc_pred + 0.5));
+      const int pred_q13 =
+          get_scaled_luma_q13(alpha_q3, y_pix[i], avg_q10) + dc_pred_bias_q13;
+      // TODO(ltrudeau) Manage HBD.
+      if (pred_q13 <= 0) {
+        dst[i] = 0;
+      } else if (pred_q13 > (255 << 13)) {
+        dst[i] = 255;
+      } else {
+        dst[i] = (uint8_t)(pred_q13 >> 13);
+        assert(dst[i] == (int)(alpha * (y_pix[i] - (avg_q10 / 1024.0)) +
+                               (cfl->dc_pred_q7[plane - 1] / 128.0) + 0.5));
+      }
     }
     dst += dst_stride;
     y_pix += MAX_SB_SIZE;
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index 66e5359..7470eb8 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -53,8 +53,10 @@
   // Chroma subsampling
   int subsampling_x, subsampling_y;
 
-  // CfL Performs its own block level DC_PRED for each chromatic plane
-  double dc_pred[CFL_PRED_PLANES];
+  // Block level DC_PRED for each chromatic plane
+  // Fixed point dc_pred is Q12.7:
+  //   * Worst case division is 1/128
+  int dc_pred_q7[CFL_PRED_PLANES];
 
   // The rate associated with each alpha codeword
   int costs[CFL_ALPHABET_SIZE];
@@ -73,6 +75,10 @@
   { 0, 3 }, { 5, 1 }, { 1, 5 }, { 0, 5 }
 };
 
+static INLINE int get_scaled_luma_q13(int alpha_q3, int y_pix, int avg_q10) {
+  return alpha_q3 * ((y_pix << 10) - avg_q10);  // Q3 * (Q10 - Q10) -> Q13
+}
+
 void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm);
 
 void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 27ab853..2846141 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -1431,17 +1431,16 @@
 static int cfl_alpha_dist(const uint8_t *y_pix, int y_stride,
                           const int y_averages_q10[MAX_NUM_TXB],
                           const uint8_t *src, int src_stride, int width,
-                          int height, TX_SIZE tx_size, double dc_pred,
+                          int height, TX_SIZE tx_size, int dc_pred_q7,
                           double alpha, int *dist_neg_out) {
-  const double dc_pred_bias = dc_pred + 0.5;
   int dist = 0;
   int diff;
 
   if (alpha == 0.0) {
-    const int dc_pred_i = (int)dc_pred_bias;
+    const int dc_pred_bias = (dc_pred_q7 + 64) >> 7;
     for (int j = 0; j < height; j++) {
       for (int i = 0; i < width; i++) {
-        diff = src[i] - dc_pred_i;
+        diff = src[i] - dc_pred_bias;
         dist += diff * diff;
       }
       src += src_stride;
@@ -1452,6 +1451,9 @@
     return dist;
   }
 
+  const int dc_pred_bias_q13 = (dc_pred_q7 << 6) + (1 << 12);
+  // TODO(ltrudeau) Convert alpha to fixed point
+  const int alpha_q3 = (int)(alpha * 8);
   int dist_neg = 0;
   const int tx_height = tx_size_high[tx_size];
   const int tx_width = tx_size_wide[tx_size];
@@ -1464,21 +1466,26 @@
     const int h = b_j + tx_height;
     for (int b_i = 0; b_i < width; b_i += tx_width) {
       const int w = b_i + tx_width;
-      // TODO(ltrudeau) Remove div when DC_PRED is also fixed point
-      const double tx_avg_q10 = y_averages_q10[a++] / 1024.0;
+      const int tx_avg_q10 = y_averages_q10[a++];
       t_y_pix = y_pix;
       t_src = src;
       for (int t_j = b_j; t_j < h; t_j++) {
         for (int t_i = b_i; t_i < w; t_i++) {
-          const double scaled_luma = alpha * (t_y_pix[t_i] - tx_avg_q10);
+          const int scaled_luma_q13 =
+              get_scaled_luma_q13(alpha_q3, t_y_pix[t_i], tx_avg_q10);
+
           const int uv = t_src[t_i];
 
           // TODO(ltrudeau) add support for HBD.
-          diff = uv - clip_pixel((int)(scaled_luma + dc_pred_bias));
+          diff =
+              uv -
+              (clamp(scaled_luma_q13 + dc_pred_bias_q13, 0, (255 << 13)) >> 13);
           dist += diff * diff;
 
           // TODO(ltrudeau) add support for HBD.
-          diff = uv - clip_pixel((int)(-scaled_luma + dc_pred_bias));
+          diff = uv -
+                 (clamp(-scaled_luma_q13 + dc_pred_bias_q13, 0, (255 << 13)) >>
+                  13);
           dist_neg += diff * diff;
         }
         t_y_pix += y_stride;
@@ -1528,9 +1535,9 @@
   cfl_compute_parameters(xd, tx_size);
   const int width = cfl->uv_width;
   const int height = cfl->uv_height;
-  const double dc_pred_u = cfl->dc_pred[CFL_PRED_U];
-  const double dc_pred_v = cfl->dc_pred[CFL_PRED_V];
-  const int *y_averages = cfl->y_averages_q10;
+  const int dc_pred_u_q7 = cfl->dc_pred_q7[CFL_PRED_U];
+  const int dc_pred_v_q7 = cfl->dc_pred_q7[CFL_PRED_V];
+  const int *y_averages_q10 = cfl->y_averages_q10;
   const uint8_t *y_pix = cfl->y_down_pix;
 
   CFL_SIGN_TYPE *signs = mbmi->cfl_alpha_signs;
@@ -1539,20 +1546,20 @@
 
   int sse[CFL_PRED_PLANES][CFL_MAGS_SIZE];
   sse[CFL_PRED_U][0] =
-      cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages, src_u, src_stride_u, width,
-                     height, tx_size, dc_pred_u, 0, NULL);
+      cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q10, src_u, src_stride_u,
+                     width, height, tx_size, dc_pred_u_q7, 0, NULL);
   sse[CFL_PRED_V][0] =
-      cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages, src_v, src_stride_v, width,
-                     height, tx_size, dc_pred_v, 0, NULL);
+      cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q10, src_v, src_stride_v,
+                     width, height, tx_size, dc_pred_v_q7, 0, NULL);
 
   for (int m = 1; m < CFL_MAGS_SIZE; m += 2) {
     assert(cfl_alpha_mags[m + 1] == -cfl_alpha_mags[m]);
     sse[CFL_PRED_U][m] = cfl_alpha_dist(
-        y_pix, MAX_SB_SIZE, y_averages, src_u, src_stride_u, width, height,
-        tx_size, dc_pred_u, cfl_alpha_mags[m], &sse[CFL_PRED_U][m + 1]);
+        y_pix, MAX_SB_SIZE, y_averages_q10, src_u, src_stride_u, width, height,
+        tx_size, dc_pred_u_q7, cfl_alpha_mags[m], &sse[CFL_PRED_U][m + 1]);
     sse[CFL_PRED_V][m] = cfl_alpha_dist(
-        y_pix, MAX_SB_SIZE, y_averages, src_v, src_stride_v, width, height,
-        tx_size, dc_pred_v, cfl_alpha_mags[m], &sse[CFL_PRED_V][m + 1]);
+        y_pix, MAX_SB_SIZE, y_averages_q10, src_v, src_stride_v, width, height,
+        tx_size, dc_pred_v_q7, cfl_alpha_mags[m], &sse[CFL_PRED_V][m + 1]);
   }
 
   int dist;