[CFL] Subsample to Q3

Result from luma subsampling is left-shifted by 3. This avoids having
to do it during averaging, in alpha search and when building the
prediction.

This change does not alter the bitstream.

Results on Subset1

  PSNR   | PSNR Cb | PSNR Cr | PSNR HVS |  SSIM  | MS SSIM | CIEDE 2000
  0.0000 | 0.0000  | 0.0000  | 0.0000   | 0.0000 | 0.0000  | 0.0000

https://arewecompressedyet.com/?job=cfl-baseline%402017-09-06T17%3A41%3A38.041Z&job=cfl-SubsampleQ3%402017-09-06T17%3A42%3A01.252Z

Change-Id: I6e89eac6496f7c36e46364c9223fbcbca6759032
diff --git a/av1/common/blockd.h b/av1/common/blockd.h index b0192d3..e7cb1f8 100644 --- a/av1/common/blockd.h +++ b/av1/common/blockd.h
@@ -699,10 +699,8 @@ // TODO(ltrudeau) Convert to uint16 for HBD support uint8_t y_pix[MAX_SB_SQUARE]; - // Pixel buffer containing the downsampled luma pixels used as prediction for - // chroma - // TODO(ltrudeau) Convert to uint16 for HBD support - uint8_t y_down_pix[MAX_SB_SQUARE]; + // Downsampled luma pixels (in Q3) used for chroma prediction + int y_down_pix_q3[MAX_SB_SQUARE]; // Height and width of the luma prediction block currently in the pixel buffer int y_height, y_width;
diff --git a/av1/common/cfl.c b/av1/common/cfl.c index c51648b..9b33079 100644 --- a/av1/common/cfl.c +++ b/av1/common/cfl.c
@@ -30,29 +30,30 @@ } static INLINE void cfl_luma_subsampling_420(const uint8_t *y_pix, - uint8_t *output, int width, + int *output_q3, int width, int height) { for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { int top = i << 1; int bot = top + MAX_SB_SIZE; int sum = y_pix[top] + y_pix[top + 1] + y_pix[bot] + y_pix[bot + 1]; - output[i] = (sum + 2) >> 2; + // TODO(ltrudeau) replace "+ 2 >> 2 << 3" with << 1 + output_q3[i] = ((sum + 2) >> 2) << 3; } y_pix += MAX_SB_SIZE << 1; - output += MAX_SB_SIZE; + output_q3 += MAX_SB_SIZE; } } static INLINE void cfl_luma_subsampling_444(const uint8_t *y_pix, - uint8_t *output, int width, + int *output_q3, int width, int height) { for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { - output[i] = y_pix[i]; + output_q3[i] = y_pix[i] << 3; } y_pix += MAX_SB_SIZE; - output += MAX_SB_SIZE; + output_q3 += MAX_SB_SIZE; } } @@ -62,20 +63,19 @@ const int sub_y = cfl->subsampling_y; const int off_log2 = tx_size_wide_log2[0]; - // TODO(ltrudeau) convert to uint16 to add HBD support - uint8_t *output = cfl->y_down_pix; + int *output_q3 = cfl->y_down_pix_q3; // TODO(ltrudeau) should be faster to downsample when we store the values // TODO(ltrudeau) add support for 4:2:2 if (sub_y == 0 && sub_x == 0) { // TODO(ltrudeau) convert to uint16 to add HBD support const uint8_t *y_pix = cfl->y_pix + ((row * MAX_SB_SIZE + col) << off_log2); - cfl_luma_subsampling_444(y_pix, output, width, height); + cfl_luma_subsampling_444(y_pix, output_q3, width, height); } else if (sub_y == 1 && sub_x == 1) { // TODO(ltrudeau) convert to uint16 to add HBD support const uint8_t *y_pix = cfl->y_pix + ((row * MAX_SB_SIZE + col) << (off_log2 + sub_y)); - cfl_luma_subsampling_420(y_pix, output, width, height); + cfl_luma_subsampling_420(y_pix, output_q3, width, height); } else { assert(0); // Unsupported chroma subsampling } @@ -100,7 +100,7 @@ for (int j = 0; j < height; j++) { last_pixel = output_row_offset - 1; for (int i = 0; i < 
diff_width; i++) { - output[output_row_offset + i] = output[last_pixel]; + output_q3[output_row_offset + i] = output_q3[last_pixel]; } output_row_offset += MAX_SB_SIZE; } @@ -112,7 +112,7 @@ for (int j = 0; j < diff_height; j++) { for (int i = 0; i < width; i++) { - output[output_row_offset + i] = output[last_row_offset + i]; + output_q3[output_row_offset + i] = output_q3[last_row_offset + i]; } output_row_offset += MAX_SB_SIZE; } @@ -200,10 +200,8 @@ const int num_pel_log2 = (tx_size_high_log2[tx_size] + tx_size_wide_log2[tx_size]); - // TODO(ltrudeau) Convert to uint16 for HBD support - const uint8_t *y_pix = cfl->y_down_pix; - // TODO(ltrudeau) Convert to uint16 for HBD support - const uint8_t *t_y_pix; + const int *y_pix_q3 = cfl->y_down_pix_q3; + const int *t_y_pix_q3; int *averages_q3 = cfl->y_averages_q3; cfl_load(cfl, 0, 0, width, height); @@ -211,24 +209,23 @@ int a = 0; for (int b_j = 0; b_j < height; b_j += tx_height) { for (int b_i = 0; b_i < width; b_i += tx_width) { - int sum = 0; - t_y_pix = y_pix; + int sum_q3 = 0; + t_y_pix_q3 = y_pix_q3; for (int t_j = 0; t_j < tx_height; t_j++) { for (int t_i = b_i; t_i < b_i + tx_width; t_i++) { - sum += t_y_pix[t_i]; + sum_q3 += t_y_pix_q3[t_i]; } - t_y_pix += MAX_SB_SIZE; + t_y_pix_q3 += MAX_SB_SIZE; } assert(a < MAX_NUM_TXB_SQUARE); - averages_q3[a++] = - ((sum << 3) + (1 << (num_pel_log2 - 1))) >> num_pel_log2; + averages_q3[a++] = (sum_q3 + (1 << (num_pel_log2 - 1))) >> num_pel_log2; // Loss is never more than 1/2 (in Q3) assert(fabs((double)averages_q3[a - 1] - - (sum / ((double)(1 << num_pel_log2))) * (1 << 3)) <= 0.5); + (sum_q3 / ((double)(1 << num_pel_log2)))) <= 0.5); } assert(a % stride == 0); - y_pix += block_row_stride; + y_pix_q3 += block_row_stride; } cfl->y_averages_stride = stride; @@ -255,8 +252,7 @@ const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; - // TODO(ltrudeau) Convert to uint16 to support HBD - const uint8_t *y_pix = cfl->y_down_pix; + const int 
*y_pix_q3 = cfl->y_down_pix_q3; const int dc_pred = cfl->dc_pred[plane - 1]; const int alpha_q3 = @@ -273,11 +269,11 @@ for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { // TODO(ltrudeau) add support for HBD. - dst[i] = - clip_pixel(get_scaled_luma_q0(alpha_q3, y_pix[i], avg_q3) + dc_pred); + dst[i] = clip_pixel(get_scaled_luma_q0(alpha_q3, y_pix_q3[i], avg_q3) + + dc_pred); } dst += dst_stride; - y_pix += MAX_SB_SIZE; + y_pix_q3 += MAX_SB_SIZE; } }
diff --git a/av1/common/cfl.h b/av1/common/cfl.h index 7a56a49..e6de1b1 100644 --- a/av1/common/cfl.h +++ b/av1/common/cfl.h
@@ -14,8 +14,8 @@ #include "av1/common/blockd.h" -static INLINE int get_scaled_luma_q0(int alpha_q3, int y_pix, int avg_q3) { - int scaled_luma_q6 = alpha_q3 * ((y_pix << 3) - avg_q3); +static INLINE int get_scaled_luma_q0(int alpha_q3, int y_pix_q3, int avg_q3) { + int scaled_luma_q6 = alpha_q3 * (y_pix_q3 - avg_q3); return ROUND_POWER_OF_TWO_SIGNED(scaled_luma_q6, 6); }
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c index 9d53e3e..dadb977 100644 --- a/av1/encoder/rdopt.c +++ b/av1/encoder/rdopt.c
@@ -5845,7 +5845,7 @@ #endif // CONFIG_EXT_INTRA #if CONFIG_CFL -static int64_t cfl_alpha_dist(const uint8_t *y_pix, int y_stride, +static int64_t cfl_alpha_dist(const int *y_pix_q3, int y_stride, const int y_averages_q3[MAX_NUM_TXB_SQUARE], const uint8_t *src, int src_stride, int width, int height, TX_SIZE tx_size, int dc_pred, @@ -5872,7 +5872,7 @@ const int tx_width = tx_size_wide[tx_size]; const int y_block_row_off = y_stride * tx_height; const int src_block_row_off = src_stride * tx_height; - const uint8_t *t_y_pix; + const int *t_y_pix_q3; const uint8_t *t_src; int a = 0; for (int b_j = 0; b_j < height; b_j += tx_height) { @@ -5880,14 +5880,14 @@ for (int b_i = 0; b_i < width; b_i += tx_width) { const int w = b_i + tx_width; const int tx_avg_q3 = y_averages_q3[a++]; - t_y_pix = y_pix; + t_y_pix_q3 = y_pix_q3; t_src = src; for (int t_j = b_j; t_j < h; t_j++) { for (int t_i = b_i; t_i < w; t_i++) { const int uv = t_src[t_i]; const int scaled_luma = - get_scaled_luma_q0(alpha_q3, t_y_pix[t_i], tx_avg_q3); + get_scaled_luma_q0(alpha_q3, t_y_pix_q3[t_i], tx_avg_q3); // TODO(ltrudeau) add support for HBD. 
diff = uv - clamp(scaled_luma + dc_pred, 0, 255); @@ -5897,11 +5897,11 @@ diff = uv - clamp(-scaled_luma + dc_pred, 0, 255); dist_neg += diff * diff; } - t_y_pix += y_stride; + t_y_pix_q3 += y_stride; t_src += src_stride; } } - y_pix += y_block_row_off; + y_pix_q3 += y_block_row_off; src += src_block_row_off; } @@ -5928,25 +5928,25 @@ const int dc_pred_u = cfl->dc_pred[CFL_PRED_U]; const int dc_pred_v = cfl->dc_pred[CFL_PRED_V]; const int *y_averages_q3 = cfl->y_averages_q3; - const uint8_t *y_pix = cfl->y_down_pix; + const int *y_pix_q3 = cfl->y_down_pix_q3; int64_t sse[CFL_PRED_PLANES][CFL_MAGS_SIZE]; sse[CFL_PRED_U][0] = - cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, + cfl_alpha_dist(y_pix_q3, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, width, height, tx_size, dc_pred_u, 0, NULL); sse[CFL_PRED_V][0] = - cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, + cfl_alpha_dist(y_pix_q3, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, width, height, tx_size, dc_pred_v, 0, NULL); for (int c = 0; c < CFL_ALPHABET_SIZE; c++) { const int m = c * 2 + 1; const int abs_alpha_q3 = c + 1; sse[CFL_PRED_U][m] = cfl_alpha_dist( - y_pix, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, width, height, - tx_size, dc_pred_u, abs_alpha_q3, &sse[CFL_PRED_U][m + 1]); + y_pix_q3, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, width, + height, tx_size, dc_pred_u, abs_alpha_q3, &sse[CFL_PRED_U][m + 1]); sse[CFL_PRED_V][m] = cfl_alpha_dist( - y_pix, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, width, height, - tx_size, dc_pred_v, abs_alpha_q3, &sse[CFL_PRED_V][m + 1]); + y_pix_q3, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, width, + height, tx_size, dc_pred_v, abs_alpha_q3, &sse[CFL_PRED_V][m + 1]); } int64_t dist;