[CFL] Support for 4:2:0 High Bit Depth

Add high bit depth (_hbd) and low bit depth (_lbd) versions
of the CfL functions sum_above_row, sum_left_col,
cfl_build_prediction, cfl_luma_subsampling_420 (4:4:4 will
be added in a subsequent commit), and cfl_alpha_dist. For
cfl_alpha_dist, special care is taken to scale the SSE
according to the bit depth.

BUG=aomedia:835

Change-Id: I5b72845100d88fb8a438efe665bcae7fe1ba50b8
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index b5f9b70..1ebb921 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -62,46 +62,94 @@
   }
 }
 
-static void sum_above_row(const MACROBLOCKD *xd, int width, int *out_sum_u,
-                          int *out_sum_v) {
-  const struct macroblockd_plane *const pd_u = &xd->plane[AOM_PLANE_U];
-  const struct macroblockd_plane *const pd_v = &xd->plane[AOM_PLANE_V];
-
-  const int dst_u_stride = pd_u->dst.stride;
-  const int dst_v_stride = pd_v->dst.stride;
-
-  const uint8_t *above_dst_u = pd_u->dst.buf - dst_u_stride;
-  const uint8_t *above_dst_v = pd_v->dst.buf - dst_v_stride;
-
+static void sum_above_row_lbd(const uint8_t *above_u, const uint8_t *above_v,
+                              int width, int *out_sum_u, int *out_sum_v) {
   int sum_u = 0;
   int sum_v = 0;
   for (int i = 0; i < width; i++) {
-    sum_u += above_dst_u[i];
-    sum_v += above_dst_v[i];
+    sum_u += above_u[i];
+    sum_v += above_v[i];
   }
   *out_sum_u += sum_u;
   *out_sum_v += sum_v;
 }
+#if CONFIG_HIGHBITDEPTH
+static void sum_above_row_hbd(const uint16_t *above_u, const uint16_t *above_v,
+                              int width, int *out_sum_u, int *out_sum_v) {
+  int sum_u = 0;
+  int sum_v = 0;
+  for (int i = 0; i < width; i++) {
+    sum_u += above_u[i];
+    sum_v += above_v[i];
+  }
+  *out_sum_u += sum_u;
+  *out_sum_v += sum_v;
+}
+#endif  // CONFIG_HIGHBITDEPTH
 
+static void sum_above_row(const MACROBLOCKD *xd, int width, int *out_sum_u,
+                          int *out_sum_v) {
+  const struct macroblockd_plane *const pd_u = &xd->plane[AOM_PLANE_U];
+  const struct macroblockd_plane *const pd_v = &xd->plane[AOM_PLANE_V];
+#if CONFIG_HIGHBITDEPTH
+  if (get_bitdepth_data_path_index(xd)) {
+    const uint16_t *above_u_16 =
+        CONVERT_TO_SHORTPTR(pd_u->dst.buf) - pd_u->dst.stride;
+    const uint16_t *above_v_16 =
+        CONVERT_TO_SHORTPTR(pd_v->dst.buf) - pd_v->dst.stride;
+    sum_above_row_hbd(above_u_16, above_v_16, width, out_sum_u, out_sum_v);
+    return;
+  }
+#endif  // CONFIG_HIGHBITDEPTH
+  const uint8_t *above_u = pd_u->dst.buf - pd_u->dst.stride;
+  const uint8_t *above_v = pd_v->dst.buf - pd_v->dst.stride;
+  sum_above_row_lbd(above_u, above_v, width, out_sum_u, out_sum_v);
+}
+
+static void sum_left_col_lbd(const uint8_t *left_u, int u_stride,
+                             const uint8_t *left_v, int v_stride, int height,
+                             int *out_sum_u, int *out_sum_v) {
+  int sum_u = 0;
+  int sum_v = 0;
+  for (int i = 0; i < height; i++) {
+    sum_u += left_u[i * u_stride];
+    sum_v += left_v[i * v_stride];
+  }
+  *out_sum_u += sum_u;
+  *out_sum_v += sum_v;
+}
+#if CONFIG_HIGHBITDEPTH
+static void sum_left_col_hbd(const uint16_t *left_u, int u_stride,
+                             const uint16_t *left_v, int v_stride, int height,
+                             int *out_sum_u, int *out_sum_v) {
+  int sum_u = 0;
+  int sum_v = 0;
+  for (int i = 0; i < height; i++) {
+    sum_u += left_u[i * u_stride];
+    sum_v += left_v[i * v_stride];
+  }
+  *out_sum_u += sum_u;
+  *out_sum_v += sum_v;
+}
+#endif  // CONFIG_HIGHBITDEPTH
 static void sum_left_col(const MACROBLOCKD *xd, int height, int *out_sum_u,
                          int *out_sum_v) {
   const struct macroblockd_plane *const pd_u = &xd->plane[AOM_PLANE_U];
   const struct macroblockd_plane *const pd_v = &xd->plane[AOM_PLANE_V];
 
-  const int dst_u_stride = pd_u->dst.stride;
-  const int dst_v_stride = pd_v->dst.stride;
-
-  const uint8_t *left_dst_u = pd_u->dst.buf - 1;
-  const uint8_t *left_dst_v = pd_v->dst.buf - 1;
-
-  int sum_u = 0;
-  int sum_v = 0;
-  for (int i = 0; i < height; i++) {
-    sum_u += left_dst_u[i * dst_u_stride];
-    sum_v += left_dst_v[i * dst_v_stride];
+#if CONFIG_HIGHBITDEPTH
+  if (get_bitdepth_data_path_index(xd)) {
+    const uint16_t *left_u_16 = CONVERT_TO_SHORTPTR(pd_u->dst.buf) - 1;
+    const uint16_t *left_v_16 = CONVERT_TO_SHORTPTR(pd_v->dst.buf) - 1;
+    sum_left_col_hbd(left_u_16, pd_u->dst.stride, left_v_16, pd_v->dst.stride,
+                     height, out_sum_u, out_sum_v);
+    return;
   }
-  *out_sum_u += sum_u;
-  *out_sum_v += sum_v;
+#endif  // CONFIG_HIGHBITDEPTH
+  const uint8_t *left_u = pd_u->dst.buf - 1;
+  const uint8_t *left_v = pd_v->dst.buf - 1;
+  sum_left_col_lbd(left_u, pd_u->dst.stride, left_v, pd_v->dst.stride, height,
+                   out_sum_u, out_sum_v);
 }
 
 // CfL computes its own block-level DC_PRED. This is required to compute both
@@ -211,11 +259,9 @@
   return (alpha_sign == CFL_SIGN_POS) ? abs_alpha_q3 + 1 : -abs_alpha_q3 - 1;
 }
 
-// TODO(ltrudeau) add support for HBD.
-static INLINE void cfl_build_prediction(const int16_t *pred_buf_q3,
-                                        uint8_t *dst, int dst_stride,
-                                        int alpha_q3, int dc_pred, int width,
-                                        int height) {
+static void cfl_build_prediction_lbd(const int16_t *pred_buf_q3, uint8_t *dst,
+                                     int dst_stride, int width, int height,
+                                     int alpha_q3, int dc_pred) {
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i++) {
       dst[i] =
@@ -226,6 +272,39 @@
   }
 }
 
+#if CONFIG_HIGHBITDEPTH
+static void cfl_build_prediction_hbd(const int16_t *pred_buf_q3, uint16_t *dst,
+                                     int dst_stride, int width, int height,
+                                     int alpha_q3, int dc_pred, int bit_depth) {
+  for (int j = 0; j < height; j++) {
+    for (int i = 0; i < width; i++) {
+      dst[i] = clip_pixel_highbd(
+          get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]) + dc_pred, bit_depth);
+    }
+    dst += dst_stride;
+    pred_buf_q3 += MAX_SB_SIZE;
+  }
+}
+#endif  // CONFIG_HIGHBITDEPTH
+
+static void cfl_build_prediction(const int16_t *pred_buf_q3, uint8_t *dst,
+                                 int dst_stride, int width, int height,
+                                 int alpha_q3, int dc_pred, int use_hbd,
+                                 int bit_depth) {
+#if CONFIG_HIGHBITDEPTH
+  if (use_hbd) {
+    uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
+    cfl_build_prediction_hbd(pred_buf_q3, dst_16, dst_stride, width, height,
+                             alpha_q3, dc_pred, bit_depth);
+    return;
+  }
+#endif  // CONFIG_HIGHBITDEPTH
+  (void)use_hbd;
+  (void)bit_depth;
+  cfl_build_prediction_lbd(pred_buf_q3, dst, dst_stride, width, height,
+                           alpha_q3, dc_pred);
+}
+
 void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
                        int row, int col, TX_SIZE tx_size, int plane) {
   CFL_CTX *const cfl = xd->cfl;
@@ -239,15 +318,14 @@
   const int alpha_q3 =
       cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
 
-  cfl_build_prediction(pred_buf_q3, dst, dst_stride, alpha_q3,
-                       cfl->dc_pred[plane - 1], tx_size_wide[tx_size],
-                       tx_size_high[tx_size]);
+  cfl_build_prediction(pred_buf_q3, dst, dst_stride, tx_size_wide[tx_size],
+                       tx_size_high[tx_size], alpha_q3, cfl->dc_pred[plane - 1],
+                       get_bitdepth_data_path_index(xd), xd->bd);
 }
 
-static INLINE void cfl_luma_subsampling_420(const uint8_t *input,
-                                            int input_stride,
-                                            int16_t *output_q3, int width,
-                                            int height) {
+static void cfl_luma_subsampling_420_lbd(const uint8_t *input, int input_stride,
+                                         int16_t *output_q3, int width,
+                                         int height) {
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i++) {
       int top = i << 1;
@@ -260,6 +338,38 @@
   }
 }
 
+#if CONFIG_HIGHBITDEPTH
+static void cfl_luma_subsampling_420_hbd(const uint16_t *input,
+                                         int input_stride, int16_t *output_q3,
+                                         int width, int height) {
+  for (int j = 0; j < height; j++) {
+    for (int i = 0; i < width; i++) {
+      int top = i << 1;
+      int bot = top + input_stride;
+      output_q3[i] = (input[top] + input[top + 1] + input[bot] + input[bot + 1])
+                     << 1;
+    }
+    input += input_stride << 1;
+    output_q3 += MAX_SB_SIZE;
+  }
+}
+#endif  // CONFIG_HIGHBITDEPTH
+
+static void cfl_luma_subsampling_420(const uint8_t *input, int input_stride,
+                                     int16_t *output_q3, int width, int height,
+                                     int use_hbd) {
+#if CONFIG_HIGHBITDEPTH
+  if (use_hbd) {
+    const uint16_t *input_16 = CONVERT_TO_SHORTPTR(input);
+    cfl_luma_subsampling_420_hbd(input_16, input_stride, output_q3, width,
+                                 height);
+    return;
+  }
+#endif  // CONFIG_HIGHBITDEPTH
+  (void)use_hbd;
+  cfl_luma_subsampling_420_lbd(input, input_stride, output_q3, width, height);
+}
+
 static INLINE void cfl_luma_subsampling_444(const uint8_t *input,
                                             int input_stride,
                                             int16_t *output_q3, int width,
@@ -275,7 +385,7 @@
 
 static INLINE void cfl_store(CFL_CTX *cfl, const uint8_t *input,
                              int input_stride, int row, int col, int width,
-                             int height) {
+                             int height, int use_hbd) {
   const int tx_off_log2 = tx_size_wide_log2[0];
   const int sub_x = cfl->subsampling_x;
   const int sub_y = cfl->subsampling_y;
@@ -307,11 +417,13 @@
       cfl->pred_buf_q3 + (store_row * MAX_SB_SIZE + store_col);
 
   if (sub_y == 0 && sub_x == 0) {
+    // TODO(ltrudeau) add support for HBD 4:4:4
+    assert(!use_hbd);
     cfl_luma_subsampling_444(input, input_stride, pred_buf_q3, store_width,
                              store_height);
   } else if (sub_y == 1 && sub_x == 1) {
     cfl_luma_subsampling_420(input, input_stride, pred_buf_q3, store_width,
-                             store_height);
+                             store_height, use_hbd);
   } else {
     // TODO(ltrudeau) add support for 4:2:2
     assert(0);  // Unsupported chroma subsampling
@@ -372,7 +484,7 @@
   }
 #endif
   cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size_wide[tx_size],
-            tx_size_high[tx_size]);
+            tx_size_high[tx_size], get_bitdepth_data_path_index(xd));
 }
 
 void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
@@ -391,7 +503,8 @@
 #endif  // CONFIG_CHROMA_SUB8X8
   const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size);
   const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
-  cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, width, height);
+  cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, width, height,
+            get_bitdepth_data_path_index(xd));
 }
 
 void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {