[CFL] Support for 4:2:0 High Bit Depth
Adds high bit depth (_hbd) and low bit depth (_lbd) versions
of the CfL functions: sum_above_row, sum_left_col,
cfl_build_prediction, cfl_luma_subsampling_420 (4:4:4 will
be added in a subsequent commit) and cfl_alpha_dist. For
cfl_alpha_dist, special care is given to scale the SSE
according to the bit depth.
BUG=aomedia:835
Change-Id: I5b72845100d88fb8a438efe665bcae7fe1ba50b8
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index b5f9b70..1ebb921 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -62,46 +62,94 @@
}
}
-static void sum_above_row(const MACROBLOCKD *xd, int width, int *out_sum_u,
- int *out_sum_v) {
- const struct macroblockd_plane *const pd_u = &xd->plane[AOM_PLANE_U];
- const struct macroblockd_plane *const pd_v = &xd->plane[AOM_PLANE_V];
-
- const int dst_u_stride = pd_u->dst.stride;
- const int dst_v_stride = pd_v->dst.stride;
-
- const uint8_t *above_dst_u = pd_u->dst.buf - dst_u_stride;
- const uint8_t *above_dst_v = pd_v->dst.buf - dst_v_stride;
-
+static void sum_above_row_lbd(const uint8_t *above_u, const uint8_t *above_v,
+ int width, int *out_sum_u, int *out_sum_v) {
int sum_u = 0;
int sum_v = 0;
for (int i = 0; i < width; i++) {
- sum_u += above_dst_u[i];
- sum_v += above_dst_v[i];
+ sum_u += above_u[i];
+ sum_v += above_v[i];
}
*out_sum_u += sum_u;
*out_sum_v += sum_v;
}
+#if CONFIG_HIGHBITDEPTH
+static void sum_above_row_hbd(const uint16_t *above_u, const uint16_t *above_v,
+ int width, int *out_sum_u, int *out_sum_v) {
+ int sum_u = 0;
+ int sum_v = 0;
+ for (int i = 0; i < width; i++) {
+ sum_u += above_u[i];
+ sum_v += above_v[i];
+ }
+ *out_sum_u += sum_u;
+ *out_sum_v += sum_v;
+}
+#endif // CONFIG_HIGHBITDEPTH
+static void sum_above_row(const MACROBLOCKD *xd, int width, int *out_sum_u,
+ int *out_sum_v) {
+ const struct macroblockd_plane *const pd_u = &xd->plane[AOM_PLANE_U];
+ const struct macroblockd_plane *const pd_v = &xd->plane[AOM_PLANE_V];
+#if CONFIG_HIGHBITDEPTH
+ if (get_bitdepth_data_path_index(xd)) {
+ const uint16_t *above_u_16 =
+ CONVERT_TO_SHORTPTR(pd_u->dst.buf) - pd_u->dst.stride;
+ const uint16_t *above_v_16 =
+ CONVERT_TO_SHORTPTR(pd_v->dst.buf) - pd_v->dst.stride;
+ sum_above_row_hbd(above_u_16, above_v_16, width, out_sum_u, out_sum_v);
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ const uint8_t *above_u = pd_u->dst.buf - pd_u->dst.stride;
+ const uint8_t *above_v = pd_v->dst.buf - pd_v->dst.stride;
+ sum_above_row_lbd(above_u, above_v, width, out_sum_u, out_sum_v);
+}
+
+static void sum_left_col_lbd(const uint8_t *left_u, int u_stride,
+ const uint8_t *left_v, int v_stride, int height,
+ int *out_sum_u, int *out_sum_v) {
+ int sum_u = 0;
+ int sum_v = 0;
+ for (int i = 0; i < height; i++) {
+ sum_u += left_u[i * u_stride];
+ sum_v += left_v[i * v_stride];
+ }
+ *out_sum_u += sum_u;
+ *out_sum_v += sum_v;
+}
+#if CONFIG_HIGHBITDEPTH
+static void sum_left_col_hbd(const uint16_t *left_u, int u_stride,
+ const uint16_t *left_v, int v_stride, int height,
+ int *out_sum_u, int *out_sum_v) {
+ int sum_u = 0;
+ int sum_v = 0;
+ for (int i = 0; i < height; i++) {
+ sum_u += left_u[i * u_stride];
+ sum_v += left_v[i * v_stride];
+ }
+ *out_sum_u += sum_u;
+ *out_sum_v += sum_v;
+}
+#endif // CONFIG_HIGHBITDEPTH
static void sum_left_col(const MACROBLOCKD *xd, int height, int *out_sum_u,
int *out_sum_v) {
const struct macroblockd_plane *const pd_u = &xd->plane[AOM_PLANE_U];
const struct macroblockd_plane *const pd_v = &xd->plane[AOM_PLANE_V];
- const int dst_u_stride = pd_u->dst.stride;
- const int dst_v_stride = pd_v->dst.stride;
-
- const uint8_t *left_dst_u = pd_u->dst.buf - 1;
- const uint8_t *left_dst_v = pd_v->dst.buf - 1;
-
- int sum_u = 0;
- int sum_v = 0;
- for (int i = 0; i < height; i++) {
- sum_u += left_dst_u[i * dst_u_stride];
- sum_v += left_dst_v[i * dst_v_stride];
+#if CONFIG_HIGHBITDEPTH
+ if (get_bitdepth_data_path_index(xd)) {
+ const uint16_t *left_u_16 = CONVERT_TO_SHORTPTR(pd_u->dst.buf) - 1;
+ const uint16_t *left_v_16 = CONVERT_TO_SHORTPTR(pd_v->dst.buf) - 1;
+ sum_left_col_hbd(left_u_16, pd_u->dst.stride, left_v_16, pd_v->dst.stride,
+ height, out_sum_u, out_sum_v);
+ return;
}
- *out_sum_u += sum_u;
- *out_sum_v += sum_v;
+#endif // CONFIG_HIGHBITDEPTH
+ const uint8_t *left_u = pd_u->dst.buf - 1;
+ const uint8_t *left_v = pd_v->dst.buf - 1;
+ sum_left_col_lbd(left_u, pd_u->dst.stride, left_v, pd_v->dst.stride, height,
+ out_sum_u, out_sum_v);
}
// CfL computes its own block-level DC_PRED. This is required to compute both
@@ -211,11 +259,9 @@
return (alpha_sign == CFL_SIGN_POS) ? abs_alpha_q3 + 1 : -abs_alpha_q3 - 1;
}
-// TODO(ltrudeau) add support for HBD.
-static INLINE void cfl_build_prediction(const int16_t *pred_buf_q3,
- uint8_t *dst, int dst_stride,
- int alpha_q3, int dc_pred, int width,
- int height) {
+static void cfl_build_prediction_lbd(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int width, int height,
+ int alpha_q3, int dc_pred) {
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
dst[i] =
@@ -226,6 +272,39 @@
}
}
+#if CONFIG_HIGHBITDEPTH
+static void cfl_build_prediction_hbd(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int width, int height,
+ int alpha_q3, int dc_pred, int bit_depth) {
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ dst[i] = clip_pixel_highbd(
+ get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]) + dc_pred, bit_depth);
+ }
+ dst += dst_stride;
+ pred_buf_q3 += MAX_SB_SIZE;
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+static void cfl_build_prediction(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int width, int height,
+ int alpha_q3, int dc_pred, int use_hbd,
+ int bit_depth) {
+#if CONFIG_HIGHBITDEPTH
+ if (use_hbd) {
+ uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
+ cfl_build_prediction_hbd(pred_buf_q3, dst_16, dst_stride, width, height,
+ alpha_q3, dc_pred, bit_depth);
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ (void)use_hbd;
+ (void)bit_depth;
+ cfl_build_prediction_lbd(pred_buf_q3, dst, dst_stride, width, height,
+ alpha_q3, dc_pred);
+}
+
void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
int row, int col, TX_SIZE tx_size, int plane) {
CFL_CTX *const cfl = xd->cfl;
@@ -239,15 +318,14 @@
const int alpha_q3 =
cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
- cfl_build_prediction(pred_buf_q3, dst, dst_stride, alpha_q3,
- cfl->dc_pred[plane - 1], tx_size_wide[tx_size],
- tx_size_high[tx_size]);
+ cfl_build_prediction(pred_buf_q3, dst, dst_stride, tx_size_wide[tx_size],
+ tx_size_high[tx_size], alpha_q3, cfl->dc_pred[plane - 1],
+ get_bitdepth_data_path_index(xd), xd->bd);
}
-static INLINE void cfl_luma_subsampling_420(const uint8_t *input,
- int input_stride,
- int16_t *output_q3, int width,
- int height) {
+static void cfl_luma_subsampling_420_lbd(const uint8_t *input, int input_stride,
+ int16_t *output_q3, int width,
+ int height) {
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
int top = i << 1;
@@ -260,6 +338,38 @@
}
}
+#if CONFIG_HIGHBITDEPTH
+static void cfl_luma_subsampling_420_hbd(const uint16_t *input,
+ int input_stride, int16_t *output_q3,
+ int width, int height) {
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ int top = i << 1;
+ int bot = top + input_stride;
+ output_q3[i] = (input[top] + input[top + 1] + input[bot] + input[bot + 1])
+ << 1;
+ }
+ input += input_stride << 1;
+ output_q3 += MAX_SB_SIZE;
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+static void cfl_luma_subsampling_420(const uint8_t *input, int input_stride,
+ int16_t *output_q3, int width, int height,
+ int use_hbd) {
+#if CONFIG_HIGHBITDEPTH
+ if (use_hbd) {
+ const uint16_t *input_16 = CONVERT_TO_SHORTPTR(input);
+ cfl_luma_subsampling_420_hbd(input_16, input_stride, output_q3, width,
+ height);
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ (void)use_hbd;
+ cfl_luma_subsampling_420_lbd(input, input_stride, output_q3, width, height);
+}
+
static INLINE void cfl_luma_subsampling_444(const uint8_t *input,
int input_stride,
int16_t *output_q3, int width,
@@ -275,7 +385,7 @@
static INLINE void cfl_store(CFL_CTX *cfl, const uint8_t *input,
int input_stride, int row, int col, int width,
- int height) {
+ int height, int use_hbd) {
const int tx_off_log2 = tx_size_wide_log2[0];
const int sub_x = cfl->subsampling_x;
const int sub_y = cfl->subsampling_y;
@@ -307,11 +417,13 @@
cfl->pred_buf_q3 + (store_row * MAX_SB_SIZE + store_col);
if (sub_y == 0 && sub_x == 0) {
+ // TODO(ltrudeau) add support for HBD 4:4:4
+ assert(!use_hbd);
cfl_luma_subsampling_444(input, input_stride, pred_buf_q3, store_width,
store_height);
} else if (sub_y == 1 && sub_x == 1) {
cfl_luma_subsampling_420(input, input_stride, pred_buf_q3, store_width,
- store_height);
+ store_height, use_hbd);
} else {
// TODO(ltrudeau) add support for 4:2:2
assert(0); // Unsupported chroma subsampling
@@ -372,7 +484,7 @@
}
#endif
cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size_wide[tx_size],
- tx_size_high[tx_size]);
+ tx_size_high[tx_size], get_bitdepth_data_path_index(xd));
}
void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
@@ -391,7 +503,8 @@
#endif // CONFIG_CHROMA_SUB8X8
const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size);
const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
- cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, width, height);
+ cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, width, height,
+ get_bitdepth_data_path_index(xd));
}
void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {