[CFL] Subsample to Q3

Result from luma subsampling is left-shifted by 3. This avoids having
to do it during averaging, in alpha search and when building the
prediction.

This change does not alter the bitstream.

Results on Subset1

  PSNR   | PSNR Cb | PSNR Cr | PSNR HVS |  SSIM  | MS SSIM | CIEDE 2000
  0.0000 | 0.0000  | 0.0000  | 0.0000   | 0.0000 | 0.0000  | 0.0000

https://arewecompressedyet.com/?job=cfl-baseline%402017-09-06T17%3A41%3A38.041Z&job=cfl-SubsampleQ3%402017-09-06T17%3A42%3A01.252Z

Change-Id: I6e89eac6496f7c36e46364c9223fbcbca6759032
diff --git a/av1/common/blockd.h b/av1/common/blockd.h index b0192d3..e7cb1f8 100644 --- a/av1/common/blockd.h +++ b/av1/common/blockd.h
@@ -699,10 +699,8 @@ // TODO(ltrudeau) Convert to uint16 for HBD support uint8_t y_pix[MAX_SB_SQUARE]; - // Pixel buffer containing the downsampled luma pixels used as prediction for - // chroma - // TODO(ltrudeau) Convert to uint16 for HBD support - uint8_t y_down_pix[MAX_SB_SQUARE]; + // Downsampled luma pixels (in Q3) used for chroma prediction + int y_down_pix_q3[MAX_SB_SQUARE]; // Height and width of the luma prediction block currently in the pixel buffer int y_height, y_width;
diff --git a/av1/common/cfl.c b/av1/common/cfl.c index c51648b..9b33079 100644 --- a/av1/common/cfl.c +++ b/av1/common/cfl.c
@@ -30,29 +30,30 @@ } static INLINE void cfl_luma_subsampling_420(const uint8_t *y_pix, - uint8_t *output, int width, + int *output_q3, int width, int height) { for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { int top = i << 1; int bot = top + MAX_SB_SIZE; int sum = y_pix[top] + y_pix[top + 1] + y_pix[bot] + y_pix[bot + 1]; - output[i] = (sum + 2) >> 2; + // TODO(ltrudeau) replace "+ 2 >> 2 << 3" with << 1 + output_q3[i] = ((sum + 2) >> 2) << 3; } y_pix += MAX_SB_SIZE << 1; - output += MAX_SB_SIZE; + output_q3 += MAX_SB_SIZE; } } static INLINE void cfl_luma_subsampling_444(const uint8_t *y_pix, - uint8_t *output, int width, + int *output_q3, int width, int height) { for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { - output[i] = y_pix[i]; + output_q3[i] = y_pix[i] << 3; } y_pix += MAX_SB_SIZE; - output += MAX_SB_SIZE; + output_q3 += MAX_SB_SIZE; } } @@ -62,20 +63,19 @@ const int sub_y = cfl->subsampling_y; const int off_log2 = tx_size_wide_log2[0]; - // TODO(ltrudeau) convert to uint16 to add HBD support - uint8_t *output = cfl->y_down_pix; + int *output_q3 = cfl->y_down_pix_q3; // TODO(ltrudeau) should be faster to downsample when we store the values // TODO(ltrudeau) add support for 4:2:2 if (sub_y == 0 && sub_x == 0) { // TODO(ltrudeau) convert to uint16 to add HBD support const uint8_t *y_pix = cfl->y_pix + ((row * MAX_SB_SIZE + col) << off_log2); - cfl_luma_subsampling_444(y_pix, output, width, height); + cfl_luma_subsampling_444(y_pix, output_q3, width, height); } else if (sub_y == 1 && sub_x == 1) { // TODO(ltrudeau) convert to uint16 to add HBD support const uint8_t *y_pix = cfl->y_pix + ((row * MAX_SB_SIZE + col) << (off_log2 + sub_y)); - cfl_luma_subsampling_420(y_pix, output, width, height); + cfl_luma_subsampling_420(y_pix, output_q3, width, height); } else { assert(0); // Unsupported chroma subsampling } @@ -100,7 +100,7 @@ for (int j = 0; j < height; j++) { last_pixel = output_row_offset - 1; for (int i = 0; i < 
diff_width; i++) { - output[output_row_offset + i] = output[last_pixel]; + output_q3[output_row_offset + i] = output_q3[last_pixel]; } output_row_offset += MAX_SB_SIZE; } @@ -112,7 +112,7 @@ for (int j = 0; j < diff_height; j++) { for (int i = 0; i < width; i++) { - output[output_row_offset + i] = output[last_row_offset + i]; + output_q3[output_row_offset + i] = output_q3[last_row_offset + i]; } output_row_offset += MAX_SB_SIZE; } @@ -200,10 +200,8 @@ const int num_pel_log2 = (tx_size_high_log2[tx_size] + tx_size_wide_log2[tx_size]); - // TODO(ltrudeau) Convert to uint16 for HBD support - const uint8_t *y_pix = cfl->y_down_pix; - // TODO(ltrudeau) Convert to uint16 for HBD support - const uint8_t *t_y_pix; + const int *y_pix_q3 = cfl->y_down_pix_q3; + const int *t_y_pix_q3; int *averages_q3 = cfl->y_averages_q3; cfl_load(cfl, 0, 0, width, height); @@ -211,24 +209,23 @@ int a = 0; for (int b_j = 0; b_j < height; b_j += tx_height) { for (int b_i = 0; b_i < width; b_i += tx_width) { - int sum = 0; - t_y_pix = y_pix; + int sum_q3 = 0; + t_y_pix_q3 = y_pix_q3; for (int t_j = 0; t_j < tx_height; t_j++) { for (int t_i = b_i; t_i < b_i + tx_width; t_i++) { - sum += t_y_pix[t_i]; + sum_q3 += t_y_pix_q3[t_i]; } - t_y_pix += MAX_SB_SIZE; + t_y_pix_q3 += MAX_SB_SIZE; } assert(a < MAX_NUM_TXB_SQUARE); - averages_q3[a++] = - ((sum << 3) + (1 << (num_pel_log2 - 1))) >> num_pel_log2; + averages_q3[a++] = (sum_q3 + (1 << (num_pel_log2 - 1))) >> num_pel_log2; // Loss is never more than 1/2 (in Q3) assert(fabs((double)averages_q3[a - 1] - - (sum / ((double)(1 << num_pel_log2))) * (1 << 3)) <= 0.5); + (sum_q3 / ((double)(1 << num_pel_log2)))) <= 0.5); } assert(a % stride == 0); - y_pix += block_row_stride; + y_pix_q3 += block_row_stride; } cfl->y_averages_stride = stride; @@ -255,8 +252,7 @@ const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; - // TODO(ltrudeau) Convert to uint16 to support HBD - const uint8_t *y_pix = cfl->y_down_pix; + const int 
*y_pix_q3 = cfl->y_down_pix_q3; const int dc_pred = cfl->dc_pred[plane - 1]; const int alpha_q3 = @@ -273,11 +269,11 @@ for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { // TODO(ltrudeau) add support for HBD. - dst[i] = - clip_pixel(get_scaled_luma_q0(alpha_q3, y_pix[i], avg_q3) + dc_pred); + dst[i] = clip_pixel(get_scaled_luma_q0(alpha_q3, y_pix_q3[i], avg_q3) + + dc_pred); } dst += dst_stride; - y_pix += MAX_SB_SIZE; + y_pix_q3 += MAX_SB_SIZE; } }
diff --git a/av1/common/cfl.h b/av1/common/cfl.h index 7a56a49..e6de1b1 100644 --- a/av1/common/cfl.h +++ b/av1/common/cfl.h
@@ -14,8 +14,8 @@ #include "av1/common/blockd.h" -static INLINE int get_scaled_luma_q0(int alpha_q3, int y_pix, int avg_q3) { - int scaled_luma_q6 = alpha_q3 * ((y_pix << 3) - avg_q3); +static INLINE int get_scaled_luma_q0(int alpha_q3, int y_pix_q3, int avg_q3) { + int scaled_luma_q6 = alpha_q3 * (y_pix_q3 - avg_q3); return ROUND_POWER_OF_TWO_SIGNED(scaled_luma_q6, 6); }
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c index 9d53e3e..dadb977 100644 --- a/av1/encoder/rdopt.c +++ b/av1/encoder/rdopt.c
@@ -5845,7 +5845,7 @@ #endif // CONFIG_EXT_INTRA #if CONFIG_CFL -static int64_t cfl_alpha_dist(const uint8_t *y_pix, int y_stride, +static int64_t cfl_alpha_dist(const int *y_pix_q3, int y_stride, const int y_averages_q3[MAX_NUM_TXB_SQUARE], const uint8_t *src, int src_stride, int width, int height, TX_SIZE tx_size, int dc_pred, @@ -5872,7 +5872,7 @@ const int tx_width = tx_size_wide[tx_size]; const int y_block_row_off = y_stride * tx_height; const int src_block_row_off = src_stride * tx_height; - const uint8_t *t_y_pix; + const int *t_y_pix_q3; const uint8_t *t_src; int a = 0; for (int b_j = 0; b_j < height; b_j += tx_height) { @@ -5880,14 +5880,14 @@ for (int b_i = 0; b_i < width; b_i += tx_width) { const int w = b_i + tx_width; const int tx_avg_q3 = y_averages_q3[a++]; - t_y_pix = y_pix; + t_y_pix_q3 = y_pix_q3; t_src = src; for (int t_j = b_j; t_j < h; t_j++) { for (int t_i = b_i; t_i < w; t_i++) { const int uv = t_src[t_i]; const int scaled_luma = - get_scaled_luma_q0(alpha_q3, t_y_pix[t_i], tx_avg_q3); + get_scaled_luma_q0(alpha_q3, t_y_pix_q3[t_i], tx_avg_q3); // TODO(ltrudeau) add support for HBD. 
diff = uv - clamp(scaled_luma + dc_pred, 0, 255); @@ -5897,11 +5897,11 @@ diff = uv - clamp(-scaled_luma + dc_pred, 0, 255); dist_neg += diff * diff; } - t_y_pix += y_stride; + t_y_pix_q3 += y_stride; t_src += src_stride; } } - y_pix += y_block_row_off; + y_pix_q3 += y_block_row_off; src += src_block_row_off; } @@ -5928,25 +5928,25 @@ const int dc_pred_u = cfl->dc_pred[CFL_PRED_U]; const int dc_pred_v = cfl->dc_pred[CFL_PRED_V]; const int *y_averages_q3 = cfl->y_averages_q3; - const uint8_t *y_pix = cfl->y_down_pix; + const int *y_pix_q3 = cfl->y_down_pix_q3; int64_t sse[CFL_PRED_PLANES][CFL_MAGS_SIZE]; sse[CFL_PRED_U][0] = - cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, + cfl_alpha_dist(y_pix_q3, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, width, height, tx_size, dc_pred_u, 0, NULL); sse[CFL_PRED_V][0] = - cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, + cfl_alpha_dist(y_pix_q3, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, width, height, tx_size, dc_pred_v, 0, NULL); for (int c = 0; c < CFL_ALPHABET_SIZE; c++) { const int m = c * 2 + 1; const int abs_alpha_q3 = c + 1; sse[CFL_PRED_U][m] = cfl_alpha_dist( - y_pix, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, width, height, - tx_size, dc_pred_u, abs_alpha_q3, &sse[CFL_PRED_U][m + 1]); + y_pix_q3, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, width, + height, tx_size, dc_pred_u, abs_alpha_q3, &sse[CFL_PRED_U][m + 1]); sse[CFL_PRED_V][m] = cfl_alpha_dist( - y_pix, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, width, height, - tx_size, dc_pred_v, abs_alpha_q3, &sse[CFL_PRED_V][m + 1]); + y_pix_q3, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, width, + height, tx_size, dc_pred_v, abs_alpha_q3, &sse[CFL_PRED_V][m + 1]); } int64_t dist;