Add av1_convolve_2d_facade

When convolve_round is on, av1_convolve_2d_facade will be used for interpolation rather than av1_convolve. The convolve_round experiment code will be removed from av1_convolve in another CL.

So far we use 4-bit rounding in the intermediate stage, on top of using post-rounding for compound mode after the last stage. This gives us a gain of roughly 0.45% on lowres, 0.39% on midres and roughly 0.6-0.7% on hdres. Altogether, the gain is 1.15% on lowres, 0.74% on midres and roughly 1.7-1.8% on hdres.

Note that there is no restriction on the usage of the 12-tap filter in this CL. Adding that restriction, we will lose roughly 0.1% again on lowres.

Change-Id: I6332e1d888e28a3b3ddc29711817d66e52cb5cdf
diff --git a/av1/common/convolve.c b/av1/common/convolve.c index a1a266b..97ce6ba 100644 --- a/av1/common/convolve.c +++ b/av1/common/convolve.c
@@ -199,15 +199,79 @@ #if CONFIG_CONVOLVE_ROUND void av1_convolve_rounding(const int32_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h) { + int dst_stride, int w, int h, int bits) { int r, c; for (r = 0; r < h; ++r) { for (c = 0; c < w; ++c) { dst[r * dst_stride + c] = - clip_pixel(ROUND_POWER_OF_TWO(src[r * src_stride + c], FILTER_BITS)); + clip_pixel(ROUND_POWER_OF_TWO_SIGNED(src[r * src_stride + c], bits)); } } } + +void av1_convolve_2d(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, + int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params) { + int x, y, k; + CONV_BUF_TYPE im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + (void)conv_params; + + // horizontal filter + const uint8_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + for (y = 0; y < im_h; ++y) { + for (x = 0; x < w; ++x) { + CONV_BUF_TYPE sum = 0; + for (k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + im_block[y * im_stride + x] = + ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_0); + } + } + + // vertical filter + CONV_BUF_TYPE *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + CONV_BUF_TYPE sum = 0; + for (k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + dst[y * dst_stride + x] += + ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_1); + } + } +} + +void 
av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilter *interp_filter, + const int subpel_x_q4, int x_step_q4, + const int subpel_y_q4, int y_step_q4, + ConvolveParams *conv_params) { + (void)x_step_q4; + (void)y_step_q4; + (void)dst; + (void)dst_stride; + InterpFilterParams filter_params_x = + av1_get_interp_filter_params(interp_filter[1 + 2 * conv_params->ref]); + InterpFilterParams filter_params_y = + av1_get_interp_filter_params(interp_filter[0 + 2 * conv_params->ref]); + av1_convolve_2d(src, src_stride, conv_params->dst, conv_params->dst_stride, w, + h, &filter_params_x, &filter_params_y, subpel_x_q4, + subpel_y_q4, conv_params); +} + #endif // CONFIG_CONVOLVE_ROUND void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst, @@ -294,7 +358,6 @@ filter_params = filter_params_x; assert(filter_params.taps <= MAX_FILTER_TAP); - av1_convolve_horiz_facade(temp + (filter_size / 2 - 1), temp_stride, dst, dst_stride, w, h, filter_params, subpel_x_q4, x_step_q4, conv_params);
diff --git a/av1/common/convolve.h b/av1/common/convolve.h index 349e9ac..88b413d 100644 --- a/av1/common/convolve.h +++ b/av1/common/convolve.h
@@ -23,11 +23,15 @@ CONVOLVE_OPT_NO_ROUND, } CONVOLVE_OPT; +typedef int32_t CONV_BUF_TYPE; + typedef struct ConvolveParams { int ref; CONVOLVE_OPT round; - int32_t *dst; + CONV_BUF_TYPE *dst; int dst_stride; + int round_0; + int round_1; } ConvolveParams; static INLINE ConvolveParams get_conv_params(int ref) { @@ -38,18 +42,33 @@ } #if CONFIG_CONVOLVE_ROUND +void av1_convolve_2d(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, + int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params); + +void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilter *interp_filter, + const int subpel_x_q4, int x_step_q4, + const int subpel_y_q4, int y_step_q4, + ConvolveParams *conv_params); + static INLINE ConvolveParams get_conv_params_no_round(int ref, int32_t *dst, int dst_stride) { ConvolveParams conv_params; conv_params.ref = ref; conv_params.round = CONVOLVE_OPT_NO_ROUND; + conv_params.round_0 = 5; + conv_params.round_1 = 1; conv_params.dst = dst; conv_params.dst_stride = dst_stride; return conv_params; } void av1_convolve_rounding(const int32_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h); + int dst_stride, int w, int h, int bits); #endif // CONFIG_CONVOLVE_ROUND void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c index 682d002..566a911 100644 --- a/av1/common/reconinter.c +++ b/av1/common/reconinter.c
@@ -918,7 +918,8 @@ MV32 scaled_mv[2]; SubpelParams subpel_params[2]; #if CONFIG_CONVOLVE_ROUND - int32_t tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]; + DECLARE_ALIGNED(16, int32_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]); + av1_zero(tmp_dst); #endif // CONFIG_CONVOLVE_ROUND for (ref = 0; ref < 1 + is_compound; ++ref) { @@ -968,15 +969,16 @@ (scaled_mv[ref].col >> SUBPEL_BITS); } +#if CONFIG_CONVOLVE_ROUND + ConvolveParams conv_params = + get_conv_params_no_round(ref, tmp_dst, MAX_SB_SIZE); +#else + ConvolveParams conv_params = get_conv_params(ref); +#endif // CONFIG_CONVOLVE_ROUND for (ref = 0; ref < 1 + is_compound; ++ref) { const struct scale_factors *const sf = &xd->block_refs[ref]->sf; struct buf_2d *const pre_buf = &pd->pre[ref]; -#if CONFIG_CONVOLVE_ROUND - ConvolveParams conv_params = - get_conv_params_no_round(ref, tmp_dst, MAX_SB_SIZE); -#else - ConvolveParams conv_params = get_conv_params(ref); -#endif // CONFIG_CONVOLVE_ROUND + conv_params.ref = ref; #if CONFIG_EXT_INTER if (ref && is_masked_compound_type(mi->mbmi.interinter_compound_data.type)) @@ -1014,7 +1016,9 @@ #if CONFIG_AOM_HIGHBITDEPTH if (!(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)) #endif // CONFIG_AOM_HIGHBITDEPTH - av1_convolve_rounding(tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride, w, h); + av1_convolve_rounding(tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride, w, h, + FILTER_BITS * 2 + is_compound - + conv_params.round_0 - conv_params.round_1); #endif // CONFIG_CONVOLVE_ROUND } }
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h index 519a1e3..cc4c858 100644 --- a/av1/common/reconinter.h +++ b/av1/common/reconinter.h
@@ -64,11 +64,18 @@ sf->predict[subpel_x != 0][subpel_y != 0][conv_params->ref]( src, src_stride, dst, dst_stride, kernel_x, xs, kernel_y, ys, w, h); } else { - // ref_idx > 0 means this is the second reference frame - // first reference frame's prediction result is already in dst - // therefore we need to average the first and second results - av1_convolve(src, src_stride, dst, dst_stride, w, h, interp_filter, - subpel_x, xs, subpel_y, ys, conv_params); +// ref_idx > 0 means this is the second reference frame +// first reference frame's prediction result is already in dst +// therefore we need to average the first and second results +#if CONFIG_CONVOLVE_ROUND + if (conv_params->round == CONVOLVE_OPT_NO_ROUND) + av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filter, subpel_x, xs, subpel_y, ys, + conv_params); + else +#endif + av1_convolve(src, src_stride, dst, dst_stride, w, h, interp_filter, + subpel_x, xs, subpel_y, ys, conv_params); } }