Cherry-pick CWG-F049 and DAMR LF_SUB_PU for chroma. Fix DAMR chroma subblock logic in CONFIG_LF_SUB_PU (issue #623). Fix TIP subblock logic in 4:2:2 format (issue #423). Noise-level performance impact. STATS_CHANGED
diff --git a/aom_dsp/x86/aom_convolve_copy_avx2.c b/aom_dsp/x86/aom_convolve_copy_avx2.c index 1306163..28bae6e 100644 --- a/aom_dsp/x86/aom_convolve_copy_avx2.c +++ b/aom_dsp/x86/aom_convolve_copy_avx2.c
@@ -250,6 +250,24 @@ h -= 2; } while (h); #endif // CONFIG_SUBBLK_REF_EXT + } else if (w == 24) { + do { + __m256i s[2]; + __m128i s_rem[2]; + s[0] = _mm256_loadu_si256((__m256i *)src); + s_rem[0] = _mm_loadu_si128((__m128i *)(src + 16)); + src += src_stride; + s[1] = _mm256_loadu_si256((__m256i *)src); + s_rem[1] = _mm_loadu_si128((__m128i *)(src + 16)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, s[0]); + _mm_storeu_si128((__m128i *)(dst + 16), s_rem[0]); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, s[1]); + _mm_storeu_si128((__m128i *)(dst + 16), s_rem[1]); + dst += dst_stride; + h -= 2; + } while (h); } else if (w == 32) { do { __m256i s[4];
diff --git a/aom_dsp/x86/aom_convolve_copy_sse2.c b/aom_dsp/x86/aom_convolve_copy_sse2.c index bd32344..cf6fb21 100644 --- a/aom_dsp/x86/aom_convolve_copy_sse2.c +++ b/aom_dsp/x86/aom_convolve_copy_sse2.c
@@ -303,6 +303,27 @@ h -= 2; } while (h); #endif // CONFIG_SUBBLK_REF_EXT + } else if (w == 24) { + do { + __m128i s[6]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + src += src_stride; + s[3] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[4] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[5] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + src += src_stride; + _mm_storeu_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_storeu_si128((__m128i *)(dst + 1 * 8), s[1]); + _mm_storeu_si128((__m128i *)(dst + 2 * 8), s[2]); + dst += dst_stride; + _mm_storeu_si128((__m128i *)(dst + 0 * 8), s[3]); + _mm_storeu_si128((__m128i *)(dst + 1 * 8), s[4]); + _mm_storeu_si128((__m128i *)(dst + 2 * 8), s[5]); + dst += dst_stride; + h -= 2; + } while (h); } else if (w == 32) { do { __m128i s[8];
diff --git a/av1/av1.cmake b/av1/av1.cmake index 02dabe5..5ced281 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake
@@ -358,6 +358,7 @@ AOM_AV1_COMMON_INTRIN_AVX2 "${AOM_ROOT}/av1/common/cdef_block_avx2.c" "${AOM_ROOT}/av1/common/x86/affine_optflow_refine_avx2.c" + "${AOM_ROOT}/av1/common/x86/bawp_avx2.c" "${AOM_ROOT}/av1/common/x86/cfl_avx2.c" "${AOM_ROOT}/av1/common/x86/highbd_ccso_avx2.c" "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c"
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c index f38011e..6d05236 100644 --- a/av1/common/av1_loopfilter.c +++ b/av1/common/av1_loopfilter.c
@@ -529,13 +529,12 @@ #if CONFIG_LF_SUB_PU // Check whether current block is TIP mode static AOM_INLINE void check_tip_edge(const MB_MODE_INFO *const mbmi, - const int scale_horz, - const int scale_vert, TX_SIZE *ts, + const int scale, TX_SIZE *ts, int32_t *tip_edge) { const bool is_tip_mode = is_tip_ref_frame(mbmi->ref_frame[0]); if (is_tip_mode) { *tip_edge = 1; - const int tip_ts = (scale_horz || scale_vert) ? TX_4X4 : TX_8X8; + const int tip_ts = scale ? TX_4X4 : TX_8X8; *ts = tip_ts; } } @@ -547,13 +546,22 @@ const MACROBLOCKD *xd, #endif // CONFIG_COMPOUND_4XN const MB_MODE_INFO *const mbmi, - TX_SIZE *ts, int32_t *opfl_edge) { - if (plane > 0) return; + const int scale, TX_SIZE *ts, + int32_t *opfl_edge) { const bool is_opfl_mode = opfl_allowed_for_cur_block(cm, #if CONFIG_COMPOUND_4XN xd, #endif // CONFIG_COMPOUND_4XN mbmi); +#if CONFIG_AFFINE_REFINEMENT + if (is_opfl_mode && plane && + mbmi->comp_refine_type >= COMP_AFFINE_REFINE_START) { + *opfl_edge = 1; + *ts = scale ? TX_4X4 : TX_8X8; + return; + } +#endif // CONFIG_AFFINE_REFINEMENT + if (plane > 0) return; if (is_opfl_mode) { *opfl_edge = 1; const int opfl_ts = TX_8X8; @@ -564,14 +572,13 @@ #if CONFIG_REFINEMV // Check whether current block is RFMV mode static AOM_INLINE void check_rfmv_edge(const MB_MODE_INFO *const mbmi, - const int scale_horz, - const int scale_vert, TX_SIZE *ts, + const int scale, TX_SIZE *ts, int32_t *rfmv_edge) { const int is_rfmv_mode = mbmi->refinemv_flag && !is_tip_ref_frame(mbmi->ref_frame[0]); if (is_rfmv_mode) { *rfmv_edge = 1; - const int rfmv_ts = (scale_horz || scale_vert) ? TX_8X8 : TX_16X16; + const int rfmv_ts = scale ? TX_8X8 : TX_16X16; *ts = rfmv_ts; } } @@ -592,16 +599,16 @@ int temp_edge = 0; TX_SIZE temp_ts = 0; - check_tip_edge(mbmi, scale_horz, scale_vert, &temp_ts, &temp_edge); + int scale = edge_dir == VERT_EDGE ? 
scale_horz : scale_vert; + check_tip_edge(mbmi, scale, &temp_ts, &temp_edge); if (!temp_edge) check_opfl_edge(cm, plane, #if CONFIG_COMPOUND_4XN xd, #endif // CONFIG_COMPOUND_4XN - mbmi, &temp_ts, &temp_edge); + mbmi, scale, &temp_ts, &temp_edge); #if CONFIG_REFINEMV - if (!temp_edge) - check_rfmv_edge(mbmi, scale_horz, scale_vert, &temp_ts, &temp_edge); + if (!temp_edge) check_rfmv_edge(mbmi, scale, &temp_ts, &temp_edge); #endif // CONFIG_REFINEMV if (temp_edge) { @@ -1355,34 +1362,40 @@ const uint16_t q_vert = lfi->tip_q_thr[plane][VERT_EDGE]; const uint16_t side_vert = lfi->tip_side_thr[plane][VERT_EDGE]; const int bit_depth = cm->seq_params.bit_depth; - int n = 8; + int sub_bw = 8; + int sub_bh = 8; if (plane > 0) { const int subsampling_x = cm->seq_params.subsampling_x; const int subsampling_y = cm->seq_params.subsampling_y; - if (subsampling_x || subsampling_y) n = 4; + sub_bw >>= subsampling_x; + sub_bh >>= subsampling_y; } - const int filter_length = n; + // select vert/horz filter lengths based on block width/height + int filter_length_vert = sub_bw; + int filter_length_horz = sub_bh; // start filtering - const int h = bh - n; - const int w = bw - n; - const int rw = bw - (bw % n); - for (int j = 0; j <= h; j += n) { - for (int i = 0; i <= w; i += n) { + const int h = bh - sub_bh; + const int w = bw - sub_bw; + const int rw = bw - (bw % sub_bw); + for (int j = 0; j <= h; j += sub_bh) { + for (int i = 0; i <= w; i += sub_bw) { // filter vertical boundary if (i > 0) { - aom_highbd_lpf_vertical_generic_c(dst, dst_stride, filter_length, - &q_vert, &side_vert, bit_depth, n); + aom_highbd_lpf_vertical_generic_c(dst, dst_stride, filter_length_vert, + &q_vert, &side_vert, bit_depth, + sub_bh); } // filter horizontal boundary if (j > 0) { - aom_highbd_lpf_horizontal_generic_c(dst, dst_stride, filter_length, - &q_horz, &side_horz, bit_depth, n); + aom_highbd_lpf_horizontal_generic_c(dst, dst_stride, filter_length_horz, + &q_horz, &side_horz, bit_depth, + sub_bw); } 
- dst += n; + dst += sub_bw; } dst -= rw; - dst += n * dst_stride; + dst += sub_bh * dst_stride; } }
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 5c133fc..e05f8b7 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -351,6 +351,14 @@ specialize qw/av1_avg_pooling_pdiff_gradients avx2/; } +# +# Block Adaptive Weighted Prediction +# +if (aom_config("CONFIG_BAWP") eq "yes"){ + add_proto qw/void av1_make_bawp_block/, "uint16_t *dst, int dst_stride, int16_t alpha, int32_t beta, int shift, int bw, int bh, int bd"; + specialize qw/av1_make_bawp_block avx2/; +} + # Helper functions. add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit"; specialize "av1_round_shift_array", qw/sse4_1 neon/; @@ -703,7 +711,7 @@ if (aom_config("CONFIG_OPFL_MEMBW_REDUCTION") eq "yes"){ add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta, int use_damr_padding, ReferenceArea *ref_area"; - specialize qw/av1_highbd_warp_affine sse4_1/; + specialize qw/av1_highbd_warp_affine sse4_1 avx2/; } else{ add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
diff --git a/av1/common/blockd.h b/av1/common/blockd.h index 2f8e409..572d8d5 100644 --- a/av1/common/blockd.h +++ b/av1/common/blockd.h
@@ -344,19 +344,21 @@ #if CONFIG_SUBBLK_REF_EXT #define REF_BUFFER_WIDTH \ (REFINEMV_SUBBLOCK_WIDTH + (AOM_INTERP_EXTEND - 1) + AOM_INTERP_EXTEND + \ - 2 * SUBBLK_REF_EXT_LINES) + 2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES)) #else -#define REF_BUFFER_WIDTH \ - (REFINEMV_SUBBLOCK_WIDTH + (AOM_INTERP_EXTEND - 1) + AOM_INTERP_EXTEND) +#define REF_BUFFER_WIDTH \ + (REFINEMV_SUBBLOCK_WIDTH + (AOM_INTERP_EXTEND - 1) + AOM_INTERP_EXTEND + \ + 2 * DMVR_SEARCH_EXT_LINES) #endif #endif // CONFIG_ACROSS_SCALE_REFINEMV #if CONFIG_SUBBLK_REF_EXT #define REF_BUFFER_HEIGHT \ (REFINEMV_SUBBLOCK_HEIGHT + (AOM_INTERP_EXTEND - 1) + AOM_INTERP_EXTEND + \ - 2 * SUBBLK_REF_EXT_LINES) + 2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES)) #else -#define REF_BUFFER_HEIGHT \ - (REFINEMV_SUBBLOCK_HEIGHT + (AOM_INTERP_EXTEND - 1) + AOM_INTERP_EXTEND) +#define REF_BUFFER_HEIGHT \ + (REFINEMV_SUBBLOCK_HEIGHT + (AOM_INTERP_EXTEND - 1) + AOM_INTERP_EXTEND + \ + 2 * DMVR_SEARCH_EXT_LINES) #endif // CONFIG_SUBBLK_REF_EXT typedef struct PadBlock { int x0;
diff --git a/av1/common/enums.h b/av1/common/enums.h index 39d2c9b..32e217e 100644 --- a/av1/common/enums.h +++ b/av1/common/enums.h
@@ -51,6 +51,12 @@ #define SUBBLK_REF_EXT_LINES 2 #endif // CONFIG_SUBBLK_REF_EXT +#if CONFIG_16_FULL_SEARCH_DMVR +#define DMVR_SEARCH_EXT_LINES 2 +#else +#define DMVR_SEARCH_EXT_LINES 0 +#endif // CONFIG_16_FULL_SEARCH_DMVR + #if CONFIG_WARP_PRECISION #define WARP_STATS_BUFFER_SIZE \ (MAX_WARP_REF_CANDIDATES * NUM_WARP_PRECISION_MODES)
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c index 77aabc2..f45c51a 100644 --- a/av1/common/reconinter.c +++ b/av1/common/reconinter.c
@@ -1187,20 +1187,20 @@ #if CONFIG_REFINEMV // Compute the SAD values for refineMV modes -int get_refinemv_sad(uint16_t *src1, uint16_t *src2, int width, int height, - int bd) { +int get_refinemv_sad(uint16_t *src1, uint16_t *src2, int stride, int width, + int height, int bd) { #if CONFIG_SUBBLK_REF_EXT (void)bd; #if CONFIG_SUBBLK_REF_DS - return get_highbd_sad_ds(src1, width, src2, width, 8, width, height); + return get_highbd_sad_ds(src1, stride, src2, stride, 8, width, height); #else - return get_highbd_sad(src1, width, src2, width, 8, width, height); + return get_highbd_sad(src1, stride, src2, stride, 8, width, height); #endif #else #if CONFIG_SUBBLK_REF_DS - return get_highbd_sad_ds(src1, width, src2, width, bd, width, height); + return get_highbd_sad_ds(src1, stride, src2, stride, bd, width, height); #else - return get_highbd_sad(src1, width, src2, width, bd, width, height); + return get_highbd_sad(src1, stride, src2, stride, bd, width, height); #endif // CONFIG_SUBBLK_REF_DS #endif // CONFIG_SUBBLK_REF_EXT } @@ -3688,6 +3688,17 @@ } } +// Generate weighted prediction of the block. 
+void av1_make_bawp_block_c(uint16_t *dst, int dst_stride, int16_t alpha, + int32_t beta, int shift, int bw, int bh, int bd) { + for (int j = 0; j < bh; ++j) { + for (int i = 0; i < bw; ++i) { + dst[j * dst_stride + i] = clip_pixel_highbd( + (dst[j * dst_stride + i] * alpha + beta) >> shift, bd); + } + } +} + // generate inter prediction of a block coded in bwap mode enabled void av1_build_one_bawp_inter_predictor( uint16_t *dst, int dst_stride, const MV *const src_mv, @@ -3853,12 +3864,7 @@ int16_t alpha = mbmi->bawp_alpha[plane][ref]; int32_t beta = mbmi->bawp_beta[plane][ref]; - for (int j = 0; j < bh; ++j) { - for (int i = 0; i < bw; ++i) { - dst[j * dst_stride + i] = clip_pixel_highbd( - (dst[j * dst_stride + i] * alpha + beta) >> shift, xd->bd); - } - } + av1_make_bawp_block(dst, dst_stride, alpha, beta, shift, bw, bh, xd->bd); } #endif // CONFIG_BAWP @@ -4552,6 +4558,33 @@ } } +#if CONFIG_16_FULL_SEARCH_DMVR +void av1_refinemv_build_predictors(MACROBLOCKD *xd, int mi_x, int mi_y, + uint16_t **mc_buf, + CalcSubpelParamsFunc calc_subpel_params_func, + uint16_t *dst_ref0, uint16_t *dst_ref1, + int dst_stride, MV mv0, MV mv1, + InterPredParams *inter_pred_params) { + for (int ref = 0; ref < 2; ref++) { + SubpelParams subpel_params; + uint16_t *src; + int src_stride; + + uint16_t *dst_ref = ref == 0 ? dst_ref0 : dst_ref1; + MV *src_mv = ref == 0 ? 
&mv0 : &mv1; +#if CONFIG_SUBBLK_REF_EXT + src_mv->row -= 8 * SUBBLK_REF_EXT_LINES; + src_mv->col -= 8 * SUBBLK_REF_EXT_LINES; +#endif // CONFIG_SUBBLK_REF_EXT + calc_subpel_params_func(src_mv, &inter_pred_params[ref], xd, mi_x, mi_y, + ref, 0, mc_buf, &src, &subpel_params, &src_stride); + assert(inter_pred_params[ref].comp_mode == UNIFORM_SINGLE || + inter_pred_params[ref].comp_mode == UNIFORM_COMP); + av1_make_inter_predictor(src, src_stride, dst_ref, dst_stride, + &inter_pred_params[ref], &subpel_params); + } +} +#else int av1_refinemv_build_predictors_and_get_sad( MACROBLOCKD *xd, int bw, int bh, int mi_x, int mi_y, uint16_t **mc_buf, CalcSubpelParamsFunc calc_subpel_params_func, uint16_t *dst_ref0, @@ -4574,8 +4607,10 @@ &inter_pred_params[ref], &subpel_params); } - return get_refinemv_sad(dst_ref0, dst_ref1, bw, bh, xd->bd); + return get_refinemv_sad(dst_ref0, dst_ref1, bw, bw, bh, xd->bd); } +#endif // CONFIG_16_FULL_SEARCH_DMVR + void apply_mv_refinement(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, MB_MODE_INFO *mi, int bw, int bh, int mi_x, int mi_y, uint16_t **mc_buf, const MV mv[2], @@ -4653,10 +4688,6 @@ assert(inter_pred_params[ref].conv_params.is_compound == 0); assert(inter_pred_params[ref].conv_params.do_average == 0); assert(mi->interinter_comp.type == COMPOUND_AVERAGE); -#if CONFIG_OPFL_MEMBW_REDUCTION - inter_pred_params[ref].use_ref_padding = 1; - inter_pred_params[ref].ref_area = &ref_area[ref]; -#endif // CONFIG_OPFL_MEMBW_REDUCTION } #if !CONFIG_16_FULL_SEARCH_DMVR @@ -4671,12 +4702,28 @@ // If we signal the refinemv_flags we do not select sad0 // Set sad0 a large value so that it does not be selected +#if CONFIG_16_FULL_SEARCH_DMVR +#if CONFIG_SUBBLK_REF_EXT + const int dst_stride = REFINEMV_SUBBLOCK_WIDTH + + 2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES); +#else + const int dst_stride = REFINEMV_SUBBLOCK_WIDTH + 2 * DMVR_SEARCH_EXT_LINES; +#endif // CONFIG_SUBBLK_REF_EXT + int sad0 = INT32_MAX >> 1; + if 
(!switchable_refinemv_flags) { + av1_refinemv_build_predictors( + xd, mi_x, mi_y, mc_buf, calc_subpel_params_func, dst_ref0, dst_ref1, + dst_stride, center_mvs[0], center_mvs[1], inter_pred_params); + sad0 = get_refinemv_sad(dst_ref0, dst_ref1, dst_stride, bw, bh, xd->bd); + } +#else int sad0 = switchable_refinemv_flags ? (INT32_MAX >> 1) : av1_refinemv_build_predictors_and_get_sad( xd, bw, bh, mi_x, mi_y, mc_buf, calc_subpel_params_func, dst_ref0, dst_ref1, center_mvs[0], center_mvs[1], inter_pred_params); +#endif // CONFIG_16_FULL_SEARCH_DMVR #if !CONFIG_SUBBLK_REF_EXT assert(IMPLIES(mi->ref_frame[0] == TIP_FRAME, bw == 8 && bh == 8)); #endif // !CONFIG_SUBBLK_REF_EXT @@ -4694,9 +4741,9 @@ } int min_sad = sad0; - MV refined_mv0, refined_mv1; - refined_mv0 = center_mvs[0]; - refined_mv1 = center_mvs[1]; + MV refined_mv[2]; + refined_mv[0] = center_mvs[0]; + refined_mv[1] = center_mvs[1]; #if CONFIG_16_FULL_SEARCH_DMVR static const MV neighbors[DMVR_SEARCH_NUM_NEIGHBORS] = { @@ -4705,17 +4752,40 @@ { -2, 0 }, { -2, -1 }, { 0, 2 }, { 0, -2 } }; MV best_offset = { 0, 0 }; + // Prediction is generated at once for (bw+4) x (bh+4) block, by extending 2 + // samples (search range of the refinement stage) on each side. Later, the + // prediction buffers are appropriately offset for SAD calculation. 
+ const int ext_bw = bw + 4; + const int ext_bh = bh + 4; + for (int ref = 0; ref < 2; ref++) { +#if CONFIG_OPFL_MEMBW_REDUCTION + inter_pred_params[ref].use_ref_padding = 1; + inter_pred_params[ref].ref_area = &ref_area[ref]; +#endif // CONFIG_OPFL_MEMBW_REDUCTION + inter_pred_params[ref].block_width = ext_bw; + inter_pred_params[ref].block_height = ext_bh; +#if CONFIG_REFINEMV + inter_pred_params[ref].original_pu_width = pu_width + 4; + inter_pred_params[ref].original_pu_height = pu_height + 4; +#endif // CONFIG_REFINEMV + refined_mv[ref].row -= 8 * DMVR_SEARCH_EXT_LINES; + refined_mv[ref].col -= 8 * DMVR_SEARCH_EXT_LINES; + } + + av1_refinemv_build_predictors(xd, mi_x, mi_y, mc_buf, calc_subpel_params_func, + dst_ref0, dst_ref1, dst_stride, refined_mv[0], + refined_mv[1], inter_pred_params); + for (int idx = 0; idx < DMVR_SEARCH_NUM_NEIGHBORS; ++idx) { const MV offset = { neighbors[idx].row, neighbors[idx].col }; - refined_mv0.row = center_mvs[0].row + 8 * offset.row; - refined_mv0.col = center_mvs[0].col + 8 * offset.col; - refined_mv1.row = center_mvs[1].row - 8 * offset.row; - refined_mv1.col = center_mvs[1].col - 8 * offset.col; + uint16_t *dst_ref0_offset = + dst_ref0 + (2 + offset.row) * dst_stride + 2 + offset.col; + uint16_t *dst_ref1_offset = + dst_ref1 + (2 - offset.row) * dst_stride + 2 - offset.col; - const int this_sad = av1_refinemv_build_predictors_and_get_sad( - xd, bw, bh, mi_x, mi_y, mc_buf, calc_subpel_params_func, dst_ref0, - dst_ref1, refined_mv0, refined_mv1, inter_pred_params); + const int this_sad = get_refinemv_sad(dst_ref0_offset, dst_ref1_offset, + dst_stride, bw, bh, xd->bd); if (this_sad < min_sad) { min_sad = this_sad; @@ -4729,6 +4799,7 @@ best_mv_ref[1].col = center_mvs[1].col - 8 * best_offset.col; #else + (void)ref_area; int et_sad_th = (bw * bh) << 1; #if !SINGLE_STEP_SEARCH uint8_t already_searched[5][5]; @@ -4772,14 +4843,14 @@ if (already_searched[offset.row + search_range][offset.col + search_range]) continue; #endif - 
refined_mv0.row = center_mvs[0].row + 8 * offset.row; - refined_mv0.col = center_mvs[0].col + 8 * offset.col; - refined_mv1.row = center_mvs[1].row - 8 * offset.row; - refined_mv1.col = center_mvs[1].col - 8 * offset.col; + refined_mv[0].row = center_mvs[0].row + 8 * offset.row; + refined_mv[0].col = center_mvs[0].col + 8 * offset.col; + refined_mv[1].row = center_mvs[1].row - 8 * offset.row; + refined_mv[1].col = center_mvs[1].col - 8 * offset.col; int this_sad = av1_refinemv_build_predictors_and_get_sad( xd, bw, bh, mi_x, mi_y, mc_buf, calc_subpel_params_func, dst_ref0, - dst_ref1, refined_mv0, refined_mv1, inter_pred_params); + dst_ref1, refined_mv[0], refined_mv[1], inter_pred_params); #if !SINGLE_STEP_SEARCH already_searched[offset.row + search_range][offset.col + search_range] = 1; @@ -4791,8 +4862,8 @@ // if the SAD is less than predefined threshold consider this candidate // as good enough to skip rest of the search. if (min_sad < et_sad_th) { - best_mv_ref[0] = refined_mv0; - best_mv_ref[1] = refined_mv1; + best_mv_ref[0] = refined_mv[0]; + best_mv_ref[1] = refined_mv[1]; return; } } @@ -5374,16 +5445,24 @@ AOMMIN(REFINEMV_SUBBLOCK_HEIGHT >> pd->subsampling_y, bh); #if CONFIG_SUBBLK_REF_EXT uint16_t - dst0_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + 2 * SUBBLK_REF_EXT_LINES) * - (REFINEMV_SUBBLOCK_HEIGHT + 2 * SUBBLK_REF_EXT_LINES)]; + dst0_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + + 2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES)) * + (REFINEMV_SUBBLOCK_HEIGHT + + 2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES))]; uint16_t - dst1_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + 2 * SUBBLK_REF_EXT_LINES) * - (REFINEMV_SUBBLOCK_HEIGHT + 2 * SUBBLK_REF_EXT_LINES)]; + dst1_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + + 2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES)) * + (REFINEMV_SUBBLOCK_HEIGHT + + 2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES))]; #else uint16_t - dst0_16_refinemv[REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT]; + 
dst0_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + 2 * DMVR_SEARCH_EXT_LINES) * + (REFINEMV_SUBBLOCK_HEIGHT + + 2 * DMVR_SEARCH_EXT_LINES)]; uint16_t - dst1_16_refinemv[REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT]; + dst1_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + 2 * DMVR_SEARCH_EXT_LINES) * + (REFINEMV_SUBBLOCK_HEIGHT + + 2 * DMVR_SEARCH_EXT_LINES)]; #endif // CONFIG_SUBBLK_REF_EXT ReferenceArea ref_area[2];
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h index 8f93b99..a77e6ab 100644 --- a/av1/common/reconinter.h +++ b/av1/common/reconinter.h
@@ -915,13 +915,23 @@ #if CONFIG_REFINEMV // Compute the SAD between the two predictors when refinemv is ON -int get_refinemv_sad(uint16_t *src1, uint16_t *src2, int width, int height, - int bd); -// Genrate two prediction signals and compute SAD of a given mv0 and mv1 +int get_refinemv_sad(uint16_t *src1, uint16_t *src2, int stride, int width, + int height, int bd); +#if CONFIG_16_FULL_SEARCH_DMVR +// Generate two prediction signals of a given mv0 and mv1 +void av1_refinemv_build_predictors(MACROBLOCKD *xd, int mi_x, int mi_y, + uint16_t **mc_buf, + CalcSubpelParamsFunc calc_subpel_params_func, + uint16_t *dst_ref0, uint16_t *dst_ref1, + int dst_stride, MV mv0, MV mv1, + InterPredParams *inter_pred_params); +#else +// Generate two prediction signals and compute SAD of a given mv0 and mv1 int av1_refinemv_build_predictors_and_get_sad( MACROBLOCKD *xd, int bw, int bh, int mi_x, int mi_y, uint16_t **mc_buf, CalcSubpelParamsFunc calc_subpel_params_func, uint16_t *dst_ref0, uint16_t *dst_ref1, MV mv0, MV mv1, InterPredParams *inter_pred_params); +#endif // CONFIG_16_FULL_SEARCH_DMVR // Get the context index to code refinemv flag int av1_get_refinemv_context(const AV1_COMMON *cm, const MACROBLOCKD *xd,
diff --git a/av1/common/tip.c b/av1/common/tip.c index 5f18297..ddf28bf 100644 --- a/av1/common/tip.c +++ b/av1/common/tip.c
@@ -1194,14 +1194,22 @@ #if CONFIG_REFINEMV #if CONFIG_SUBBLK_REF_EXT uint16_t - dst0_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + 2 * SUBBLK_REF_EXT_LINES) * - (REFINEMV_SUBBLOCK_HEIGHT + 2 * SUBBLK_REF_EXT_LINES)]; + dst0_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + + 2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES)) * + (REFINEMV_SUBBLOCK_HEIGHT + + 2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES))]; uint16_t - dst1_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + 2 * SUBBLK_REF_EXT_LINES) * - (REFINEMV_SUBBLOCK_HEIGHT + 2 * SUBBLK_REF_EXT_LINES)]; + dst1_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + + 2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES)) * + (REFINEMV_SUBBLOCK_HEIGHT + + 2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES))]; #else - uint16_t dst0_16_refinemv[REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT]; - uint16_t dst1_16_refinemv[REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT]; + uint16_t + dst0_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + 2 * DMVR_SEARCH_EXT_LINES) * + (REFINEMV_SUBBLOCK_HEIGHT + 2 * DMVR_SEARCH_EXT_LINES)]; + uint16_t + dst1_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + 2 * DMVR_SEARCH_EXT_LINES) * + (REFINEMV_SUBBLOCK_HEIGHT + 2 * DMVR_SEARCH_EXT_LINES)]; #endif // CONFIG_SUBBLK_REF_EXT #if CONFIG_TIP_LD const int apply_refinemv = (plane == 0 && cm->has_both_sides_refs);
diff --git a/av1/common/x86/bawp_avx2.c b/av1/common/x86/bawp_avx2.c new file mode 100644 index 0000000..be191bc --- /dev/null +++ b/av1/common/x86/bawp_avx2.c
@@ -0,0 +1,95 @@ +/* + * Copyright (c) 2025, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 3-Clause Clear License + * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear + * License was not distributed with this source code in the LICENSE file, you + * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the + * Alliance for Open Media Patent License 1.0 was not distributed with this + * source code in the PATENTS file, you can obtain it at + * aomedia.org/license/patent-license/. + */ + +#include <immintrin.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#if CONFIG_BAWP +void av1_make_bawp_block_avx2(uint16_t *dst, int dst_stride, int16_t alpha, + int32_t beta, int shift, int bw, int bh, int bd) { + const __m256i alpha_reg = _mm256_set1_epi32((int)alpha); + const __m256i beta_reg = _mm256_set1_epi32(beta); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + if (bw == 4 && ((bh & 3) == 0)) { + for (int j = 0; j < bh; j += 4) { + // d00 d01 d02 d03 + const __m128i dst_0 = _mm_cvtepu16_epi32( + _mm_loadl_epi64((const __m128i *)(&dst[j * dst_stride]))); + // d10 d11 d12 d13 + const __m128i dst_1 = _mm_cvtepu16_epi32( + _mm_loadl_epi64((const __m128i *)(&dst[(j + 1) * dst_stride]))); + // d00 d01 d02 d03 | d10 d11 d12 d13 + const __m256i dst_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(dst_0), dst_1, 1); + // d20 d21 d22 d23 + const __m128i dst_2 = _mm_cvtepu16_epi32( + _mm_loadl_epi64((const __m128i *)(&dst[(j + 2) * dst_stride]))); + // d30 d31 d32 d33 + const __m128i dst_3 = _mm_cvtepu16_epi32( + _mm_loadl_epi64((const __m128i *)(&dst[(j + 3) * dst_stride]))); + // d20 d21 d22 d23 | d30 d31 d32 d33 + const __m256i dst_23 = + _mm256_inserti128_si256(_mm256_castsi128_si256(dst_2), dst_3, 1); + + const __m256i res_0 = _mm256_srai_epi32( + _mm256_add_epi32(beta_reg, _mm256_mullo_epi32(dst_01, alpha_reg)), + shift); + const __m256i res_1 = _mm256_srai_epi32( + _mm256_add_epi32(beta_reg, _mm256_mullo_epi32(dst_23, alpha_reg)), + shift); + // 00 01 02 03 | 20 21 22 23 | 10 11 12 13 | 30 31 32 33 + const __m256i res_2 = _mm256_packus_epi32(res_0, res_1); + const __m256i res = _mm256_min_epu16(res_2, clip_pixel); + const __m128i res_lo = _mm256_castsi256_si128(res); + const __m128i res_hi = _mm256_extracti128_si256(res, 1); + + _mm_storel_epi64((__m128i *)(&dst[j * dst_stride]), res_lo); + _mm_storel_epi64((__m128i *)(&dst[(j + 1) * dst_stride]), res_hi); + _mm_storel_epi64((__m128i *)(&dst[(j + 2) * dst_stride]), + _mm_srli_si128(res_lo, 8)); + _mm_storel_epi64((__m128i *)(&dst[(j + 3) * dst_stride]), + _mm_srli_si128(res_hi, 8)); + } + } else if (((bw & 7) == 0) && ((bh & 1) == 0)) { + for (int j = 0; j < bh; j += 2) { + for (int i = 0; i < bw; i += 8) { + // d00 d01 d02 d03 d04 d05 d06 d07 + const __m256i dst_0 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((const __m128i *)(&dst[j * dst_stride + i]))); + 
// d10 d11 d12 d13 d14 d15 d16 d17 + const __m256i dst_1 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((const __m128i *)(&dst[(j + 1) * dst_stride + i]))); + + const __m256i res_0 = _mm256_srai_epi32( + _mm256_add_epi32(beta_reg, _mm256_mullo_epi32(dst_0, alpha_reg)), + shift); + const __m256i res_1 = _mm256_srai_epi32( + _mm256_add_epi32(beta_reg, _mm256_mullo_epi32(dst_1, alpha_reg)), + shift); + const __m256i res_2 = + _mm256_permute4x64_epi64(_mm256_packus_epi32(res_0, res_1), 0xD8); + const __m256i res = _mm256_min_epu16(res_2, clip_pixel); + + _mm_storeu_si128((__m128i *)(&dst[j * dst_stride + i]), + _mm256_castsi256_si128(res)); + _mm_storeu_si128((__m128i *)(&dst[(j + 1) * dst_stride + i]), + _mm256_extracti128_si256(res, 1)); + } + } + } else { + av1_make_bawp_block_c(dst, dst_stride, alpha, beta, shift, bw, bh, bd); + } +} +#endif // CONFIG_BAWP
diff --git a/av1/common/x86/highbd_warp_affine_avx2.c b/av1/common/x86/highbd_warp_affine_avx2.c index 0a8ddc2..c2c827a 100644 --- a/av1/common/x86/highbd_warp_affine_avx2.c +++ b/av1/common/x86/highbd_warp_affine_avx2.c
@@ -33,105 +33,185 @@ }; #endif // CONFIG_EXT_WARP_FILTER +DECLARE_ALIGNED(32, static const uint8_t, warp_highbd_shuffle_pattern[32]) = { + 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, + 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +}; + +DECLARE_ALIGNED(32, static const uint8_t, warp_highbd_arrange_bytes[32]) = { + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 +}; + DECLARE_ALIGNED(32, static const uint8_t, shuffle_input_mask[32]) = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 }; -DECLARE_ALIGNED(32, static const uint8_t, - shuffle_gamma0_mask0[32]) = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, - 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, - 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 }; +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_gamma0_mask0[32]) = { + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +}; -DECLARE_ALIGNED(32, static const uint8_t, - shuffle_gamma0_mask1[32]) = { 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, - 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, - 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 }; +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_gamma0_mask1[32]) = { + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 +}; -DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2[32]) = { +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_gamma0_mask2[32]) = { 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 }; -DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3[32]) = { +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_gamma0_mask3[32]) = { 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 }; -static INLINE void prepare_vertical_filter_coeffs_avx2(int sy, int gamma, - __m256i *coeffs) { - 
// A7 A6 A5 A4 A3 A2 A1 A0 - __m256i v_coeff01 = _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)av1_warped_filter[(sy) >> WARPEDDIFF_PREC_BITS])); - // B7 B6 B5 B4 B3 B2 B1 B0 | A7 A6 A5 A4 A3 A2 A1 A0 - v_coeff01 = _mm256_inserti128_si256( - v_coeff01, +static INLINE void prepare_8tap_filter_coeffs_alpha0_gamma0_avx2( + int32_t s, __m256i *coeffs) { + // c0 c1 c2 c3 c4 c5 c6 c7 | c0 c1 c2 c3 c4 c5 c6 c7 + const __m256i v_coeff = _mm256_broadcastsi128_si256(_mm_loadu_si128( + (__m128i *)av1_warped_filter[(s) >> WARPEDDIFF_PREC_BITS])); + + coeffs[0] = _mm256_shuffle_epi8( + v_coeff, _mm256_load_si256((__m256i *)shuffle_alpha0_gamma0_mask0)); + coeffs[1] = _mm256_shuffle_epi8( + v_coeff, _mm256_load_si256((__m256i *)shuffle_alpha0_gamma0_mask1)); + coeffs[2] = _mm256_shuffle_epi8( + v_coeff, _mm256_load_si256((__m256i *)shuffle_alpha0_gamma0_mask2)); + coeffs[3] = _mm256_shuffle_epi8( + v_coeff, _mm256_load_si256((__m256i *)shuffle_alpha0_gamma0_mask3)); +} + +static INLINE void prepare_8tap_filter_coeffs_avx2(int32_t s, int offset, + __m256i *coeffs) { + // c00 c01 c02 c03 c04 c05 c06 c07 | x x x x x x x x + const __m256i v_coeff0 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)av1_warped_filter[(s) >> WARPEDDIFF_PREC_BITS])); + // c00 c01 c02 c03 c04 c05 c06 c07 | c10 c11 c12 c13 c14 c15 c16 c17 + const __m256i v_coeff01 = _mm256_inserti128_si256( + v_coeff0, _mm_loadu_si128( - (__m128i *)av1_warped_filter[(sy + gamma) >> WARPEDDIFF_PREC_BITS]), + (__m128i *)av1_warped_filter[(s + offset) >> WARPEDDIFF_PREC_BITS]), 1); - // C7 C6 C5 C4 C3 C2 C1 C0 - __m256i v_coeff23 = _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)av1_warped_filter[(sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS])); - // D7 D6 D5 D4 D3 D2 D1 D0 | C7 C6 C5 C4 C3 C2 C1 C0 - v_coeff23 = _mm256_inserti128_si256( - v_coeff23, + // c20 c21 c22 c23 c24 c25 c26 c27 | x x x x x x x x + const __m256i v_coeff2 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)av1_warped_filter[(s + 2 * 
offset) >> WARPEDDIFF_PREC_BITS])); + // c20 c21 c22 c23 c24 c25 c26 c27 | c30 c31 c32 c33 c34 c35 c36 c37 + const __m256i v_coeff23 = _mm256_inserti128_si256( + v_coeff2, _mm_loadu_si128( (__m128i *) - av1_warped_filter[(sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS]), + av1_warped_filter[(s + 3 * offset) >> WARPEDDIFF_PREC_BITS]), 1); - // E7 E6 E5 E4 E3 E2 E1 E0 - __m256i v_coeff45 = _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)av1_warped_filter[(sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS])); - // F7 F6 F5 F4 F3 F2 F1 F0 | E7 E6 E5 E4 E3 E2 E1 E0 - v_coeff45 = _mm256_inserti128_si256( - v_coeff45, + // c40 c41 c42 c43 c44 c45 c46 c47 | x x x x x x x x + const __m256i v_coeff4 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)av1_warped_filter[(s + 4 * offset) >> WARPEDDIFF_PREC_BITS])); + // c40 c41 c42 c43 c44 c45 c46 c47 | c50 c51 c52 c53 c54 c55 c56 c57 + const __m256i v_coeff45 = _mm256_inserti128_si256( + v_coeff4, _mm_loadu_si128( (__m128i *) - av1_warped_filter[(sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS]), + av1_warped_filter[(s + 5 * offset) >> WARPEDDIFF_PREC_BITS]), 1); - // G7 G6 G5 G4 G3 G2 G1 G0 - __m256i v_coeff67 = _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)av1_warped_filter[(sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS])); - // H7 H6 H5 H4 H3 H2 H1 H0 | G7 G6 G5 G4 G3 G2 G1 G0 - v_coeff67 = _mm256_inserti128_si256( - v_coeff67, + // c60 c61 c62 c63 c64 c65 c66 c67 | x x x x x x x x + const __m256i v_coeff6 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)av1_warped_filter[(s + 6 * offset) >> WARPEDDIFF_PREC_BITS])); + // c60 c61 c62 c63 c64 c65 c66 c67 | c70 c71 c72 c73 c74 c75 c76 c77 + const __m256i v_coeff67 = _mm256_inserti128_si256( + v_coeff6, _mm_loadu_si128( (__m128i *) - av1_warped_filter[(sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS]), + av1_warped_filter[(s + 7 * offset) >> WARPEDDIFF_PREC_BITS]), 1); - // D3 D2 B3 B2 D1 D0 B1 B0 | C3 C2 A3 A2 C1 C0 A1 A0 + __m256i v_c0123 = _mm256_unpacklo_epi32(v_coeff01, v_coeff23); - // D7 
D6 B7 B6 D5 D4 B5 B4 | C7 C6 A7 A6 C5 C4 A5 A4 __m256i v_c0123u = _mm256_unpackhi_epi32(v_coeff01, v_coeff23); - // H3 H2 F3 F2 H1 H0 F1 F0 | G3 G2 E3 E2 G1 G0 E1 E0 __m256i v_c4567 = _mm256_unpacklo_epi32(v_coeff45, v_coeff67); - // H7 H6 F7 F6 H5 H4 F5 F4 | G7 G6 E7 E6 G5 G4 E5 E4 __m256i v_c4567u = _mm256_unpackhi_epi32(v_coeff45, v_coeff67); + // c00 c01 c20 c21 c40 c41 c60 c61 | c10 c11 c30 c31 c50 c51 c70 c71 coeffs[0] = _mm256_unpacklo_epi64(v_c0123, v_c4567); + // c02 c03 c22 c23 c42 c43 c62 c63 | c12 c13 c32 c33 c52 c53 c72 c73 coeffs[1] = _mm256_unpackhi_epi64(v_c0123, v_c4567); + // c04 c05 c24 c25 c44 c45 c64 c65 | c14 c15 c34 c35 c54 c55 c74 c75 coeffs[2] = _mm256_unpacklo_epi64(v_c0123u, v_c4567u); + // c06 c07 c26 c27 c46 c47 c66 c67 | c16 c17 c36 c37 c56 c57 c76 c77 coeffs[3] = _mm256_unpackhi_epi64(v_c0123u, v_c4567u); } -static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int32_t sy, - __m256i *coeffs) { - __m256i v_coeff = _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)av1_warped_filter[(sy) >> WARPEDDIFF_PREC_BITS])); - v_coeff = - _mm256_inserti128_si256(v_coeff, _mm256_castsi256_si128(v_coeff), 1); - - coeffs[0] = _mm256_shuffle_epi8( - v_coeff, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0)); - coeffs[1] = _mm256_shuffle_epi8( - v_coeff, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1)); - coeffs[2] = _mm256_shuffle_epi8( - v_coeff, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2)); - coeffs[3] = _mm256_shuffle_epi8( - v_coeff, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3)); +static INLINE void load_horiz_src_pixels_avx2(const uint16_t *ref, __m256i *r) { + r[0] = _mm256_loadu_si256((__m256i *)ref); + r[1] = _mm256_loadu_si256((__m256i *)(ref + 1)); } -static INLINE void prepare_input_data(__m256i *input, __m256i *src) { +static INLINE void prepare_8tap_horiz_src_padded_avx2(const uint16_t *ref, + int out_of_boundary_left, + int out_of_boundary_right, + __m256i *src_padded) { + const __m256i src_0 = 
_mm256_loadu_si256((__m256i *)ref); + + const __m256i src_01 = _mm256_shuffle_epi8( + src_0, _mm256_loadu_si256((__m256i *)warp_highbd_arrange_bytes)); + __m256i src_reg = _mm256_permute4x64_epi64(src_01, 0xD8); + + if (out_of_boundary_left >= 0) { + const __m128i shuffle_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + const __m256i shuffle_reg_left = _mm256_broadcastsi128_si256(shuffle_left); + src_reg = _mm256_shuffle_epi8(src_reg, shuffle_reg_left); + } + + if (out_of_boundary_right >= 0) { + const __m128i shuffle_right = + _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]); + const __m256i shuffle_reg_right = + _mm256_broadcastsi128_si256(shuffle_right); + src_reg = _mm256_shuffle_epi8(src_reg, shuffle_reg_right); + } + + src_padded[0] = _mm256_shuffle_epi8( + _mm256_permute4x64_epi64(src_reg, 0xD8), + _mm256_loadu_si256((__m256i *)warp_highbd_shuffle_pattern)); + __m256i src_padded_hi = _mm256_permute4x64_epi64(src_padded[0], 0xEE); + src_padded[1] = _mm256_alignr_epi8(src_padded_hi, src_padded[0], 2); +} + +static INLINE void prepare_8tap_horiz_src_avx2(const __m256i *input, + __m256i *src) { + // r0 r1 r2 r3 r4 r5 r6 r7 | r1 r2 r3 r4 r5 r6 r7 r8 + const __m256i r_low = _mm256_permute2x128_si256(input[0], input[1], 0x20); + // r8 r9 r10 r11 r12 r13 r14 r15 | r9 r10 r11 r12 r13 r14 r15 r16 + const __m256i r_high = _mm256_permute2x128_si256(input[0], input[1], 0x31); + + // r0 r1 r2 r3 r4 r5 r6 r7 | r1 r2 r3 r4 r5 r6 r7 r8 + src[0] = r_low; + // r2 r3 r4 r5 r6 r7 r8 r9 | r3 r4 r5 r6 r7 r8 r9 r10 + src[1] = _mm256_alignr_epi8(r_high, r_low, 4); + // r4 r5 r6 r7 r8 r9 r10 r11 | r5 r6 r7 r8 r9 r10 r11 r12 + src[2] = _mm256_alignr_epi8(r_high, r_low, 8); + // r6 r7 r8 r9 r10 r11 r12 r13 | r7 r8 r9 r10 r11 r12 r13 r14 + src[3] = _mm256_alignr_epi8(r_high, r_low, 12); +} + +static INLINE void filter_src_pixels_horiz_avx2(const __m256i *in, + const __m256i *coeffs, + const __m256i *offset, + int shift, __m256i *out) { + const 
__m256i res_0 = _mm256_madd_epi16(in[0], coeffs[0]); + const __m256i res_1 = _mm256_madd_epi16(in[1], coeffs[1]); + const __m256i res_2 = _mm256_madd_epi16(in[2], coeffs[2]); + const __m256i res_3 = _mm256_madd_epi16(in[3], coeffs[3]); + + const __m256i res_4 = _mm256_add_epi32( + res_0, _mm256_add_epi32(_mm256_add_epi32(res_2, res_3), res_1)); + const __m256i res = _mm256_add_epi32(res_4, *offset); + *out = _mm256_srai_epi32(res, shift); +} + +static INLINE void prepare_8tap_vert_src_avx2(const __m256i *input, + __m256i *src) { __m256i input_01 = _mm256_packus_epi32(input[0], input[1]); __m256i input_23 = _mm256_packus_epi32(input[2], input[3]); __m256i input_45 = _mm256_packus_epi32(input[4], input[5]); @@ -189,11 +269,10 @@ static INLINE void store_vertical_filter_output_avx2( uint16_t *pred, int p_stride, ConvolveParams *conv_params, int bd, const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const, - const __m256i *reduce_bits_vert_const, - const __m128i *reduce_bits_vert_shift, const int use_wtd_comp_avg, + const __m256i *reduce_bits_vert_const, const int use_wtd_comp_avg, const __m256i *wt0, const __m256i *wt1, const __m256i *res_sub_const, - const __m256i *round_bits_const, __m128i *round_bits_shift, int i, int j, - int k, const int reduce_bits_vert) { + const __m256i *round_bits_const, int i, int j, int k, + const int reduce_bits_vert, const int round_bits) { const __m256i clip_pixel = _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); __m256i v_sum = *res_lo; @@ -207,12 +286,11 @@ (__m128i *)&conv_params->dst[(i + k + 5) * conv_params->dst_stride + j]; v_sum = _mm256_add_epi32(v_sum, *res_add_const); - v_sum = _mm256_sra_epi32(_mm256_add_epi32(v_sum, *reduce_bits_vert_const), - *reduce_bits_vert_shift); + v_sum = _mm256_srai_epi32(_mm256_add_epi32(v_sum, *reduce_bits_vert_const), + reduce_bits_vert); v_sum_r1 = _mm256_add_epi32(v_sum_r1, *res_add_const); - v_sum_r1 = - _mm256_sra_epi32(_mm256_add_epi32(v_sum_r1, *reduce_bits_vert_const), - *reduce_bits_vert_shift); + v_sum_r1 = _mm256_srai_epi32( + _mm256_add_epi32(v_sum_r1, *reduce_bits_vert_const), reduce_bits_vert); if (conv_params->do_average) { __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; __m128i *const dst16_r1 = (__m128i *)&pred[(i + k + 5) * p_stride + j]; @@ -236,11 +314,11 @@ } __m256i v_sum1 = _mm256_add_epi32(v_sum, *res_sub_const); - v_sum1 = _mm256_sra_epi32(_mm256_add_epi32(v_sum1, *round_bits_const), - *round_bits_shift); + v_sum1 = _mm256_srai_epi32(_mm256_add_epi32(v_sum1, *round_bits_const), + round_bits); __m256i v_sum1_r1 = _mm256_add_epi32(v_sum_r1, *res_sub_const); - v_sum1_r1 = _mm256_sra_epi32( - _mm256_add_epi32(v_sum1_r1, *round_bits_const), *round_bits_shift); + v_sum1_r1 = _mm256_srai_epi32( + _mm256_add_epi32(v_sum1_r1, *round_bits_const), round_bits); __m256i v_sum16 = _mm256_packus_epi32(v_sum1, v_sum1_r1); v_sum16 = @@ -317,14 +395,12 @@ // into an unsigned 16-bit intermediate array. 
assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); - const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert); const __m256i reduce_bits_vert_const = _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1)); const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert); const __m256i res_sub_const = _mm256_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) - (1 << (offset_bits - conv_params->round_1 - 1))); - __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); __m256i round_bits_const = _mm256_set1_epi32(((1 << round_bits) >> 1)); const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params); @@ -334,9 +410,10 @@ const __m256i wt1 = _mm256_set1_epi32(w1); __m256i v_rbhoriz = _mm256_set1_epi32(1 << (reduce_bits_horiz - 1)); - __m256i v_zeros = _mm256_setzero_si256(); int ohoriz = 1 << offset_bits_horiz; int mhoriz = 1 << max_bits_horiz; + const __m256i v_offset_bits_horiz = _mm256_set1_epi32(ohoriz); + const __m256i offset = _mm256_add_epi32(v_offset_bits_horiz, v_rbhoriz); (void)mhoriz; int sx; @@ -355,9 +432,9 @@ const int64_t x4 = dst_x >> subsampling_x; const int64_t y4 = dst_y >> subsampling_y; - const int16_t ix4 = (int16_t)(x4 >> WARPEDMODEL_PREC_BITS); + const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - const int16_t iy4 = (int16_t)(y4 >> WARPEDMODEL_PREC_BITS); + const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); #if CONFIG_RELAX_AFFINE_CONSTRAINTS @@ -424,18 +501,70 @@ #endif // CONFIG_OPFL_MEMBW_REDUCTION } #if CONFIG_OPFL_MEMBW_REDUCTION - } else if (((ix4 - 7) < left_limit) || ((ix4 + 8) > right_limit)) { + } else if (((ix4 - 7) < left_limit) || ((ix4 + 7) > right_limit)) { + const int out_of_boundary_left = left_limit - (ix4 - 6); + const int out_of_boundary_right = (ix4 + 7) - right_limit; + __m256i src_padded[2], coeffs[4], src[4]; + if (alpha == 0 && beta == 0) { + 
prepare_8tap_filter_coeffs_alpha0_gamma0_avx2(sx4, &coeffs[0]); + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + iy = clamp(iy, top_limit, bottom_limit); + + prepare_8tap_horiz_src_padded_avx2( + &ref[iy * stride + ix4 - 7], out_of_boundary_left, + out_of_boundary_right, &src_padded[0]); + prepare_8tap_horiz_src_avx2(&src_padded[0], &src[0]); + filter_src_pixels_horiz_avx2(src, coeffs, &offset, + reduce_bits_horiz, &tmp[k + 7]); + } + } else if (alpha == 0) { + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + iy = clamp(iy, top_limit, bottom_limit); + + sx = sx4 + beta * (k + 4); + prepare_8tap_filter_coeffs_alpha0_gamma0_avx2(sx, coeffs); + prepare_8tap_horiz_src_padded_avx2( + &ref[iy * stride + ix4 - 7], out_of_boundary_left, + out_of_boundary_right, &src_padded[0]); + prepare_8tap_horiz_src_avx2(&src_padded[0], &src[0]); + filter_src_pixels_horiz_avx2(src, coeffs, &offset, + reduce_bits_horiz, &tmp[k + 7]); + } + } else if (beta == 0) { + prepare_8tap_filter_coeffs_avx2(sx4, alpha, &coeffs[0]); + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + iy = clamp(iy, top_limit, bottom_limit); + + prepare_8tap_horiz_src_padded_avx2( + &ref[iy * stride + ix4 - 7], out_of_boundary_left, + out_of_boundary_right, &src_padded[0]); + prepare_8tap_horiz_src_avx2(&src_padded[0], &src[0]); + filter_src_pixels_horiz_avx2(src, coeffs, &offset, + reduce_bits_horiz, &tmp[k + 7]); + } + } else { + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + iy = clamp(iy, top_limit, bottom_limit); + + sx = sx4 + beta * (k + 4); + prepare_8tap_filter_coeffs_avx2(sx, alpha, &coeffs[0]); + prepare_8tap_horiz_src_padded_avx2( + &ref[iy * stride + ix4 - 7], out_of_boundary_left, + out_of_boundary_right, &src_padded[0]); + prepare_8tap_horiz_src_avx2(&src_padded[0], &src[0]); + filter_src_pixels_horiz_avx2(src, coeffs, &offset, + reduce_bits_horiz, &tmp[k + 7]); + } + } #else } else if (((ix4 - 7) 
< 0) || ((ix4 + 9) > width)) { -#endif // CONFIG_OPFL_MEMBW_REDUCTION int32_t tmp1[8]; for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { -#if CONFIG_OPFL_MEMBW_REDUCTION - const int iy = clamp(iy4 + k, top_limit, bottom_limit); -#else const int iy = clamp(iy4 + k, 0, height - 1); -#endif // CONFIG_OPFL_MEMBW_REDUCTION - sx = sx4 + beta * (k + 4); for (int l = -4; l < 4; ++l) { int ix = ix4 + l - 3; @@ -444,11 +573,7 @@ int32_t sum = 1 << offset_bits_horiz; for (int m = 0; m < 8; ++m) { -#if CONFIG_OPFL_MEMBW_REDUCTION - const int sample_x = clamp(ix + m, left_limit, right_limit); -#else const int sample_x = clamp(ix + m, 0, width - 1); -#endif // CONFIG_OPFL_MEMBW_REDUCTION sum += ref[iy * stride + sample_x] * coeffs[m]; } sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz); @@ -457,20 +582,11 @@ } tmp[k + 7] = _mm256_loadu_si256((__m256i *)tmp1); } +#endif // CONFIG_OPFL_MEMBW_REDUCTION } else { - if (beta == 0 && alpha == 0) { - sx = sx4; - __m128i v_01 = _mm_loadu_si128( - (__m128i *) - av1_warped_filter[sx >> - WARPEDDIFF_PREC_BITS]); // A7A6A5A4A3A2A1A0 - __m256i v_c01 = _mm256_broadcastd_epi32(v_01); // A1A0A1A0A1A0A1A0 - __m256i v_c23 = _mm256_broadcastd_epi32( - _mm_shuffle_epi32(v_01, 1)); // A3A2A3A2A3A2A3A2 - __m256i v_c45 = _mm256_broadcastd_epi32( - _mm_shuffle_epi32(v_01, 2)); // A5A4A5A4A5A4A5A4 - __m256i v_c67 = _mm256_broadcastd_epi32( - _mm_shuffle_epi32(v_01, 3)); // A7A6A7A6A7A6A7A6 + __m256i r[2], coeffs[4], src[4]; + if (alpha == 0 && beta == 0) { + prepare_8tap_filter_coeffs_alpha0_gamma0_avx2(sx4, &coeffs[0]); for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; #if CONFIG_OPFL_MEMBW_REDUCTION @@ -482,44 +598,10 @@ iy = height - 1; #endif // CONFIG_OPFL_MEMBW_REDUCTION iy = iy * stride; - - __m256i v_refl = _mm256_inserti128_si256( - _mm256_set1_epi16(0), - _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0); - v_refl = _mm256_inserti128_si256( - v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]), - 1); // R15 .. 
R0 - - __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE); - - __m256i v_refu = - _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1 - v_refl = _mm256_inserti128_si256( - v_refl, _mm256_extracti128_si256(v_refu, 0), 1); - v_refu = _mm256_inserti128_si256( - v_refu, _mm256_extracti128_si256(v_ref, 0), 0); - - __m256i v_sum = _mm256_set1_epi32(ohoriz); - __m256i parsum = _mm256_madd_epi16( - v_c01, _mm256_alignr_epi8(v_refu, v_refl, - 0)); // R8R7R6..R1R7R6R5..R1R0 - __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum); - - parsum = _mm256_madd_epi16( - v_c23, - _mm256_alignr_epi8(v_refu, v_refl, 4)); // R10R9..R3R9R8..R3R2 - __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum); - parsum = _mm256_madd_epi16( - v_c45, _mm256_alignr_epi8(v_refu, v_refl, - 8)); // R12R11..R5R11R10..R5R4 - __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum); - parsum = _mm256_madd_epi16( - v_c67, _mm256_alignr_epi8(v_refu, v_refl, - 12)); // R14R13..R7R13R12..R7R6 - __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum); - - tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz), - reduce_bits_horiz); + load_horiz_src_pixels_avx2(&ref[iy + ix4 - 7], &r[0]); + prepare_8tap_horiz_src_avx2(&r[0], &src[0]); + filter_src_pixels_horiz_avx2(src, coeffs, &offset, + reduce_bits_horiz, &tmp[k + 7]); } } else if (alpha == 0) { for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { @@ -535,125 +617,15 @@ iy = iy * stride; sx = sx4 + beta * (k + 4); - - __m128i v_01 = _mm_loadu_si128( - (__m128i *)av1_warped_filter - [sx >> WARPEDDIFF_PREC_BITS]); // A7A6A5A4A3A2A1A0 - __m256i v_c01 = _mm256_broadcastd_epi32(v_01); // A1A0A1A0A1A0A1A0 - __m256i v_c23 = _mm256_broadcastd_epi32( - _mm_shuffle_epi32(v_01, 1)); // A3A2A3A2A3A2A3A2 - __m256i v_c45 = _mm256_broadcastd_epi32( - _mm_shuffle_epi32(v_01, 2)); // A5A4A5A4A5A4A5A4 - __m256i v_c67 = _mm256_broadcastd_epi32( - _mm_shuffle_epi32(v_01, 3)); // A7A6A7A6A7A6A7A6 - - __m256i v_refl = _mm256_inserti128_si256( - _mm256_set1_epi16(0), - 
_mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0); - v_refl = _mm256_inserti128_si256( - v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]), - 1); // R15 .. R0 - - __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE); - - __m256i v_refu = - _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1 - - v_refl = _mm256_inserti128_si256( - v_refl, _mm256_extracti128_si256(v_refu, 0), 1); - v_refu = _mm256_inserti128_si256( - v_refu, _mm256_extracti128_si256(v_ref, 0), 0); - - __m256i v_sum = _mm256_set1_epi32(ohoriz); - __m256i parsum = - _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0)); - __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum); - - parsum = - _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4)); - __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum); - parsum = - _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8)); - __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum); - parsum = _mm256_madd_epi16(v_c67, - _mm256_alignr_epi8(v_refu, v_refl, 12)); - __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum); - - tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz), - reduce_bits_horiz); + prepare_8tap_filter_coeffs_alpha0_gamma0_avx2(sx, coeffs); + load_horiz_src_pixels_avx2(&ref[iy + ix4 - 7], &r[0]); + prepare_8tap_horiz_src_avx2(&r[0], &src[0]); + filter_src_pixels_horiz_avx2(src, coeffs, &offset, + reduce_bits_horiz, &tmp[k + 7]); } + } else if (beta == 0) { - sx = sx4; - __m256i v_coeff01 = _mm256_inserti128_si256( - v_zeros, - _mm_loadu_si128( - (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]), - 0); - v_coeff01 = _mm256_inserti128_si256( - v_coeff01, - _mm_loadu_si128( - (__m128i *) - av1_warped_filter[(sx + alpha) >> WARPEDDIFF_PREC_BITS]), - 1); // B7B6..B1B0A7A6..A1A0 - __m256i v_coeff23 = _mm256_inserti128_si256( - v_zeros, - _mm_loadu_si128( - (__m128i *)av1_warped_filter[(sx + 2 * alpha) >> - WARPEDDIFF_PREC_BITS]), - 0); - v_coeff23 = _mm256_inserti128_si256( - v_coeff23, - _mm_loadu_si128( 
- (__m128i *)av1_warped_filter[(sx + 3 * alpha) >> - WARPEDDIFF_PREC_BITS]), - 1); // D7D6..D1D0C7C6..C1C0 - __m256i v_coeff45 = _mm256_inserti128_si256( - v_zeros, - _mm_loadu_si128( - (__m128i *)av1_warped_filter[(sx + 4 * alpha) >> - WARPEDDIFF_PREC_BITS]), - 0); - v_coeff45 = _mm256_inserti128_si256( - v_coeff45, - _mm_loadu_si128( - (__m128i *)av1_warped_filter[(sx + 5 * alpha) >> - WARPEDDIFF_PREC_BITS]), - 1); // F7F6..F1F0E7E6..E1E0 - __m256i v_coeff67 = _mm256_inserti128_si256( - v_zeros, - _mm_loadu_si128( - (__m128i *)av1_warped_filter[(sx + 6 * alpha) >> - WARPEDDIFF_PREC_BITS]), - 0); - v_coeff67 = _mm256_inserti128_si256( - v_coeff67, - _mm_loadu_si128( - (__m128i *)av1_warped_filter[(sx + 7 * alpha) >> - WARPEDDIFF_PREC_BITS]), - 1); // H7H6..H1H0G7G6..G1G0 - - __m256i v_c0123 = _mm256_unpacklo_epi32( - v_coeff01, - v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0 - __m256i v_c0123u = _mm256_unpackhi_epi32( - v_coeff01, - v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4 - __m256i v_c4567 = _mm256_unpacklo_epi32( - v_coeff45, - v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0 - __m256i v_c4567u = _mm256_unpackhi_epi32( - v_coeff45, - v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4 - - __m256i v_c01 = _mm256_unpacklo_epi64( - v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0 - __m256i v_c23 = - _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2 - __m256i v_c45 = - _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... A5A4 - __m256i v_c67 = - _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... 
A7A6 - + prepare_8tap_filter_coeffs_avx2(sx4, alpha, &coeffs[0]); for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; #if CONFIG_OPFL_MEMBW_REDUCTION @@ -665,47 +637,11 @@ iy = height - 1; #endif // CONFIG_OPFL_MEMBW_REDUCTION iy = iy * stride; - - __m256i v_refl = _mm256_inserti128_si256( - _mm256_set1_epi16(0), - _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0); - v_refl = _mm256_inserti128_si256( - v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]), - 1); // R15 .. R0 - - __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE); - - __m256i v_refu = - _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1 - - v_refl = _mm256_inserti128_si256( - v_refl, _mm256_extracti128_si256(v_refu, 0), 1); - v_refu = _mm256_inserti128_si256( - v_refu, _mm256_extracti128_si256(v_ref, 0), 0); - - __m256i v_sum = _mm256_set1_epi32(ohoriz); - __m256i parsum = _mm256_madd_epi16( - v_c01, _mm256_alignr_epi8(v_refu, v_refl, - 0)); // R8R7R6..R1R7R6R5..R1R0 - __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum); - - parsum = _mm256_madd_epi16( - v_c23, - _mm256_alignr_epi8(v_refu, v_refl, 4)); // R10R9..R3R9R8..R3R2 - __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum); - parsum = _mm256_madd_epi16( - v_c45, _mm256_alignr_epi8(v_refu, v_refl, - 8)); // R12R11..R5R11R10..R5R4 - __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum); - parsum = _mm256_madd_epi16( - v_c67, _mm256_alignr_epi8(v_refu, v_refl, - 12)); // R14R13..R7R13R12..R7R6 - __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum); - - tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz), - reduce_bits_horiz); + load_horiz_src_pixels_avx2(&ref[iy + ix4 - 7], &r[0]); + prepare_8tap_horiz_src_avx2(&r[0], &src[0]); + filter_src_pixels_horiz_avx2(src, coeffs, &offset, + reduce_bits_horiz, &tmp[k + 7]); } - } else { for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; @@ -720,111 +656,11 @@ iy = iy * stride; sx = sx4 + beta * (k + 4); - - __m256i v_coeff01 = _mm256_inserti128_si256( - 
v_zeros, - _mm_loadu_si128( - (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]), - 0); - v_coeff01 = _mm256_inserti128_si256( - v_coeff01, - _mm_loadu_si128( - (__m128i *)av1_warped_filter[(sx + alpha) >> - WARPEDDIFF_PREC_BITS]), - 1); // B7B6..B1B0A7A6..A1A0 - __m256i v_coeff23 = _mm256_inserti128_si256( - v_zeros, - _mm_loadu_si128( - (__m128i *)av1_warped_filter[(sx + 2 * alpha) >> - WARPEDDIFF_PREC_BITS]), - 0); - v_coeff23 = _mm256_inserti128_si256( - v_coeff23, - _mm_loadu_si128( - (__m128i *)av1_warped_filter[(sx + 3 * alpha) >> - WARPEDDIFF_PREC_BITS]), - 1); // D7D6..D1D0C7C6..C1C0 - __m256i v_coeff45 = _mm256_inserti128_si256( - v_zeros, - _mm_loadu_si128( - (__m128i *)av1_warped_filter[(sx + 4 * alpha) >> - WARPEDDIFF_PREC_BITS]), - 0); - v_coeff45 = _mm256_inserti128_si256( - v_coeff45, - _mm_loadu_si128( - (__m128i *)av1_warped_filter[(sx + 5 * alpha) >> - WARPEDDIFF_PREC_BITS]), - 1); // F7F6..F1F0E7E6..E1E0 - __m256i v_coeff67 = _mm256_inserti128_si256( - v_zeros, - _mm_loadu_si128( - (__m128i *)av1_warped_filter[(sx + 6 * alpha) >> - WARPEDDIFF_PREC_BITS]), - 0); - v_coeff67 = _mm256_inserti128_si256( - v_coeff67, - _mm_loadu_si128( - (__m128i *)av1_warped_filter[(sx + 7 * alpha) >> - WARPEDDIFF_PREC_BITS]), - 1); // H7H6..H1H0G7G6..G1G0 - - __m256i v_c0123 = _mm256_unpacklo_epi32( - v_coeff01, - v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0 - __m256i v_c0123u = _mm256_unpackhi_epi32( - v_coeff01, - v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4 - __m256i v_c4567 = _mm256_unpacklo_epi32( - v_coeff45, - v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0 - __m256i v_c4567u = _mm256_unpackhi_epi32( - v_coeff45, - v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4 - - __m256i v_c01 = _mm256_unpacklo_epi64( - v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0 - __m256i v_c23 = - _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2 - __m256i v_c45 = - _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... 
A5A4 - __m256i v_c67 = - _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... A7A6 - - __m256i v_refl = _mm256_inserti128_si256( - _mm256_set1_epi16(0), - _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0); - v_refl = _mm256_inserti128_si256( - v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]), - 1); // R15 .. R0 - - __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE); - - __m256i v_refu = - _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1 - - v_refl = _mm256_inserti128_si256( - v_refl, _mm256_extracti128_si256(v_refu, 0), 1); - v_refu = _mm256_inserti128_si256( - v_refu, _mm256_extracti128_si256(v_ref, 0), 0); - - __m256i v_sum = _mm256_set1_epi32(ohoriz); - __m256i parsum = - _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0)); - __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum); - - parsum = - _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4)); - __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum); - parsum = - _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8)); - __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum); - parsum = _mm256_madd_epi16(v_c67, - _mm256_alignr_epi8(v_refu, v_refl, 12)); - __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum); - - tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz), - reduce_bits_horiz); + prepare_8tap_filter_coeffs_avx2(sx, alpha, &coeffs[0]); + load_horiz_src_pixels_avx2(&ref[iy + ix4 - 7], &r[0]); + prepare_8tap_horiz_src_avx2(&r[0], &src[0]); + filter_src_pixels_horiz_avx2(src, coeffs, &offset, + reduce_bits_horiz, &tmp[k + 7]); } } } @@ -833,13 +669,13 @@ if (gamma == 0 && delta == 0) { __m256i coeffs[8], src[8]; - prepare_vertical_filter_coeffs_gamma0_avx2(sy4, coeffs); + prepare_8tap_filter_coeffs_alpha0_gamma0_avx2(sy4, coeffs); coeffs[4] = coeffs[0]; coeffs[5] = coeffs[1]; coeffs[6] = coeffs[2]; coeffs[7] = coeffs[3]; - prepare_input_data(tmp, src); + prepare_8tap_vert_src_avx2(tmp, src); for (int k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { 
__m256i v_sum_r0, v_sum_r1; @@ -848,41 +684,41 @@ store_vertical_filter_output_avx2( pred, p_stride, conv_params, bd, &v_sum_r0, &v_sum_r1, - &res_add_const, &reduce_bits_vert_const, &reduce_bits_vert_shift, - use_wtd_comp_avg, &wt0, &wt1, &res_sub_const, &round_bits_const, - &round_bits_shift, i, j, k, reduce_bits_vert); + &res_add_const, &reduce_bits_vert_const, use_wtd_comp_avg, &wt0, + &wt1, &res_sub_const, &round_bits_const, i, j, k, + reduce_bits_vert, round_bits); } } else if (gamma == 0) { __m256i coeffs[8], src[8]; - prepare_input_data(tmp, src); + prepare_8tap_vert_src_avx2(tmp, src); for (int k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { __m256i v_sum_r0, v_sum_r1; int sy_0 = sy4 + delta * (k + 4); - prepare_vertical_filter_coeffs_gamma0_avx2(sy_0, &coeffs[0]); + prepare_8tap_filter_coeffs_alpha0_gamma0_avx2(sy_0, &coeffs[0]); int sy_1 = sy4 + delta * (k + 5); - prepare_vertical_filter_coeffs_gamma0_avx2(sy_1, &coeffs[4]); + prepare_8tap_filter_coeffs_alpha0_gamma0_avx2(sy_1, &coeffs[4]); filter_src_pixels_vertical_avx2(tmp, src, coeffs, &v_sum_r0, &v_sum_r1, k); store_vertical_filter_output_avx2( pred, p_stride, conv_params, bd, &v_sum_r0, &v_sum_r1, - &res_add_const, &reduce_bits_vert_const, &reduce_bits_vert_shift, - use_wtd_comp_avg, &wt0, &wt1, &res_sub_const, &round_bits_const, - &round_bits_shift, i, j, k, reduce_bits_vert); + &res_add_const, &reduce_bits_vert_const, use_wtd_comp_avg, &wt0, + &wt1, &res_sub_const, &round_bits_const, i, j, k, + reduce_bits_vert, round_bits); } } else if (delta == 0) { __m256i coeffs[8], src[8]; - prepare_vertical_filter_coeffs_avx2(sy4, gamma, &coeffs[0]); + prepare_8tap_filter_coeffs_avx2(sy4, gamma, &coeffs[0]); coeffs[4] = coeffs[0]; coeffs[5] = coeffs[1]; coeffs[6] = coeffs[2]; coeffs[7] = coeffs[3]; - prepare_input_data(tmp, src); + prepare_8tap_vert_src_avx2(tmp, src); for (int k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { __m256i v_sum_r0, v_sum_r1; @@ -891,31 +727,31 @@ 
store_vertical_filter_output_avx2( pred, p_stride, conv_params, bd, &v_sum_r0, &v_sum_r1, - &res_add_const, &reduce_bits_vert_const, &reduce_bits_vert_shift, - use_wtd_comp_avg, &wt0, &wt1, &res_sub_const, &round_bits_const, - &round_bits_shift, i, j, k, reduce_bits_vert); + &res_add_const, &reduce_bits_vert_const, use_wtd_comp_avg, &wt0, + &wt1, &res_sub_const, &round_bits_const, i, j, k, + reduce_bits_vert, round_bits); } } else { __m256i src[8]; - prepare_input_data(tmp, src); + prepare_8tap_vert_src_avx2(tmp, src); for (int k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { __m256i coeffs[8]; __m256i v_sum_r0, v_sum_r1; int sy_0 = sy4 + delta * (k + 4); - prepare_vertical_filter_coeffs_avx2(sy_0, gamma, &coeffs[0]); + prepare_8tap_filter_coeffs_avx2(sy_0, gamma, &coeffs[0]); int sy_1 = sy4 + delta * (k + 5); - prepare_vertical_filter_coeffs_avx2(sy_1, gamma, &coeffs[4]); + prepare_8tap_filter_coeffs_avx2(sy_1, gamma, &coeffs[4]); filter_src_pixels_vertical_avx2(tmp, src, coeffs, &v_sum_r0, &v_sum_r1, k); store_vertical_filter_output_avx2( pred, p_stride, conv_params, bd, &v_sum_r0, &v_sum_r1, - &res_add_const, &reduce_bits_vert_const, &reduce_bits_vert_shift, - use_wtd_comp_avg, &wt0, &wt1, &res_sub_const, &round_bits_const, - &round_bits_shift, i, j, k, reduce_bits_vert); + &res_add_const, &reduce_bits_vert_const, use_wtd_comp_avg, &wt0, + &wt1, &res_sub_const, &round_bits_const, i, j, k, + reduce_bits_vert, round_bits); } } } @@ -934,13 +770,13 @@ // c0 c1 c0 c1 c0 c1 c0 c1 | f0 f1 f0 f1 f0 f1 f0 f1 coeff[0] = _mm256_shuffle_epi8( - filt, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0)); + filt, _mm256_load_si256((__m256i *)shuffle_alpha0_gamma0_mask0)); // c2 c3 c2 c3 c2 c3 c2 c3 | f2 f3 f2 f3 f2 f3 f2 f3 coeff[1] = _mm256_shuffle_epi8( - filt, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1)); + filt, _mm256_load_si256((__m256i *)shuffle_alpha0_gamma0_mask1)); // c4 c5 c4 c5 c4 c5 c4 c5 | f4 f5 f4 f5 f4 f5 f4 f5 coeff[2] = _mm256_shuffle_epi8( - filt, 
_mm256_load_si256((__m256i *)shuffle_gamma0_mask2)); + filt, _mm256_load_si256((__m256i *)shuffle_alpha0_gamma0_mask2)); } static INLINE void ext_highbd_warp_horizontal_filter_avx2(
diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c index 28921b5..3a473fd 100644 --- a/av1/encoder/compound_type.c +++ b/av1/encoder/compound_type.c
@@ -51,6 +51,8 @@ if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0; } + // TODO(any): Consider tools like OPFL, DMVR in the match criteria. + // Store the stats for COMPOUND_AVERAGE and COMPOUND_DISTWTD for (int comp_type = COMPOUND_AVERAGE; comp_type < COMPOUND_WEDGE; comp_type++) { @@ -1464,9 +1466,14 @@ int32_t comp_model_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX }; int64_t comp_model_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX }; int match_index = 0; + const int reuse_compound_type_data = + cpi->sf.inter_sf.reuse_compound_type_data; const int match_found = - find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist, comp_model_rate, - comp_model_dist, comp_rs2, &match_index); + reuse_compound_type_data + ? find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist, + comp_model_rate, comp_model_dist, comp_rs2, + &match_index) + : 0; best_mv[0].as_int = cur_mv[0].as_int; best_mv[1].as_int = cur_mv[1].as_int; *rd = INT64_MAX; @@ -1513,7 +1520,7 @@ #if CONFIG_REFINEMV (!mbmi->refinemv_flag || !switchable_refinemv_flag(cm, mbmi)) && #endif // CONFIG_REFINEMV - cpi->sf.inter_sf.reuse_compound_type_decision) { + (reuse_compound_type_data >= 2)) { return populate_reuse_comp_type_data(x, mbmi, &best_type_stats, cur_mv, comp_rate, comp_dist, comp_rs2, rate_mv, rd, match_index); @@ -1667,7 +1674,7 @@ } } restore_dst_buf(xd, *orig_dst, 1); - if (!match_found) + if (!match_found && reuse_compound_type_data) save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rate, comp_model_dist, cur_mv, comp_rs2); return best_type_stats.best_compmode_interinter_cost;
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c index a6dc591..06e37da 100644 --- a/av1/encoder/rdopt.c +++ b/av1/encoder/rdopt.c
@@ -11538,6 +11538,7 @@ if (search_state.best_skip2 == 0) { const int try_intrabc = cpi->oxcf.kf_cfg.enable_intrabc && cpi->oxcf.kf_cfg.enable_intrabc_ext && + !sf->inter_sf.skip_eval_intrabc_in_inter_frame && av1_allow_intrabc(cm, xd #if CONFIG_ENABLE_IBC_NAT ,
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c index 20eeceb..35501ed 100644 --- a/av1/encoder/speed_features.c +++ b/av1/encoder/speed_features.c
@@ -359,6 +359,10 @@ sf->inter_sf.reduce_inter_modes = 1; sf->inter_sf.selective_ref_frame = 1; sf->inter_sf.skip_mode_eval_based_on_rate_cost = 1; + sf->inter_sf.skip_eval_intrabc_in_inter_frame = + cm->features.allow_screen_content_tools ? 0 + : cm->current_frame.pyramid_level < 3 ? 0 + : 1; sf->intra_sf.intra_pruning_with_hog = 1; sf->intra_sf.intra_pruning_with_hog_thresh = -1.2f; @@ -366,7 +370,8 @@ sf->intra_sf.reuse_uv_mode_rd_info = true; #endif // CONFIG_AIMC - sf->tx_sf.adaptive_txb_search_level = 1; + sf->tx_sf.adaptive_tx_type_search_idx = 1; + sf->tx_sf.adaptive_tx_partition_type_search_idx = 1; sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1; sf->tx_sf.model_based_prune_tx_search_level = 1; sf->tx_sf.prune_tx_rd_eval_sec_tx_sse = true; @@ -390,7 +395,8 @@ if (speed >= 1) { sf->inter_sf.selective_ref_frame = 2; - sf->tx_sf.adaptive_txb_search_level = 2; + sf->tx_sf.adaptive_tx_type_search_idx = 4; + sf->tx_sf.adaptive_tx_partition_type_search_idx = 4; sf->inter_sf.prune_comp_search_by_single_result = boosted ? 2 : 1; } @@ -435,7 +441,8 @@ sf->intra_sf.skip_intra_dip_search = true; #endif // CONFIG_DIP - sf->tx_sf.adaptive_txb_search_level = 2; + sf->tx_sf.adaptive_tx_type_search_idx = 4; + sf->tx_sf.adaptive_tx_partition_type_search_idx = 4; sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; @@ -512,7 +519,9 @@ sf->inter_sf.selective_ref_frame = 4; sf->inter_sf.skip_repeated_ref_mv = 1; sf->inter_sf.skip_repeated_full_newmv = 1; - sf->inter_sf.reuse_compound_type_decision = 1; + // TODO(any): Set this speed feature to 2 after correcting the match + // criteria by considering tools like OPFL, DMVR. + sf->inter_sf.reuse_compound_type_data = 0; sf->inter_sf.txfm_rd_gate_level = boosted ? 0 : (is_boosted_arf2_bwd_type ? 
1 : 2); @@ -530,7 +539,8 @@ sf->tx_sf.tx_type_search.skip_stx_search = 1; sf->tx_sf.tx_type_search.skip_cctx_search = 1; - sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3; + sf->tx_sf.adaptive_tx_type_search_idx = boosted ? 4 : 5; + sf->tx_sf.adaptive_tx_partition_type_search_idx = boosted ? 4 : 5; sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2; // TODO(any): Refactor the code related to following winner mode speed @@ -814,6 +824,7 @@ inter_sf->prune_comp_search_by_single_result = 0; inter_sf->skip_repeated_ref_mv = 0; inter_sf->skip_repeated_newmv = 0; + inter_sf->skip_eval_intrabc_in_inter_frame = 0; inter_sf->skip_repeated_full_newmv = 0; inter_sf->inter_mode_rd_model_estimation = 0; inter_sf->prune_compound_using_single_ref = 0; @@ -835,7 +846,7 @@ inter_sf->disable_interinter_wedge = 0; inter_sf->prune_ref_mv_idx_search = 0; inter_sf->prune_warped_prob_thresh = 0; - inter_sf->reuse_compound_type_decision = 0; + inter_sf->reuse_compound_type_data = 0; inter_sf->txfm_rd_gate_level = 0; inter_sf->prune_inter_modes_if_skippable = 0; inter_sf->disable_masked_comp = 0; @@ -892,7 +903,8 @@ tx_sf->tx_type_search.prune_tx_type_est_rd = 0; tx_sf->tx_type_search.winner_mode_tx_type_pruning = 0; tx_sf->txb_split_cap = 1; - tx_sf->adaptive_txb_search_level = 0; + tx_sf->adaptive_tx_type_search_idx = 0; + tx_sf->adaptive_tx_partition_type_search_idx = 0; tx_sf->use_intra_txb_hash = 0; tx_sf->use_inter_txb_hash = 1; tx_sf->refine_fast_tx_search_results = 1; @@ -1359,9 +1371,17 @@ if (cpi->oxcf.mode == GOOD && speed == 0) { const int qindex_thresh = 124 + qindex_offset; + const int qindex_thresh2 = 135 + qindex_offset; if (cm->quant_params.base_qindex <= qindex_thresh) { - sf->tx_sf.adaptive_txb_search_level = + sf->tx_sf.adaptive_tx_type_search_idx = (boosted || cm->features.allow_screen_content_tools) ? 1 : 2; + sf->tx_sf.adaptive_tx_partition_type_search_idx = + (boosted || cm->features.allow_screen_content_tools) ? 
1 : 2; + } else if (cm->quant_params.base_qindex <= qindex_thresh2) { + sf->tx_sf.adaptive_tx_partition_type_search_idx = + (boosted || cm->features.allow_screen_content_tools) ? 1 : 3; + sf->tx_sf.adaptive_tx_type_search_idx = + (boosted || cm->features.allow_screen_content_tools) ? 1 : 3; } }
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h index 491af36..ecb5882 100644 --- a/av1/encoder/speed_features.h +++ b/av1/encoder/speed_features.h
@@ -662,6 +662,9 @@ // flag to skip NEWMV mode in drl if the motion search result is the same int skip_repeated_newmv; + // flag to skip the evaluation of intrabc mode in inter frame + int skip_eval_intrabc_in_inter_frame; + // flag to early terminate jmvd scaling factors int early_terminate_jmvd_scale_factor; @@ -803,8 +806,9 @@ // Reuse compound type rd decision when exact match is found // 0: No reuse - // 1: Reuse the compound type decision - int reuse_compound_type_decision; + // 1: Reuse the compound type rd data + // 2: Reuse the compound type decision + int reuse_compound_type_data; // Enable/disable masked compound. int disable_masked_comp; @@ -890,11 +894,15 @@ // is selected as all zero coefficients. int txb_split_cap; - // Shortcut the transform block partition and type search when the target - // rdcost is relatively lower. - // Values are 0 (not used) , or 1 - 2 with progressively increasing - // aggressiveness - int adaptive_txb_search_level; + // Prune transform type evaluation when target rdcost is low as + // compared to best rdcost and based on eob. + // 0: no pruning + // 1,4,5: pruning based on best rd + // 2,3: pruning based on eob and best rd + int adaptive_tx_type_search_idx; + // Prune transform partition type evaluation when target rdcost is low as + // compared to TX_PARTITION_NONE and based on the transform size. + int adaptive_tx_partition_type_search_idx; // Prune level for tx_size_type search for inter based on rd model // 0: no pruning
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c index 875461f..5a5efdb 100644 --- a/av1/encoder/tx_search.c +++ b/av1/encoder/tx_search.c
@@ -127,6 +127,16 @@ #endif // CONFIG_EXT_RECUR_PARTITIONS }; +// look-up table of transform partition type pruning level used to prune the +// evaluation of transform partition type based on none rd. +static const int tx_partition_prune_level[2][6] = { { 0, 1, 3, 3, 2, 3 }, + { 0, 1, 2, 1, 2, 3 } }; + +// look-up table of transform type pruning level used to prune the evaluation of +// transform type based on best rd and eob. +static const int tx_type_prune_level[2][6] = { { 0, 1, 2, 1, 2, 3 }, + { 0, 1, 3, 3, 2, 3 } }; + static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, const uint32_t hash) { // Linear search through the circular buffer to find matching hash. @@ -2867,6 +2877,7 @@ xd, plane, blk_col, blk_row, txw, txh, cm->width, cm->height, NULL, NULL); #endif // CONFIG_E191_OFS_PRED_RES_HANDLE + const int max_eob = av1_get_max_eob(tx_size); // Iterate through all transform type candidates. for (int idx = 0; idx < TX_TYPES; ++idx) { #if CONFIG_TX_TYPE_FLEX_IMPROVE @@ -3257,14 +3268,17 @@ } #endif // COLLECT_TX_SIZE_DATA - // If the current best RD cost is much worse than the reference RD cost, - // terminate early. - if (cpi->sf.tx_sf.adaptive_txb_search_level) { - if ((best_rd - (best_rd >> cpi->sf.tx_sf.adaptive_txb_search_level)) > - ref_best_rd) { - skip_idx = true; - break; - } + assert(cpi->sf.tx_sf.adaptive_tx_type_search_idx < 6); + // Terminate the search early if the best rd is higher than the + // reference best rd and the number of coded coefficients is smaller + // than a threshold. 
+ const int search_level = + tx_type_prune_level[p->eobs[block] < max_eob / 8] + [cpi->sf.tx_sf.adaptive_tx_type_search_idx]; + if (search_level && + (best_rd - (best_rd >> search_level)) > ref_best_rd) { + skip_idx = true; + break; } // Terminate transform type search if the block has been quantized to @@ -3820,6 +3834,7 @@ const int txw = tx_size_wide[max_tx_size]; const int txh = tx_size_high[max_tx_size]; const int is_vert_rect = (txh > txw); + const int max_txw_txh = AOMMAX(txw, txh); assert(max_tx_size < TX_SIZES_ALL); TX_SIZE sub_txs[MAX_TX_PARTITIONS] = { 0 }; @@ -3982,12 +3997,13 @@ if (p->eobs[block] == 0) break; } - const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level; - if (search_level) { - if ((tmp_rd - (tmp_rd >> search_level)) > ref_best_rd) { - *is_cost_valid = 0; - break; - } + const int search_level = + tx_partition_prune_level[max_txw_txh == 64] + [cpi->sf.tx_sf + .adaptive_tx_partition_type_search_idx]; + if (search_level && (tmp_rd - (tmp_rd >> search_level)) > ref_best_rd) { + *is_cost_valid = 0; + break; } } } @@ -4061,8 +4077,11 @@ plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd, ftxs_mode, rd_info_node, &no_split); + assert(cpi->sf.tx_sf.adaptive_tx_partition_type_search_idx < 6); // Speed features for early termination. - const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level; + const int search_level = + tx_partition_prune_level[1][cpi->sf.tx_sf + .adaptive_tx_partition_type_search_idx]; if (search_level) { if ((no_split.rd - (no_split.rd >> (1 + search_level))) > ref_best_rd) { *is_cost_valid = 0;
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc index 257c873..16c7310 100644 --- a/test/av1_convolve_test.cc +++ b/test/av1_convolve_test.cc
@@ -116,6 +116,7 @@ sizes.insert(BlockSize(w / 2, h / 2)); } } + sizes.insert(BlockSize(24, 24)); std::vector<TestParam<T>> result; for (const BlockSize &block : sizes) { for (int bd : bit_depths) { @@ -142,7 +143,7 @@ TEST_F(AV1ConvolveParametersTest, GetHighbdTestParams) { auto v = GetHighbdTestParams(av1_highbd_convolve_x_sr_c); #if CONFIG_EXT_RECUR_PARTITIONS - ASSERT_EQ(80U, v.size()); + ASSERT_EQ(82U, v.size()); #else ASSERT_EQ(60U, v.size()); #endif // CONFIG_EXT_RECUR_PARTITIONS
diff --git a/test/bawp_test.cc b/test/bawp_test.cc new file mode 100644 index 0000000..26cd971 --- /dev/null +++ b/test/bawp_test.cc
@@ -0,0 +1,147 @@ +/* + * Copyright (c) 2025, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 3-Clause Clear License + * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear + * License was not distributed with this source code in the LICENSE file, you + * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the + * Alliance for Open Media Patent License 1.0 was not distributed with this + * source code in the PATENTS file, you can obtain it at + * aomedia.org/license/patent-license/. + */ + +#include "config/av1_rtcd.h" + +#include "test/acm_random.h" +#include "test/util.h" + +#if CONFIG_BAWP +namespace { +typedef void (*make_bawp_func)(uint16_t *dst, int dst_stride, int16_t alpha, + int32_t beta, int shift, int bw, int bh, int bd); +#if HAVE_AVX2 +const BLOCK_SIZE kValidBlockSize[] = { + BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, + BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32, + BLOCK_32X16, BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, + BLOCK_64X64, BLOCK_64X128, BLOCK_128X64, BLOCK_128X128, +#if CONFIG_EXT_RECUR_PARTITIONS + BLOCK_128X256, BLOCK_256X128, BLOCK_256X256, +#endif // CONFIG_EXT_RECUR_PARTITIONS + BLOCK_4X16, BLOCK_16X4, BLOCK_8X32, BLOCK_32X8, + BLOCK_16X64, BLOCK_64X16, +#if CONFIG_EXT_RECUR_PARTITIONS + BLOCK_4X32, BLOCK_32X4, BLOCK_8X64, BLOCK_64X8, + BLOCK_4X64, BLOCK_64X4, +#endif // CONFIG_EXT_RECUR_PARTITIONS +}; +#endif // HAVE_AVX2 + +typedef std::tuple<make_bawp_func, BLOCK_SIZE> BAWPParam; + +class BAWPTest : public ::testing::TestWithParam<BAWPParam> { + public: + ~BAWPTest(); + void SetUp(); + + void TearDown(); + + protected: + void RunCheckOutput(make_bawp_func test_impl, BLOCK_SIZE bsize); + void RunSpeedTest(make_bawp_func test_impl, BLOCK_SIZE bsize); + bool CheckResult(int width, int height) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + const int idx = y * width + x; + if (pred1_[idx] != pred2_[idx]) { 
+ printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, x, y); + printf("%d != %d ", pred1_[idx], pred2_[idx]); + return false; + } + } + } + return true; + } + + libaom_test::ACMRandom rnd_; + uint16_t *pred1_; + uint16_t *pred2_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BAWPTest); + +BAWPTest::~BAWPTest() {} + +void BAWPTest::SetUp() { + rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); + + pred1_ = (uint16_t *)aom_memalign( + 16, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(uint16_t)); + ASSERT_NE(pred1_, nullptr); + pred2_ = (uint16_t *)aom_memalign( + 16, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(uint16_t)); + ASSERT_NE(pred2_, nullptr); + for (int i = 0; i < (MAX_SB_SIZE * MAX_SB_SIZE); ++i) { + pred1_[i] = rnd_.Rand16(); + pred2_[i] = pred1_[i]; + } +} + +void BAWPTest::TearDown() { + aom_free(pred1_); + aom_free(pred2_); +} + +void BAWPTest::RunCheckOutput(make_bawp_func test_impl, BLOCK_SIZE bsize) { + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int16_t alpha = 320; + const int32_t beta = -42036; + const int shift = 8; + int bd[3] = { 8, 10, 12 }; + for (int i = 0; i < 3; ++i) { + av1_make_bawp_block_c(pred1_, MAX_SB_SIZE, alpha, beta, shift, w, h, bd[i]); + test_impl(pred2_, MAX_SB_SIZE, alpha, beta, shift, w, h, bd[i]); + + ASSERT_EQ(CheckResult(w, h), true); + } +} + +void BAWPTest::RunSpeedTest(make_bawp_func test_impl, BLOCK_SIZE bsize) { + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int num_loops = 1000000000 / (w + h); + const int16_t alpha = 320; + const int32_t beta = -42036; + const int shift = 8; + int bd = 8; + + make_bawp_func functions[2] = { av1_make_bawp_block_c, test_impl }; + double elapsed_time[2] = { 0.0 }; + for (int i = 0; i < 2; ++i) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + make_bawp_func func = functions[i]; + for (int j = 0; j < num_loops; ++j) { + func(pred1_, MAX_SB_SIZE, alpha, beta, shift, w, h, bd); + } + 
aom_usec_timer_mark(&timer); + const double time = static_cast<double>(aom_usec_timer_elapsed(&timer)); + elapsed_time[i] = 1000.0 * time; + } + printf("BAWP %3dx%-3d: c_time=%7.2fs, simd_time=%7.2fs, scaling=%3.2f\n", w, + h, elapsed_time[0], elapsed_time[1], + elapsed_time[0] / elapsed_time[1]); +} + +TEST_P(BAWPTest, CheckOutput) { RunCheckOutput(GET_PARAM(0), GET_PARAM(1)); } + +TEST_P(BAWPTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0), GET_PARAM(1)); } + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, BAWPTest, + ::testing::Combine(::testing::Values(&av1_make_bawp_block_avx2), + ::testing::ValuesIn(kValidBlockSize))); +#endif // HAVE_AVX2 +} // namespace +#endif // CONFIG_BAWP
diff --git a/test/test.cmake b/test/test.cmake index 33fb16d..c5fe3ac 100644 --- a/test/test.cmake +++ b/test/test.cmake
@@ -90,6 +90,7 @@ APPEND AOM_UNIT_TEST_COMMON_SOURCES "${AOM_ROOT}/test/av1_common_int_test.cc" + "${AOM_ROOT}/test/bawp_test.cc" "${AOM_ROOT}/test/cdef_test.cc" "${AOM_ROOT}/test/cfl_test.cc" "${AOM_ROOT}/test/convolve_test.cc"
diff --git a/test/warp_filter_test.cc b/test/warp_filter_test.cc index 3cdd921..03497e3 100644 --- a/test/warp_filter_test.cc +++ b/test/warp_filter_test.cc
@@ -60,11 +60,11 @@ #endif // CONFIG_EXT_WARP_FILTER #if HAVE_AVX2 -#if !CONFIG_OPFL_MEMBW_REDUCTION +#if CONFIG_OPFL_MEMBW_REDUCTION INSTANTIATE_TEST_SUITE_P( AVX2, AV1HighbdWarpFilterTest, libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_avx2)); -#endif // !CONFIG_OPFL_MEMBW_REDUCTION +#endif // CONFIG_OPFL_MEMBW_REDUCTION #endif // HAVE_AVX2 } // namespace
diff --git a/test/warp_filter_test_util.cc b/test/warp_filter_test_util.cc index eac44b6..712d9a4 100644 --- a/test/warp_filter_test_util.cc +++ b/test/warp_filter_test_util.cc
@@ -18,8 +18,8 @@ namespace libaom_test { int32_t random_warped_param(libaom_test::ACMRandom *rnd, int bits) { - // 1 in 8 chance of generating zero (arbitrarily chosen) - if (((rnd->Rand8()) & 7) == 0) return 0; + // 1 in 32 chance of generating zero (arbitrarily chosen) + if (((rnd->Rand8()) & 0x1f) == 0) return 0; // Otherwise, enerate uniform values in the range // [-(1 << bits), 1] U [1, 1<<bits] int32_t v = 1 + (rnd->Rand16() & ((1 << bits) - 1)); @@ -40,6 +40,8 @@ (1 << WARPEDMODEL_PREC_BITS); mat[3] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3); + if (is_alpha_zero == 1) mat[2] = 1 << WARPEDMODEL_PREC_BITS; + if (is_beta_zero == 1) mat[3] = 0; if (rnd8 <= 1) { // AFFINE mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3); @@ -52,14 +54,12 @@ mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3); mat[5] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) + (1 << WARPEDMODEL_PREC_BITS); - if (is_alpha_zero == 1) mat[2] = 1 << WARPEDMODEL_PREC_BITS; - if (is_beta_zero == 1) mat[3] = 0; - if (is_gamma_zero == 1) mat[4] = 0; - if (is_delta_zero == 1) - mat[5] = static_cast<int32_t>( - ((static_cast<int64_t>(mat[3]) * mat[4] + (mat[2] / 2)) / mat[2]) + - (1 << WARPEDMODEL_PREC_BITS)); } + if (is_gamma_zero == 1) mat[4] = 0; + if (is_delta_zero == 1) + mat[5] = static_cast<int32_t>( + ((static_cast<int64_t>(mat[3]) * mat[4] + (mat[2] / 2)) / mat[2]) + + (1 << WARPEDMODEL_PREC_BITS)); // Calculate the derived parameters and check that they are suitable // for the warp filter. 
@@ -98,6 +98,31 @@ return; } } +#if CONFIG_OPFL_MEMBW_REDUCTION +void generate_ref_area_limits(libaom_test::ACMRandom *rnd, + ReferenceArea *ref_area, int w, int h, int out_w, + int out_h, int p_row, int p_col, int *mat, + int use_ref_area_pad) { + int left_limit; + if (use_ref_area_pad) { + const int32_t src_x = (p_col + 4); + const int32_t src_y = (p_row + 4); + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int32_t ix4 = (int32_t)(dst_x >> WARPEDMODEL_PREC_BITS); + left_limit = ix4 - 7 + 3; + } else { + left_limit = rnd->Rand8() % (w - 1); + } + ref_area->pad_block.x0 = left_limit; + ref_area->pad_block.x1 = ref_area->pad_block.x0 + out_w + 7; + ref_area->pad_block.y0 = rnd->Rand8() % (h - 1); + ref_area->pad_block.y1 = ref_area->pad_block.y0 + out_h + 7; + + ref_area->pad_block.x1 = CLIP(ref_area->pad_block.x1, 1, w); + ref_area->pad_block.y1 = CLIP(ref_area->pad_block.y1, 1, h); +} +#endif namespace AV1HighbdWarpFilter { ::testing::internal::ParamGenerator<HighbdWarpTestParams> BuildParams( @@ -136,7 +161,7 @@ const int out_w = std::get<0>(param), out_h = std::get<1>(param); const int bd = std::get<3>(param); const int mask = (1 << bd) - 1; - int sub_x, sub_y; + int sub_x, sub_y, p_row, p_col; // The warp functions always write rows with widths that are multiples of 8. // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8. 
@@ -164,6 +189,8 @@ sub_x = 0; sub_y = 0; + p_row = 32; + p_col = 32; int do_average = 0; conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd); @@ -172,8 +199,8 @@ aom_usec_timer_start(&timer); for (int i = 0; i < num_loops; ++i) - test_impl(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w, - sub_x, sub_y, bd, &conv_params, alpha, beta, gamma, delta + test_impl(mat, input, w, h, stride, output, p_col, p_row, out_w, out_h, + out_w, sub_x, sub_y, bd, &conv_params, alpha, beta, gamma, delta #if CONFIG_OPFL_MEMBW_REDUCTION , 0, NULL @@ -181,9 +208,27 @@ ); aom_usec_timer_mark(&timer); - const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("highbd warp %3dx%-3d: %7.2f ns\n", out_w, out_h, - 1000.0 * elapsed_time / num_loops); + 1000.0 * elapsed_time1 / num_loops); + +#if CONFIG_OPFL_MEMBW_REDUCTION + ReferenceArea ref_area; + generate_ref_area_limits(&rnd_, &ref_area, w, h, out_w, out_h, p_row, p_col, + mat, 1); + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + test_impl(mat, input, w, h, stride, output, p_col, p_row, out_w, out_h, + out_w, sub_x, sub_y, bd, &conv_params, alpha, beta, gamma, delta, + 1, &ref_area); + + aom_usec_timer_mark(&timer); + const int elapsed_time2 = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("highbd warp using ref area padding %3dx%-3d: %7.2f ns\n", out_w, + out_h, 1000.0 * elapsed_time2 / num_loops); +#endif delete[] input_; delete[] output; @@ -193,6 +238,7 @@ void AV1HighbdWarpFilterTest::RunCheckOutput( highbd_warp_affine_func test_impl) { const int w = 128, h = 128; + const int p_row = 32, p_col = 32; const int border = 16; const int stride = w + 2 * border; HighbdWarpTestParam param = GET_PARAM(0); @@ -218,6 +264,9 @@ ConvolveParams conv_params = get_conv_params(0, 0, bd); CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n]; CONV_BUF_TYPE *dstb = new 
CONV_BUF_TYPE[output_n]; +#if CONFIG_OPFL_MEMBW_REDUCTION + ReferenceArea ref_area; +#endif for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand16(); for (i = 0; i < num_iters; ++i) { @@ -230,6 +279,11 @@ input[r * stride + w + c] = input[r * stride + (w - 1)]; } } +#if CONFIG_OPFL_MEMBW_REDUCTION + int use_damr_padding = i % 2 == 0; + generate_ref_area_limits(&rnd_, &ref_area, w, h, out_w, out_h, p_row, p_col, + NULL, 0); +#endif const int use_no_round = rnd_.Rand8() & 1; for (sub_x = 0; sub_x < 2; ++sub_x) for (sub_y = 0; sub_y < 2; ++sub_y) { @@ -251,12 +305,13 @@ conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; } - av1_highbd_warp_affine_c(mat, input, w, h, stride, output, 32, 32, - out_w, out_h, out_w, sub_x, sub_y, bd, - &conv_params, alpha, beta, gamma, delta + av1_highbd_warp_affine_c(mat, input, w, h, stride, output, p_col, + p_row, out_w, out_h, out_w, sub_x, sub_y, + bd, &conv_params, alpha, beta, gamma, + delta #if CONFIG_OPFL_MEMBW_REDUCTION , - 0, NULL + use_damr_padding, &ref_area #endif // CONFIG_OPFL_MEMBW_REDUCTION ); if (use_no_round) { @@ -270,12 +325,12 @@ conv_params.fwd_offset = quant_dist_lookup_table[jj][ii]; conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; } - test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h, - out_w, sub_x, sub_y, bd, &conv_params, alpha, beta, - gamma, delta + test_impl(mat, input, w, h, stride, output2, p_col, p_row, out_w, + out_h, out_w, sub_x, sub_y, bd, &conv_params, alpha, + beta, gamma, delta #if CONFIG_OPFL_MEMBW_REDUCTION , - 0, NULL + use_damr_padding, &ref_area #endif // CONFIG_OPFL_MEMBW_REDUCTION );
diff --git a/test/warp_filter_test_util.h b/test/warp_filter_test_util.h index a9d77ed..ae1af23 100644 --- a/test/warp_filter_test_util.h +++ b/test/warp_filter_test_util.h
@@ -34,6 +34,12 @@ int16_t *alpha, int16_t *beta, int16_t *gamma, int16_t *delta, int is_alpha_zero, int is_beta_zero, int is_gamma_zero, int is_delta_zero); +#if CONFIG_OPFL_MEMBW_REDUCTION +void generate_ref_area_limits(libaom_test::ACMRandom *rnd, + ReferenceArea *ref_area, int w, int h, int out_w, + int out_h, int p_row, int p_col, int *mat, + int use_ref_area_pad); +#endif namespace AV1WarpFilter {