Remove convolve_round/compound_round config flags

Merged the convolve_round experiment and removed its config flag from
the code. Removed the compound_round code.

Change-Id: Ic01856732d75cca65d3866383d3cc1dd572f8863
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index 1703ba0..249e1dc 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -735,9 +735,7 @@ # # Alpha blending with mask # - if (aom_config("CONFIG_CONVOLVE_ROUND") eq "yes") { - add_proto qw/void aom_blend_a64_d32_mask/, "int32_t *dst, uint32_t dst_stride, const int32_t *src0, uint32_t src0_stride, const int32_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx"; - } + add_proto qw/void aom_blend_a64_d32_mask/, "int32_t *dst, uint32_t dst_stride, const int32_t *src0, uint32_t src0_stride, const int32_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx"; add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx"; add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w"; add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
diff --git a/aom_dsp/blend_a64_mask.c b/aom_dsp/blend_a64_mask.c index 5cd3dac..384e81b 100644 --- a/aom_dsp/blend_a64_mask.c +++ b/aom_dsp/blend_a64_mask.c
@@ -18,7 +18,6 @@ #include "./aom_dsp_rtcd.h" -#if CONFIG_CONVOLVE_ROUND // Blending with alpha mask. Mask values come from the range [0, 64], // as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can // be the same as dst, or dst can be different from both sources. @@ -79,7 +78,6 @@ } } } -#endif // CONFIG_CONVOLVE_ROUND // Blending with alpha mask. Mask values come from the range [0, 64], // as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
diff --git a/av1/av1.cmake b/av1/av1.cmake index 74990ab..d3aadb4 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake
@@ -300,53 +300,49 @@ "${AOM_ROOT}/av1/common/clpf_neon.c") endif () -if (CONFIG_CONVOLVE_ROUND) - set(AOM_AV1_COMMON_INTRIN_SSE2 - ${AOM_AV1_COMMON_INTRIN_SSE2} - "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c") +set(AOM_AV1_COMMON_INTRIN_SSE2 + ${AOM_AV1_COMMON_INTRIN_SSE2} + "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c") +set(AOM_AV1_COMMON_INTRIN_AVX2 + ${AOM_AV1_COMMON_INTRIN_AVX2} + "${AOM_ROOT}/av1/common/x86/convolve_2d_avx2.c") + +if (CONFIG_HIGHBITDEPTH) set(AOM_AV1_COMMON_INTRIN_AVX2 ${AOM_AV1_COMMON_INTRIN_AVX2} - "${AOM_ROOT}/av1/common/x86/convolve_2d_avx2.c") + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c") - if (CONFIG_HIGHBITDEPTH) - set(AOM_AV1_COMMON_INTRIN_AVX2 - ${AOM_AV1_COMMON_INTRIN_AVX2} - "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c") + set(AOM_AV1_COMMON_INTRIN_SSSE3 + ${AOM_AV1_COMMON_INTRIN_SSSE3} + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c") +endif () - set(AOM_AV1_COMMON_INTRIN_SSSE3 - ${AOM_AV1_COMMON_INTRIN_SSSE3} - "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c") - endif () +if (CONFIG_JNT_COMP) + set(AOM_AV1_COMMON_INTRIN_SSE4_1 + ${AOM_AV1_COMMON_INTRIN_SSE4_1} + "${AOM_ROOT}/av1/common/x86/convolve_2d_sse4.c") +endif () - if (CONFIG_JNT_COMP) - set(AOM_AV1_COMMON_INTRIN_SSE4_1 - ${AOM_AV1_COMMON_INTRIN_SSE4_1} - "${AOM_ROOT}/av1/common/x86/convolve_2d_sse4.c") - endif () - - if(NOT CONFIG_COMPOUND_ROUND) - set(AOM_AV1_COMMON_INTRIN_SSE4_1 - ${AOM_AV1_COMMON_INTRIN_SSE4_1} - "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c") - endif() +set(AOM_AV1_COMMON_INTRIN_SSE4_1 + ${AOM_AV1_COMMON_INTRIN_SSE4_1} + "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c") set(AOM_AV1_COMMON_INTRIN_SSE2 ${AOM_AV1_COMMON_INTRIN_SSE2} "${AOM_ROOT}/av1/common/x86/convolve_sse2.c") - set(AOM_AV1_COMMON_INTRIN_AVX2 - ${AOM_AV1_COMMON_INTRIN_AVX2} - "${AOM_ROOT}/av1/common/x86/convolve_avx2.c") -endif () +set(AOM_AV1_COMMON_INTRIN_AVX2 + ${AOM_AV1_COMMON_INTRIN_AVX2} + 
"${AOM_ROOT}/av1/common/x86/convolve_avx2.c") - set(AOM_AV1_ENCODER_SOURCES - ${AOM_AV1_ENCODER_SOURCES} - "${AOM_ROOT}/av1/encoder/wedge_utils.c") +set(AOM_AV1_ENCODER_SOURCES + ${AOM_AV1_ENCODER_SOURCES} + "${AOM_ROOT}/av1/encoder/wedge_utils.c") - set(AOM_AV1_ENCODER_INTRIN_SSE2 - ${AOM_AV1_ENCODER_INTRIN_SSE2} - "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c") +set(AOM_AV1_ENCODER_INTRIN_SSE2 + ${AOM_AV1_ENCODER_INTRIN_SSE2} + "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c") if (CONFIG_ACCOUNTING) set(AOM_AV1_DECODER_SOURCES
diff --git a/av1/av1_common.mk b/av1/av1_common.mk index cbff82d..bc3afa1 100644 --- a/av1/av1_common.mk +++ b/av1/av1_common.mk
@@ -79,9 +79,7 @@ AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d_cfg.h AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/convolve_avx2.c AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/av1_convolve_ssse3.c -ifeq ($(CONFIG_CONVOLVE_ROUND)x$(CONFIG_COMPOUND_ROUND),yesx) AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_convolve_scale_sse4.c -endif ifeq ($(CONFIG_HIGHBITDEPTH),yes) AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_sse4.c endif @@ -161,7 +159,6 @@ endif endif -ifeq ($(CONFIG_CONVOLVE_ROUND),yes) AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/convolve_sse2.c AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/convolve_2d_sse2.c AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/convolve_2d_sse4.c @@ -170,7 +167,6 @@ AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/highbd_convolve_2d_ssse3.c AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/highbd_convolve_2d_avx2.c endif -endif ifeq ($(CONFIG_LV_MAP),yes) AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/txb_sse2.c
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 3676d8f..f76f79a 100755 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -575,48 +575,37 @@ } # CONVOLVE_ROUND/COMPOUND_ROUND functions +add_proto qw/void av1_convolve_2d/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; +specialize qw/av1_convolve_2d sse2 avx2/; +add_proto qw/void av1_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits"; +specialize qw/av1_convolve_rounding avx2/; -if (aom_config("CONFIG_CONVOLVE_ROUND") eq "yes") { - add_proto qw/void av1_convolve_2d/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; - specialize qw/av1_convolve_2d sse2 avx2/; - add_proto qw/void av1_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits"; - specialize qw/av1_convolve_rounding avx2/; +add_proto qw/void av1_convolve_2d_copy/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; +specialize qw/av1_convolve_2d_copy sse2/; +add_proto qw/void av1_convolve_x/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; +specialize qw/av1_convolve_x sse2/; +add_proto qw/void av1_convolve_y/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, 
ConvolveParams *conv_params"; +specialize qw/av1_convolve_y sse2/; - if (aom_config("CONFIG_COMPOUND_ROUND") ne "yes") { - add_proto qw/void av1_convolve_2d_copy/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; - specialize qw/av1_convolve_2d_copy sse2/; - add_proto qw/void av1_convolve_x/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; - specialize qw/av1_convolve_x sse2/; - add_proto qw/void av1_convolve_y/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; - specialize qw/av1_convolve_y sse2/; - } +add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params"; +specialize qw/av1_convolve_2d_scale sse4_1/; - add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params"; - if (aom_config("CONFIG_COMPOUND_ROUND") ne "yes") { - specialize qw/av1_convolve_2d_scale sse4_1/; - } +if (aom_config("CONFIG_JNT_COMP") eq "yes") { + add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, 
CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; + specialize qw/av1_jnt_convolve_2d sse4_1/; - if (aom_config("CONFIG_JNT_COMP") eq "yes") { - add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; - specialize qw/av1_jnt_convolve_2d sse4_1/; + add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; + specialize qw/av1_jnt_convolve_2d_copy sse2/; +} - if (aom_config("CONFIG_COMPOUND_ROUND") ne "yes") { - add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; - specialize qw/av1_jnt_convolve_2d_copy sse2/; - } - } +if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + add_proto qw/void av1_highbd_convolve_2d/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; + specialize qw/av1_highbd_convolve_2d ssse3 avx2/; + add_proto qw/void av1_highbd_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd"; + specialize qw/av1_highbd_convolve_rounding avx2/; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - 
add_proto qw/void av1_highbd_convolve_2d/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; - specialize qw/av1_highbd_convolve_2d ssse3 avx2/; - add_proto qw/void av1_highbd_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd"; - specialize qw/av1_highbd_convolve_rounding avx2/; - - add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd"; - if (aom_config("CONFIG_COMPOUND_ROUND") ne "yes") { - specialize qw/av1_highbd_convolve_2d_scale sse4_1/; - } - } + add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd"; + specialize qw/av1_highbd_convolve_2d_scale sse4_1/; } # INTRA_EDGE functions
diff --git a/av1/common/convolve.c b/av1/common/convolve.c index 74b7085..d0b747e 100644 --- a/av1/common/convolve.c +++ b/av1/common/convolve.c
@@ -357,7 +357,6 @@ } } -#if CONFIG_CONVOLVE_ROUND void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits) { for (int r = 0; r < h; ++r) { @@ -368,190 +367,6 @@ } } -#if CONFIG_COMPOUND_ROUND -void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, - int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - uint8_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; - int im_h = h + filter_params_y->taps - 1; - int im_stride = w; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - - // horizontal filter - const uint8_t *src_horiz = src - fo_vert * src_stride; - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - for (int y = 0; y < im_h; ++y) { - for (int x = 0; x < w; ++x) { - int32_t sum = 0; - for (int k = 0; k < filter_params_x->taps; ++k) { - sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; - } - im_block[y * im_stride + x] = - clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0)); - } - } - - // vertical filter - uint8_t *src_vert = im_block + fo_vert * im_stride; - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); - for (int y = 0; y < h; ++y) { - for (int x = 0; x < w; ++x) { - CONV_BUF_TYPE sum = 0; - for (int k = 0; k < filter_params_y->taps; ++k) { - sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; - } - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else - dst[y * dst_stride + x] = res; - } - } -} - -#if CONFIG_JNT_COMP -void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, int 
h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - uint8_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; - int im_h = h + filter_params_y->taps - 1; - int im_stride = w; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - - // horizontal filter - const uint8_t *src_horiz = src - fo_vert * src_stride; - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - for (int y = 0; y < im_h; ++y) { - for (int x = 0; x < w; ++x) { - int32_t sum = 0; - for (int k = 0; k < filter_params_x->taps; ++k) { - sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; - } - im_block[y * im_stride + x] = - clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0)); - } - } - - // vertical filter - uint8_t *src_vert = im_block + fo_vert * im_stride; - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); - for (int y = 0; y < h; ++y) { - for (int x = 0; x < w; ++x) { - CONV_BUF_TYPE sum = 0; - for (int k = 0; k < filter_params_y->taps; ++k) { - sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; - } - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); - if (conv_params->use_jnt_comp_avg) { - if (conv_params->do_average == 0) { - dst[y * dst_stride + x] = res * conv_params->fwd_offset; - } else { - dst[y * dst_stride + x] += res * conv_params->bck_offset; - - dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x], - DIST_PRECISION_BITS - 1); - } - } else { - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else - dst[y * dst_stride + x] = res; - } - } - } -} -#endif // CONFIG_JNT_COMP - -void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, int h, - InterpFilterParams 
*filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_qn, const int x_step_qn, - const int subpel_y_qn, const int y_step_qn, - ConvolveParams *conv_params) { - uint8_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; - int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + - filter_params_y->taps; - int im_stride = w; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - - // horizontal filter - const uint8_t *src_horiz = src - fo_vert * src_stride; - for (int y = 0; y < im_h; ++y) { - int x_qn = subpel_x_qn; - for (int x = 0; x < w; ++x, x_qn += x_step_qn) { - const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; - const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(x_filter_idx < SUBPEL_SHIFTS); - const int16_t *x_filter = - av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx); - int sum = 0; - for (int k = 0; k < filter_params_x->taps; ++k) - sum += x_filter[k] * src_x[k - fo_horiz]; - im_block[y * im_stride + x] = - clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0)); - } - src_horiz += src_stride; - } - - // vertical filter - const uint8_t *src_vert = im_block + fo_vert * im_stride; - for (int x = 0; x < w; ++x) { - int y_qn = subpel_y_qn; - for (int y = 0; y < h; ++y, y_qn += y_step_qn) { - const uint8_t *const src_y = - &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; - const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(y_filter_idx < SUBPEL_SHIFTS); - const int16_t *y_filter = - av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx); - CONV_BUF_TYPE sum = 0; - for (int k = 0; k < filter_params_y->taps; ++k) { - sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; - } - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); -#if CONFIG_JNT_COMP - if (conv_params->use_jnt_comp_avg) { - if (conv_params->do_average == 0) { - dst[y 
* dst_stride + x] = res * conv_params->fwd_offset; - } else { - dst[y * dst_stride + x] += res * conv_params->bck_offset; - - dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x], - DIST_PRECISION_BITS - 1); - } - } else { - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else - dst[y * dst_stride + x] = res; - } -#else - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else - dst[y * dst_stride + x] = res; -#endif // CONFIG_JNT_COMP - } - src_vert++; - } -} - -#else - /* When convolve-round is enabled and compound-round is disabled, we use a high-precision convolve filter. Note: For notes on hardware implementations, including the required @@ -877,7 +692,6 @@ src_vert++; } } -#endif // CONFIG_COMPOUND_ROUND void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, @@ -947,12 +761,6 @@ &filter_params_y, subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params); } else { -#if CONFIG_COMPOUND_ROUND - av1_jnt_convolve_2d(src, src_stride, conv_params->dst, - conv_params->dst_stride, w, h, &filter_params_x, - &filter_params_y, subpel_x_q4, subpel_y_q4, - conv_params); -#else if (subpel_x_q4 == 0 && subpel_y_q4 == 0) { av1_jnt_convolve_2d_copy(src, src_stride, conv_params->dst, conv_params->dst_stride, w, h, @@ -976,7 +784,6 @@ &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params); } -#endif // CONFIG_COMPOUND_ROUND } #else if (scaled) { @@ -985,11 +792,6 @@ &filter_params_y, subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params); } else { -#if CONFIG_COMPOUND_ROUND - av1_convolve_2d(src, src_stride, conv_params->dst, - conv_params->dst_stride, w, h, &filter_params_x, - &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params); -#else // Special case convolve functions should produce the same result as // av1_convolve_2d. 
if (subpel_x_q4 == 0 && subpel_y_q4 == 0) { @@ -1012,7 +814,6 @@ &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params); } -#endif // CONFIG_COMPOUND_ROUND } #endif // CONFIG_JNT_COMP } @@ -1031,114 +832,6 @@ } } -#if CONFIG_COMPOUND_ROUND -void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { - uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; - int im_h = h + filter_params_y->taps - 1; - int im_stride = w; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - - // horizontal filter - const uint16_t *src_horiz = src - fo_vert * src_stride; - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - for (int y = 0; y < im_h; ++y) { - for (int x = 0; x < w; ++x) { - int32_t sum = 0; - for (int k = 0; k < filter_params_x->taps; ++k) { - sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; - } - im_block[y * im_stride + x] = - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, conv_params->round_0), bd); - } - } - - // vertical filter - uint16_t *src_vert = im_block + fo_vert * im_stride; - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); - for (int y = 0; y < h; ++y) { - for (int x = 0; x < w; ++x) { - CONV_BUF_TYPE sum = 0; - for (int k = 0; k < filter_params_y->taps; ++k) { - sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; - } - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else - dst[y * dst_stride + x] = res; - } - } -} - -void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int 
w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_qn, const int x_step_qn, - const int subpel_y_qn, const int y_step_qn, - ConvolveParams *conv_params, int bd) { - uint16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; - int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + - filter_params_y->taps; - int im_stride = w; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - (void)bd; - - // horizontal filter - const uint16_t *src_horiz = src - fo_vert * src_stride; - for (int y = 0; y < im_h; ++y) { - int x_qn = subpel_x_qn; - for (int x = 0; x < w; ++x, x_qn += x_step_qn) { - const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; - const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(x_filter_idx < SUBPEL_SHIFTS); - const int16_t *x_filter = - av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx); - int sum = 0; - for (int k = 0; k < filter_params_x->taps; ++k) - sum += x_filter[k] * src_x[k - fo_horiz]; - im_block[y * im_stride + x] = - clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0)); - } - src_horiz += src_stride; - } - - // vertical filter - uint16_t *src_vert = im_block + fo_vert * im_stride; - for (int x = 0; x < w; ++x) { - int y_qn = subpel_y_qn; - for (int y = 0; y < h; ++y, y_qn += y_step_qn) { - const uint16_t *const src_y = - &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; - const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(y_filter_idx < SUBPEL_SHIFTS); - const int16_t *y_filter = - av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx); - CONV_BUF_TYPE sum = 0; - for (int k = 0; k < filter_params_y->taps; ++k) { - sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; - } - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); - if (conv_params->do_average) - dst[y * dst_stride + 
x] += res; - else - dst[y * dst_stride + x] = res; - } - src_vert++; - } -} - -#else - void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, @@ -1253,7 +946,6 @@ src_vert++; } } -#endif // CONFIG_COMPOUND_ROUND void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, uint8_t *dst, int dst_stride, int w, int h, @@ -1318,8 +1010,6 @@ } #endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_CONVOLVE_ROUND - typedef void (*ConvolveFunc)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams filter_params,
diff --git a/av1/common/convolve.h b/av1/common/convolve.h index 607532b..8803ffa 100644 --- a/av1/common/convolve.h +++ b/av1/common/convolve.h
@@ -99,7 +99,6 @@ struct AV1Common; void av1_convolve_init(struct AV1Common *cm); -#if CONFIG_CONVOLVE_ROUND void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilters interp_filters, const int subpel_x_q4, @@ -113,11 +112,7 @@ conv_params.ref = ref; conv_params.do_average = do_average; conv_params.round = CONVOLVE_OPT_NO_ROUND; -#if CONFIG_COMPOUND_ROUND - conv_params.round_0 = FILTER_BITS; -#else conv_params.round_0 = 5; -#endif conv_params.round_1 = 0; conv_params.dst = dst; conv_params.dst_stride = dst_stride; @@ -135,7 +130,6 @@ int scaled, ConvolveParams *conv_params, int bd); #endif -#endif // CONFIG_CONVOLVE_ROUND void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilters interp_filters,
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c index 1e3ccaa..c3737ef 100644 --- a/av1/common/reconinter.c +++ b/av1/common/reconinter.c
@@ -403,7 +403,6 @@ #elif COMPOUND_SEGMENT_TYPE == 1 #define DIFF_FACTOR 16 -#if CONFIG_CONVOLVE_ROUND static void diffwtd_mask_d32(uint8_t *mask, int which_inverse, int mask_base, const int32_t *src0, int src0_stride, const int32_t *src1, int src1_stride, @@ -441,7 +440,6 @@ default: assert(0); } } -#endif // CONFIG_CONVOLVE_ROUND static void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base, const uint8_t *src0, int src0_stride, @@ -691,7 +689,6 @@ init_wedge_masks(); } -#if CONFIG_CONVOLVE_ROUND static void build_masked_compound_no_round( CONV_BUF_TYPE *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, @@ -705,7 +702,7 @@ aom_blend_a64_d32_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, block_size_wide[sb_type], h, w, subh, subw); } -#endif // CONFIG_CONVOLVE_ROUND + static void build_masked_compound( uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, @@ -756,17 +753,12 @@ // a temporary buffer, then will blend that temporary buffer with that from // the other reference. // -// With CONFIG_CONVOLVE_ROUND, if the rounding mode is CONVOLVE_OPT_NO_ROUND +// If the rounding mode is CONVOLVE_OPT_NO_ROUND // then the predictions are at 32-bits, so we'll need 32 bits per // pixel. Otherwise, we'll need up to 16 bits per pixel if // CONFIG_HIGHBITDEPTH or just 8 otherwise. 
-#if CONFIG_CONVOLVE_ROUND #define INTER_PRED_BYTES_PER_PIXEL 4 -#elif CONFIG_HIGHBITDEPTH -#define INTER_PRED_BYTES_PER_PIXEL 2 -#else -#define INTER_PRED_BYTES_PER_PIXEL 1 -#endif + DECLARE_ALIGNED(16, uint8_t, tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]); #undef INTER_PRED_BYTES_PER_PIXEL @@ -779,7 +771,6 @@ uint8_t *tmp_dst = tmp_buf; #endif -#if CONFIG_CONVOLVE_ROUND const int tmp_buf_stride = MAX_SB_SIZE; const int is_conv_no_round = conv_params->round == CONVOLVE_OPT_NO_ROUND; CONV_BUF_TYPE *org_dst = conv_params->dst; @@ -790,7 +781,6 @@ conv_params->dst_stride = tmp_buf_stride; assert(conv_params->do_average == 0); } -#endif // CONFIG_CONVOLVE_ROUND // This will generate a prediction in tmp_buf for the second reference av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_x, @@ -799,14 +789,12 @@ xd); if (!plane && comp_data.interinter_compound_type == COMPOUND_SEG) { -#if CONFIG_CONVOLVE_ROUND if (is_conv_no_round) { build_compound_seg_mask_d32(comp_data.seg_mask, comp_data.mask_type, org_dst, org_dst_stride, tmp_buf32, tmp_buf_stride, mi->mbmi.sb_type, h, w, conv_params, xd->bd); } else { -#endif // CONFIG_CONVOLVE_ROUND #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { build_compound_seg_mask_highbd(comp_data.seg_mask, comp_data.mask_type, @@ -820,12 +808,9 @@ #if CONFIG_HIGHBITDEPTH } #endif -#if CONFIG_CONVOLVE_ROUND } -#endif } -#if CONFIG_CONVOLVE_ROUND if (is_conv_no_round) { build_masked_compound_no_round(org_dst, org_dst_stride, org_dst, org_dst_stride, tmp_buf32, tmp_buf_stride, @@ -844,8 +829,6 @@ conv_params->do_post_rounding = 0; } else { -#endif // CONFIG_CONVOLVE_ROUND - #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) build_masked_compound_highbd(dst, dst_stride, dst, dst_stride, tmp_dst, @@ -855,9 +838,7 @@ #endif // CONFIG_HIGHBITDEPTH build_masked_compound(dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE, &comp_data, mi->mbmi.sb_type, h, w); -#if 
CONFIG_CONVOLVE_ROUND } -#endif // CONFIG_CONVOLVE_ROUND } // TODO(sarahparker) av1_highbd_build_inter_predictor and @@ -1038,17 +1019,11 @@ for (idx = 0; idx < b8_w; idx += b4_w) { MB_MODE_INFO *this_mbmi = &xd->mi[row * xd->mi_stride + col]->mbmi; is_compound = has_second_ref(this_mbmi); -#if CONFIG_CONVOLVE_ROUND DECLARE_ALIGNED(16, int32_t, tmp_dst[8 * 8]); int tmp_dst_stride = 8; assert(w <= 8 && h <= 8); -#endif // CONFIG_CONVOLVE_ROUND -#if CONFIG_CONVOLVE_ROUND ConvolveParams conv_params = get_conv_params_no_round(0, 0, plane, tmp_dst, tmp_dst_stride); -#else - ConvolveParams conv_params = get_conv_params(0, 0, plane); -#endif #if CONFIG_JNT_COMP conv_params.use_jnt_comp_avg = 0; #endif // CONFIG_JNT_COMP @@ -1153,7 +1128,6 @@ (mi_y >> pd->subsampling_y) + y, plane, ref, mi, build_for_obmc, xs, ys, xd); } // for (ref = 0; ref < 1 + is_compound; ++ref) -#if CONFIG_CONVOLVE_ROUND if (conv_params.do_post_rounding) { #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) @@ -1169,7 +1143,6 @@ FILTER_BITS * 2 + is_compound - conv_params.round_0 - conv_params.round_1); } -#endif // CONFIG_CONVOLVE_ROUND ++col; } ++row; @@ -1184,9 +1157,7 @@ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; uint8_t *pre[2]; SubpelParams subpel_params[2]; -#if CONFIG_CONVOLVE_ROUND DECLARE_ALIGNED(16, int32_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]); -#endif // CONFIG_CONVOLVE_ROUND for (ref = 0; ref < 1 + is_compound; ++ref) { #if CONFIG_INTRABC @@ -1251,7 +1222,6 @@ } } -#if CONFIG_CONVOLVE_ROUND ConvolveParams conv_params = get_conv_params_no_round(ref, ref, plane, tmp_dst, MAX_SB_SIZE); #if CONFIG_JNT_COMP @@ -1260,10 +1230,6 @@ &conv_params.use_jnt_comp_avg, is_compound); #endif // CONFIG_JNT_COMP -#else - ConvolveParams conv_params = get_conv_params(ref, ref, plane); -#endif // CONFIG_CONVOLVE_ROUND - for (ref = 0; ref < 1 + is_compound; ++ref) { #if CONFIG_INTRABC const struct scale_factors *const sf = @@ -1301,7 +1267,6 @@ subpel_params[ref].ys, 
xd); } -#if CONFIG_CONVOLVE_ROUND // TODO(angiebird): This part needs optimization if (conv_params.do_post_rounding) { #if CONFIG_HIGHBITDEPTH @@ -1317,7 +1282,6 @@ FILTER_BITS * 2 + is_compound - conv_params.round_0 - conv_params.round_1); } -#endif // CONFIG_CONVOLVE_ROUND } }
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h index 7c2883d..1790086 100644 --- a/av1/common/reconinter.h +++ b/av1/common/reconinter.h
@@ -55,14 +55,10 @@ // TODO(afergs, debargha): Use a different scale convolve function // that uses higher precision for subpel_x, subpel_y, xs, ys if (conv_params->round == CONVOLVE_OPT_NO_ROUND) { -#if CONFIG_CONVOLVE_ROUND av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, interp_filters, subpel_x, xs, subpel_y, ys, 1, conv_params); conv_params->do_post_rounding = 1; -#else - assert(0); -#endif // CONFIG_CONVOLVE_ROUND } else { assert(conv_params->round == CONVOLVE_OPT_ROUND); av1_convolve_scale(src, src_stride, dst, dst_stride, w, h, interp_filters, @@ -78,14 +74,10 @@ assert(xs <= SUBPEL_SHIFTS); assert(ys <= SUBPEL_SHIFTS); if (conv_params->round == CONVOLVE_OPT_NO_ROUND) { -#if CONFIG_CONVOLVE_ROUND av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, interp_filters, subpel_x, xs, subpel_y, ys, 0, conv_params); conv_params->do_post_rounding = 1; -#else - assert(0); -#endif // CONFIG_CONVOLVE_ROUND } else { assert(conv_params->round == CONVOLVE_OPT_ROUND); @@ -131,14 +123,10 @@ if (has_scale(xs, ys)) { if (conv_params->round == CONVOLVE_OPT_NO_ROUND) { -#if CONFIG_CONVOLVE_ROUND av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, interp_filters, subpel_x, xs, subpel_y, ys, 1, conv_params, bd); conv_params->do_post_rounding = 1; -#else - assert(0); -#endif // CONFIG_CONVOLVE_ROUND } else { av1_highbd_convolve_scale(src, src_stride, dst, dst_stride, w, h, interp_filters, subpel_x, xs, subpel_y, ys, avg, @@ -154,14 +142,10 @@ assert(xs <= SUBPEL_SHIFTS); assert(ys <= SUBPEL_SHIFTS); if (conv_params->round == CONVOLVE_OPT_NO_ROUND) { -#if CONFIG_CONVOLVE_ROUND av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, interp_filters, subpel_x, xs, subpel_y, ys, 0, conv_params, bd); conv_params->do_post_rounding = 1; -#else - assert(0); -#endif // CONFIG_CONVOLVE_ROUND } else { InterpFilterParams filter_params_x, filter_params_y; #if CONFIG_SHORT_FILTER
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c index 1a9c6dc..962f140 100644 --- a/av1/common/warped_motion.c +++ b/av1/common/warped_motion.c
@@ -427,7 +427,6 @@ int16_t beta, int16_t gamma, int16_t delta) { int32_t tmp[15 * 8]; int i, j, k, l, m; -#if CONFIG_CONVOLVE_ROUND const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; const int reduce_bits_horiz = use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; @@ -445,14 +444,6 @@ conv_params->do_post_rounding = 1; } assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS); -#else - const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS; - const int max_bits_horiz = - bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1; - const int offset_bits_vert = - bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS; -#endif (void)max_bits_horiz; for (i = p_row; i < p_row + p_height; i += 8) { @@ -524,7 +515,7 @@ for (m = 0; m < 8; ++m) { sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; } -#if CONFIG_CONVOLVE_ROUND + if (use_conv_params) { CONV_BUF_TYPE *p = &conv_params @@ -555,9 +546,6 @@ *p = sum; #endif // CONFIG_JNT_COMP } else { -#else - { -#endif uint16_t *p = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS); @@ -748,7 +736,6 @@ int32_t tmp[15 * 8]; int i, j, k, l, m; const int bd = 8; -#if CONFIG_CONVOLVE_ROUND const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; const int reduce_bits_horiz = use_conv_params ? 
conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; @@ -766,14 +753,6 @@ conv_params->do_post_rounding = 1; } assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS); -#else - const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS; - const int max_bits_horiz = - bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1; - const int offset_bits_vert = - bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS; -#endif (void)max_bits_horiz; for (i = p_row; i < p_row + p_height; i += 8) { @@ -851,7 +830,7 @@ for (m = 0; m < 8; ++m) { sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; } -#if CONFIG_CONVOLVE_ROUND + if (use_conv_params) { CONV_BUF_TYPE *p = &conv_params @@ -882,9 +861,6 @@ *p = sum; #endif // CONFIG_JNT_COMP } else { -#else - { -#endif uint8_t *p = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
diff --git a/av1/common/x86/convolve_2d_avx2.c b/av1/common/x86/convolve_2d_avx2.c index ff8ade8..3c1a24d 100644 --- a/av1/common/x86/convolve_2d_avx2.c +++ b/av1/common/x86/convolve_2d_avx2.c
@@ -17,239 +17,6 @@ #include "aom_dsp/aom_filter.h" #include "av1/common/convolve.h" -#if CONFIG_COMPOUND_ROUND -void av1_convolve_2d_avx2(const uint8_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - DECLARE_ALIGNED(32, uint8_t, - im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); - int im_h = h + filter_params_y->taps - 1; - int im_stride = MAX_SB_SIZE; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const int do_average = conv_params->do_average; - const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - - const __m256i zero = _mm256_setzero_si256(); - - /* Horizontal filter */ - { - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - const __m128i coeffs_x8 = _mm_loadu_si128((__m128i *)x_filter); - // since not all compilers yet support _mm256_set_m128i() - const __m256i coeffs_x = _mm256_insertf128_si256( - _mm256_castsi128_si256(coeffs_x8), coeffs_x8, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m256i tmp_0 = _mm256_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m256i tmp_1 = _mm256_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m256i coeff_01 = _mm256_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m256i coeff_23 = _mm256_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m256i coeff_45 = _mm256_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m256i coeff_67 = _mm256_unpackhi_epi64(tmp_1, tmp_1); - - const __m256i round_const = - _mm256_set1_epi32((1 << conv_params->round_0) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); - - for (i = 0; i < im_h; ++i) { - for (j = 0; j < w; j += 
16) { - const __m256i data = _mm256_permute4x64_epi64( - _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]), - _MM_SHUFFLE(2, 1, 1, 0)); - - // Filter even-index pixels - const __m256i src_0 = _mm256_unpacklo_epi8(data, zero); - const __m256i res_0 = _mm256_madd_epi16(src_0, coeff_01); - const __m256i src_2 = - _mm256_unpacklo_epi8(_mm256_srli_si256(data, 2), zero); - const __m256i res_2 = _mm256_madd_epi16(src_2, coeff_23); - const __m256i src_4 = - _mm256_unpacklo_epi8(_mm256_srli_si256(data, 4), zero); - const __m256i res_4 = _mm256_madd_epi16(src_4, coeff_45); - const __m256i src_6 = - _mm256_unpacklo_epi8(_mm256_srli_si256(data, 6), zero); - const __m256i res_6 = _mm256_madd_epi16(src_6, coeff_67); - - __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_4), - _mm256_add_epi32(res_2, res_6)); - res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const), - round_shift); - - // Filter odd-index pixels - const __m256i src_1 = - _mm256_unpacklo_epi8(_mm256_srli_si256(data, 1), zero); - const __m256i res_1 = _mm256_madd_epi16(src_1, coeff_01); - const __m256i src_3 = - _mm256_unpacklo_epi8(_mm256_srli_si256(data, 3), zero); - const __m256i res_3 = _mm256_madd_epi16(src_3, coeff_23); - const __m256i src_5 = - _mm256_unpacklo_epi8(_mm256_srli_si256(data, 5), zero); - const __m256i res_5 = _mm256_madd_epi16(src_5, coeff_45); - const __m256i src_7 = - _mm256_unpacklo_epi8(_mm256_srli_si256(data, 7), zero); - const __m256i res_7 = _mm256_madd_epi16(src_7, coeff_67); - - __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_5), - _mm256_add_epi32(res_3, res_7)); - res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const), - round_shift); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - __m256i res = _mm256_packs_epi32(res_even, res_odd); - res = _mm256_packus_epi16(res, res); - _mm_storel_epi64((__m128i *)&im_block[i * im_stride + j], - _mm256_extractf128_si256(res, 0)); - _mm_storel_epi64((__m128i *)&im_block[i * 
im_stride + j + 8], - _mm256_extractf128_si256(res, 1)); - } - } - } - - /* Vertical filter */ - { - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); - const __m128i coeffs_y8 = _mm_loadu_si128((__m128i *)y_filter); - const __m256i coeffs_y = _mm256_insertf128_si256( - _mm256_castsi128_si256(coeffs_y8), coeffs_y8, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m256i tmp_0 = _mm256_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m256i tmp_1 = _mm256_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m256i coeff_01 = _mm256_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m256i coeff_23 = _mm256_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m256i coeff_45 = _mm256_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m256i coeff_67 = _mm256_unpackhi_epi64(tmp_1, tmp_1); - - const __m256i round_const = - _mm256_set1_epi32((1 << conv_params->round_1) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - // Filter even-index pixels - const uint8_t *data = &im_block[i * im_stride + j]; - const __m256i src_01 = _mm256_unpacklo_epi8( - _mm256_inserti128_si256( - _mm256_castsi128_si256( - _mm_loadl_epi64((__m128i *)(data + 0 * im_stride))), - _mm_loadl_epi64((__m128i *)(data + 0 * im_stride + 8)), 1), - _mm256_inserti128_si256( - _mm256_castsi128_si256( - _mm_loadl_epi64((__m128i *)(data + 1 * im_stride))), - _mm_loadl_epi64((__m128i *)(data + 1 * im_stride + 8)), 1)); - const __m256i src_23 = _mm256_unpacklo_epi8( - _mm256_inserti128_si256( - _mm256_castsi128_si256( - _mm_loadl_epi64((__m128i *)(data + 2 * im_stride))), - _mm_loadl_epi64((__m128i *)(data + 2 * im_stride + 8)), 1), - _mm256_inserti128_si256( - _mm256_castsi128_si256( - _mm_loadl_epi64((__m128i *)(data + 3 * im_stride))), - _mm_loadl_epi64((__m128i *)(data + 3 
* im_stride + 8)), 1)); - const __m256i src_45 = _mm256_unpacklo_epi8( - _mm256_inserti128_si256( - _mm256_castsi128_si256( - _mm_loadl_epi64((__m128i *)(data + 4 * im_stride))), - _mm_loadl_epi64((__m128i *)(data + 4 * im_stride + 8)), 1), - _mm256_inserti128_si256( - _mm256_castsi128_si256( - _mm_loadl_epi64((__m128i *)(data + 5 * im_stride))), - _mm_loadl_epi64((__m128i *)(data + 5 * im_stride + 8)), 1)); - const __m256i src_67 = _mm256_unpacklo_epi8( - _mm256_inserti128_si256( - _mm256_castsi128_si256( - _mm_loadl_epi64((__m128i *)(data + 6 * im_stride))), - _mm_loadl_epi64((__m128i *)(data + 6 * im_stride + 8)), 1), - _mm256_inserti128_si256( - _mm256_castsi128_si256( - _mm_loadl_epi64((__m128i *)(data + 7 * im_stride))), - _mm_loadl_epi64((__m128i *)(data + 7 * im_stride + 8)), 1)); - - const __m256i src_0 = _mm256_unpacklo_epi8(src_01, zero); - const __m256i src_2 = _mm256_unpacklo_epi8(src_23, zero); - const __m256i src_4 = _mm256_unpacklo_epi8(src_45, zero); - const __m256i src_6 = _mm256_unpacklo_epi8(src_67, zero); - - const __m256i res_0 = _mm256_madd_epi16(src_0, coeff_01); - const __m256i res_2 = _mm256_madd_epi16(src_2, coeff_23); - const __m256i res_4 = _mm256_madd_epi16(src_4, coeff_45); - const __m256i res_6 = _mm256_madd_epi16(src_6, coeff_67); - - const __m256i res_even = _mm256_add_epi32( - _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m256i src_1 = _mm256_unpackhi_epi8(src_01, zero); - const __m256i src_3 = _mm256_unpackhi_epi8(src_23, zero); - const __m256i src_5 = _mm256_unpackhi_epi8(src_45, zero); - const __m256i src_7 = _mm256_unpackhi_epi8(src_67, zero); - - const __m256i res_1 = _mm256_madd_epi16(src_1, coeff_01); - const __m256i res_3 = _mm256_madd_epi16(src_3, coeff_23); - const __m256i res_5 = _mm256_madd_epi16(src_5, coeff_45); - const __m256i res_7 = _mm256_madd_epi16(src_7, coeff_67); - - const __m256i res_odd = _mm256_add_epi32( - _mm256_add_epi32(res_1, res_3), 
_mm256_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd); - const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd); - - const __m256i res_lo_round = _mm256_sra_epi32( - _mm256_add_epi32(res_lo, round_const), round_shift); - const __m256i res_hi_round = _mm256_sra_epi32( - _mm256_add_epi32(res_hi, round_const), round_shift); - - // Accumulate values into the destination buffer - __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - if (do_average) { - _mm_storeu_si128( - p + 0, _mm_add_epi32(_mm_loadu_si128(p + 0), - _mm256_extractf128_si256(res_lo_round, 0))); - _mm_storeu_si128( - p + 1, _mm_add_epi32(_mm_loadu_si128(p + 1), - _mm256_extractf128_si256(res_hi_round, 0))); - if (w - j > 8) { - _mm_storeu_si128(p + 2, _mm_add_epi32(_mm_loadu_si128(p + 2), - _mm256_extractf128_si256( - res_lo_round, 1))); - _mm_storeu_si128(p + 3, _mm_add_epi32(_mm_loadu_si128(p + 3), - _mm256_extractf128_si256( - res_hi_round, 1))); - } - } else { - _mm_storeu_si128(p + 0, _mm256_extractf128_si256(res_lo_round, 0)); - _mm_storeu_si128(p + 1, _mm256_extractf128_si256(res_hi_round, 0)); - if (w - j > 8) { - _mm_storeu_si128(p + 2, _mm256_extractf128_si256(res_lo_round, 1)); - _mm_storeu_si128(p + 3, _mm256_extractf128_si256(res_hi_round, 1)); - } - } - } - } - } -} -#else void av1_convolve_2d_avx2(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, @@ -459,4 +226,3 @@ } } } -#endif
diff --git a/av1/common/x86/convolve_2d_sse2.c b/av1/common/x86/convolve_2d_sse2.c index 13275b6..34b7dc7 100644 --- a/av1/common/x86/convolve_2d_sse2.c +++ b/av1/common/x86/convolve_2d_sse2.c
@@ -17,185 +17,6 @@ #include "aom_dsp/aom_filter.h" #include "av1/common/convolve.h" -#if CONFIG_COMPOUND_ROUND -void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - DECLARE_ALIGNED(16, uint8_t, - im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); - int im_h = h + filter_params_y->taps - 1; - int im_stride = MAX_SB_SIZE; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const int do_average = conv_params->do_average; - const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - - const __m128i zero = _mm_setzero_si128(); - - /* Horizontal filter */ - { - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << conv_params->round_0) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); - - for (i = 0; i < im_h; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - - // Filter even-index pixels - const __m128i src_0 = _mm_unpacklo_epi8(data, zero); - const 
__m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = - _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = - _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - __m128i res = _mm_packs_epi32(res_even, res_odd); - res = _mm_packus_epi16(res, res); - _mm_storel_epi64((__m128i *)&im_block[i * im_stride + j], res); - } - } - } - - /* Vertical filter */ - { - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); - const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - 
const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << conv_params->round_1) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const uint8_t *data = &im_block[i * im_stride + j]; - const __m128i src_01 = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 0 * im_stride)), - _mm_loadl_epi64((__m128i *)(data + 1 * im_stride))); - const __m128i src_23 = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 2 * im_stride)), - _mm_loadl_epi64((__m128i *)(data + 3 * im_stride))); - const __m128i src_45 = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 4 * im_stride)), - _mm_loadl_epi64((__m128i *)(data + 5 * im_stride))); - const __m128i src_67 = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 6 * im_stride)), - _mm_loadl_epi64((__m128i *)(data + 7 * im_stride))); - - const __m128i src_0 = _mm_unpacklo_epi8(src_01, zero); - const __m128i src_2 = _mm_unpacklo_epi8(src_23, zero); - const __m128i src_4 = _mm_unpacklo_epi8(src_45, zero); - const __m128i src_6 = _mm_unpacklo_epi8(src_67, zero); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpackhi_epi8(src_01, zero); - const __m128i src_3 = _mm_unpackhi_epi8(src_23, zero); - const __m128i src_5 = _mm_unpackhi_epi8(src_45, 
zero); - const __m128i src_7 = _mm_unpackhi_epi8(src_67, zero); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - const __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); - const __m128i res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); - - // Accumulate values into the destination buffer - __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - if (do_average) { - _mm_storeu_si128(p + 0, - _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round)); - _mm_storeu_si128(p + 1, - _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); - } else { - _mm_storeu_si128(p + 0, res_lo_round); - _mm_storeu_si128(p + 1, res_hi_round); - } - } - } - } -} -#else void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, @@ -698,4 +519,3 @@ } } #endif // CONFIG_JNT_COMP -#endif // CONFIG_COMPOUND_ROUND
diff --git a/av1/common/x86/convolve_2d_sse4.c b/av1/common/x86/convolve_2d_sse4.c index 71c32e7..ea0811a 100644 --- a/av1/common/x86/convolve_2d_sse4.c +++ b/av1/common/x86/convolve_2d_sse4.c
@@ -19,221 +19,6 @@ #include "av1/common/convolve.h" #if CONFIG_JNT_COMP -#if CONFIG_COMPOUND_ROUND -void av1_jnt_convolve_2d_sse4_1(const uint8_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - DECLARE_ALIGNED(16, uint8_t, - im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); - int im_h = h + filter_params_y->taps - 1; - int im_stride = MAX_SB_SIZE; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const int do_average = conv_params->do_average; - const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - - const __m128i zero = _mm_setzero_si128(); - - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set_epi32(w0, w0, w0, w0); - const __m128i wt1 = _mm_set_epi32(w1, w1, w1, w1); - const int jnt_round_const = 1 << (DIST_PRECISION_BITS - 2); - const __m128i jnt_r = _mm_set_epi32(jnt_round_const, jnt_round_const, - jnt_round_const, jnt_round_const); - - /* Horizontal filter */ - { - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - 
_mm_set1_epi32((1 << conv_params->round_0) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); - - for (i = 0; i < im_h; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - - // Filter even-index pixels - const __m128i src_0 = _mm_unpacklo_epi8(data, zero); - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = - _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = - _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - __m128i res = _mm_packs_epi32(res_even, res_odd); - res = _mm_packus_epi16(res, res); - _mm_storel_epi64((__m128i *)&im_block[i * im_stride + j], res); - } - } - } - - /* Vertical filter */ - { - const int16_t *y_filter = 
av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); - const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << conv_params->round_1) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const uint8_t *data = &im_block[i * im_stride + j]; - const __m128i src_01 = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 0 * im_stride)), - _mm_loadl_epi64((__m128i *)(data + 1 * im_stride))); - const __m128i src_23 = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 2 * im_stride)), - _mm_loadl_epi64((__m128i *)(data + 3 * im_stride))); - const __m128i src_45 = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 4 * im_stride)), - _mm_loadl_epi64((__m128i *)(data + 5 * im_stride))); - const __m128i src_67 = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 6 * im_stride)), - _mm_loadl_epi64((__m128i *)(data + 7 * im_stride))); - - const __m128i src_0 = _mm_unpacklo_epi8(src_01, zero); - const __m128i src_2 = _mm_unpacklo_epi8(src_23, zero); - const __m128i src_4 = _mm_unpacklo_epi8(src_45, zero); - const __m128i src_6 = _mm_unpacklo_epi8(src_67, zero); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, 
coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpackhi_epi8(src_01, zero); - const __m128i src_3 = _mm_unpackhi_epi8(src_23, zero); - const __m128i src_5 = _mm_unpackhi_epi8(src_45, zero); - const __m128i src_7 = _mm_unpackhi_epi8(src_67, zero); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - const __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); - const __m128i res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); - - if (conv_params->use_jnt_comp_avg) { - // NOTE(chengchen): - // only this part is different from av1_convolve_2d_sse2 - // original c function at: av1/common/convolve.c: - // av1_convolve_2d_c() and av1_jnt_convolve_2d_c() - __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - if (do_average) { - _mm_storeu_si128( - p + 0, _mm_srai_epi32( - _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128(p + 0), - _mm_mullo_epi32( - res_lo_round, wt1)), - jnt_r), - DIST_PRECISION_BITS - 1)); - - _mm_storeu_si128( - p + 1, _mm_srai_epi32( - _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128(p + 1), - _mm_mullo_epi32( - res_hi_round, wt1)), - jnt_r), - DIST_PRECISION_BITS - 1)); - } else { - _mm_storeu_si128(p + 0, _mm_mullo_epi32(res_lo_round, wt0)); - _mm_storeu_si128(p + 1, _mm_mullo_epi32(res_hi_round, wt0)); - } - } else { - // Accumulate values into the 
destination buffer - __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - if (do_average) { - _mm_storeu_si128( - p + 0, _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round)); - _mm_storeu_si128( - p + 1, _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); - } else { - _mm_storeu_si128(p + 0, res_lo_round); - _mm_storeu_si128(p + 1, res_hi_round); - } - } - } - } - } -} -#else // CONFIG_COMPOUND_ROUND void av1_jnt_convolve_2d_sse4_1(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, @@ -450,5 +235,4 @@ } } } -#endif // CONFIG_COMPOUND_ROUND #endif // CONFIG_JNT_COMP
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c index a0e5871..93e7295 100644 --- a/av1/common/x86/convolve_avx2.c +++ b/av1/common/x86/convolve_avx2.c
@@ -14,7 +14,6 @@ #include "aom_dsp/aom_dsp_common.h" #include "./av1_rtcd.h" -#if CONFIG_CONVOLVE_ROUND static const uint32_t sindex[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; // 16 epi16 pixels @@ -339,4 +338,3 @@ } } #endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_CONVOLVE_ROUND
diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c index f137ef0..08ee8c3 100644 --- a/av1/common/x86/convolve_sse2.c +++ b/av1/common/x86/convolve_sse2.c
@@ -17,7 +17,6 @@ #include "aom_dsp/aom_filter.h" #include "av1/common/convolve.h" -#if !CONFIG_COMPOUND_ROUND void av1_convolve_y_sse2(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, @@ -207,4 +206,3 @@ } } } -#endif
diff --git a/av1/common/x86/highbd_convolve_2d_avx2.c b/av1/common/x86/highbd_convolve_2d_avx2.c index 7020763..c28c63d 100644 --- a/av1/common/x86/highbd_convolve_2d_avx2.c +++ b/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -18,227 +18,6 @@ #include "aom_dsp/aom_filter.h" #include "av1/common/convolve.h" -#if CONFIG_COMPOUND_ROUND -void av1_highbd_convolve_2d_avx2(const uint16_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { - DECLARE_ALIGNED(32, int16_t, - im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); - int im_h = h + filter_params_y->taps - 1; - int im_stride = MAX_SB_SIZE; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const int do_average = conv_params->do_average; - const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - - /* Horizontal filter */ - { - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - const __m128i coeffs_x8 = _mm_loadu_si128((__m128i *)x_filter); - // since not all compilers yet support _mm256_set_m128i() - const __m256i coeffs_x = _mm256_insertf128_si256( - _mm256_castsi128_si256(coeffs_x8), coeffs_x8, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m256i tmp_0 = _mm256_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m256i tmp_1 = _mm256_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m256i coeff_01 = _mm256_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m256i coeff_23 = _mm256_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m256i coeff_45 = _mm256_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m256i coeff_67 = _mm256_unpackhi_epi64(tmp_1, tmp_1); - - const __m256i round_const = - _mm256_set1_epi32((1 << conv_params->round_0) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); - - for (i = 0; i < im_h; ++i) { - for (j = 0; j < w; j += 16) { - const __m256i data = - 
_mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); - const __m128i data2_1 = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 16]); - const __m256i data2 = _mm256_insertf128_si256( - _mm256_castsi128_si256(data2_1), data2_1, 1); - - // Filter even-index pixels - const __m256i res_0 = _mm256_madd_epi16(data, coeff_01); - const __m256i res_2 = _mm256_madd_epi16( - _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13), - data, 4), - coeff_23); - const __m256i res_4 = _mm256_madd_epi16( - _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13), - data, 8), - coeff_45); - const __m256i res_6 = _mm256_madd_epi16( - _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13), - data, 12), - coeff_67); - - __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_4), - _mm256_add_epi32(res_2, res_6)); - res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const), - round_shift); - - // Filter odd-index pixels - const __m256i res_1 = _mm256_madd_epi16( - _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13), - data, 2), - coeff_01); - const __m256i res_3 = _mm256_madd_epi16( - _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13), - data, 6), - coeff_23); - const __m256i res_5 = _mm256_madd_epi16( - _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13), - data, 10), - coeff_45); - const __m256i res_7 = _mm256_madd_epi16( - _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13), - data, 14), - coeff_67); - - __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_5), - _mm256_add_epi32(res_3, res_7)); - res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const), - round_shift); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - const __m256i maxval = _mm256_set1_epi16((1 << bd) - 1); - __m256i res = _mm256_packs_epi32(res_even, res_odd); - res = _mm256_max_epi16(_mm256_min_epi16(res, maxval), - _mm256_setzero_si256()); - _mm_storeu_si128((__m128i *)&im_block[i * 
im_stride + j], - _mm256_extractf128_si256(res, 0)); - _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j + 8], - _mm256_extractf128_si256(res, 1)); - } - } - } - - /* Vertical filter */ - { - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); - const __m128i coeffs_y8 = _mm_loadu_si128((__m128i *)y_filter); - const __m256i coeffs_y = _mm256_insertf128_si256( - _mm256_castsi128_si256(coeffs_y8), coeffs_y8, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m256i tmp_0 = _mm256_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m256i tmp_1 = _mm256_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m256i coeff_01 = _mm256_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m256i coeff_23 = _mm256_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m256i coeff_45 = _mm256_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m256i coeff_67 = _mm256_unpackhi_epi64(tmp_1, tmp_1); - - const __m256i round_const = - _mm256_set1_epi32((1 << conv_params->round_1) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - // Filter even-index pixels - const int16_t *data = &im_block[i * im_stride + j]; - const __m256i src_0 = - _mm256_unpacklo_epi16(*(__m256i *)(data + 0 * im_stride), - *(__m256i *)(data + 1 * im_stride)); - const __m256i src_2 = - _mm256_unpacklo_epi16(*(__m256i *)(data + 2 * im_stride), - *(__m256i *)(data + 3 * im_stride)); - const __m256i src_4 = - _mm256_unpacklo_epi16(*(__m256i *)(data + 4 * im_stride), - *(__m256i *)(data + 5 * im_stride)); - const __m256i src_6 = - _mm256_unpacklo_epi16(*(__m256i *)(data + 6 * im_stride), - *(__m256i *)(data + 7 * im_stride)); - - const __m256i res_0 = _mm256_madd_epi16(src_0, coeff_01); - const __m256i res_2 = _mm256_madd_epi16(src_2, coeff_23); - const __m256i res_4 = 
_mm256_madd_epi16(src_4, coeff_45); - const __m256i res_6 = _mm256_madd_epi16(src_6, coeff_67); - - const __m256i res_even = _mm256_add_epi32( - _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m256i src_1 = - _mm256_unpackhi_epi16(*(__m256i *)(data + 0 * im_stride), - *(__m256i *)(data + 1 * im_stride)); - const __m256i src_3 = - _mm256_unpackhi_epi16(*(__m256i *)(data + 2 * im_stride), - *(__m256i *)(data + 3 * im_stride)); - const __m256i src_5 = - _mm256_unpackhi_epi16(*(__m256i *)(data + 4 * im_stride), - *(__m256i *)(data + 5 * im_stride)); - const __m256i src_7 = - _mm256_unpackhi_epi16(*(__m256i *)(data + 6 * im_stride), - *(__m256i *)(data + 7 * im_stride)); - - const __m256i res_1 = _mm256_madd_epi16(src_1, coeff_01); - const __m256i res_3 = _mm256_madd_epi16(src_3, coeff_23); - const __m256i res_5 = _mm256_madd_epi16(src_5, coeff_45); - const __m256i res_7 = _mm256_madd_epi16(src_7, coeff_67); - - const __m256i res_odd = _mm256_add_epi32( - _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 
7 - const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd); - const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd); - - const __m256i res_lo_round = _mm256_sra_epi32( - _mm256_add_epi32(res_lo, round_const), round_shift); - const __m256i res_hi_round = _mm256_sra_epi32( - _mm256_add_epi32(res_hi, round_const), round_shift); - - // Accumulate values into the destination buffer - __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - if (do_average) { - _mm_storeu_si128( - p + 0, _mm_add_epi32(_mm_loadu_si128(p + 0), - _mm256_extractf128_si256(res_lo_round, 0))); - _mm_storeu_si128( - p + 1, _mm_add_epi32(_mm_loadu_si128(p + 1), - _mm256_extractf128_si256(res_hi_round, 0))); - if (w - j > 8) { - _mm_storeu_si128(p + 2, _mm_add_epi32(_mm_loadu_si128(p + 2), - _mm256_extractf128_si256( - res_lo_round, 1))); - _mm_storeu_si128(p + 3, _mm_add_epi32(_mm_loadu_si128(p + 3), - _mm256_extractf128_si256( - res_hi_round, 1))); - } - } else { - _mm_storeu_si128(p + 0, _mm256_extractf128_si256(res_lo_round, 0)); - _mm_storeu_si128(p + 1, _mm256_extractf128_si256(res_hi_round, 0)); - if (w - j > 8) { - _mm_storeu_si128(p + 2, _mm256_extractf128_si256(res_lo_round, 1)); - _mm_storeu_si128(p + 3, _mm256_extractf128_si256(res_hi_round, 1)); - } - } - } - } - } -} -#else void av1_highbd_convolve_2d_avx2(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, @@ -458,4 +237,3 @@ } } } -#endif
diff --git a/av1/common/x86/highbd_convolve_2d_ssse3.c b/av1/common/x86/highbd_convolve_2d_ssse3.c index 195f0f5..95055b0 100644 --- a/av1/common/x86/highbd_convolve_2d_ssse3.c +++ b/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -18,188 +18,6 @@ #include "aom_dsp/aom_filter.h" #include "av1/common/convolve.h" -#if CONFIG_COMPOUND_ROUND -void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { - DECLARE_ALIGNED(16, int16_t, - im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); - int im_h = h + filter_params_y->taps - 1; - int im_stride = MAX_SB_SIZE; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const int do_average = conv_params->do_average; - const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - - /* Horizontal filter */ - { - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << conv_params->round_0) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); - - for (i = 0; i < im_h; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - const __m128i data2 = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); - - // Filter even-index pixels - const 
__m128i res_0 = _mm_madd_epi16(data, coeff_01); - const __m128i res_2 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); - const __m128i res_4 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); - const __m128i res_6 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = - _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); - - // Filter odd-index pixels - const __m128i res_1 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); - const __m128i res_3 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); - const __m128i res_5 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); - const __m128i res_7 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = - _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - const __m128i maxval = _mm_set1_epi16((1 << bd) - 1); - __m128i res = _mm_packs_epi32(res_even, res_odd); - res = _mm_max_epi16(_mm_min_epi16(res, maxval), _mm_setzero_si128()); - _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); - } - } - } - - /* Vertical filter */ - { - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); - const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, 
tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << conv_params->round_1) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const int16_t *data = &im_block[i * im_stride + j]; - const __m128i src_0 = - _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), - *(__m128i *)(data + 1 * im_stride)); - const __m128i src_2 = - _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), - *(__m128i *)(data + 3 * im_stride)); - const __m128i src_4 = - _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), - *(__m128i *)(data + 5 * im_stride)); - const __m128i src_6 = - _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), - *(__m128i *)(data + 7 * im_stride)); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = - _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), - *(__m128i *)(data + 1 * im_stride)); - const __m128i src_3 = - _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), - *(__m128i *)(data + 3 * im_stride)); - const __m128i src_5 = - _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), - *(__m128i *)(data + 5 * im_stride)); - const __m128i src_7 = - _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), - *(__m128i *)(data + 7 * im_stride)); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = 
_mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - const __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); - const __m128i res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); - - // Accumulate values into the destination buffer - __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - if (do_average) { - _mm_storeu_si128(p + 0, - _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round)); - _mm_storeu_si128(p + 1, - _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); - } else { - _mm_storeu_si128(p + 0, res_lo_round); - _mm_storeu_si128(p + 1, res_hi_round); - } - } - } - } -} -#else void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, @@ -383,4 +201,3 @@ } } } -#endif
diff --git a/av1/common/x86/highbd_warp_plane_sse4.c b/av1/common/x86/highbd_warp_plane_sse4.c index 7c358ec..d40a9696 100644 --- a/av1/common/x86/highbd_warp_plane_sse4.c +++ b/av1/common/x86/highbd_warp_plane_sse4.c
@@ -28,7 +28,6 @@ #error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter" #endif int i, j, k; -#if CONFIG_CONVOLVE_ROUND const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; const int reduce_bits_horiz = use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; @@ -46,10 +45,6 @@ const int jnt_round_const = 1 << (DIST_PRECISION_BITS - 2); const __m128i jnt_r = _mm_set1_epi32(jnt_round_const); #endif // CONFIG_JNT_COMP -#else - const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1; -#endif /* Note: For this code to work, the left/right frame borders need to be extended by at least 13 pixels each. By the time we get here, other @@ -310,7 +305,6 @@ __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); -#if CONFIG_CONVOLVE_ROUND if (use_conv_params) { __m128i *const p = (__m128i *)&conv_params @@ -369,9 +363,6 @@ #endif } } else { -#else - { -#endif // Round and pack into 8 bits const __m128i round_const = _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
diff --git a/av1/common/x86/highbd_warp_plane_ssse3.c b/av1/common/x86/highbd_warp_plane_ssse3.c index 71b0ec7..5eedf9a 100644 --- a/av1/common/x86/highbd_warp_plane_ssse3.c +++ b/av1/common/x86/highbd_warp_plane_ssse3.c
@@ -28,7 +28,6 @@ #error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter" #endif int i, j, k; -#if CONFIG_CONVOLVE_ROUND const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; const int reduce_bits_horiz = use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; @@ -38,10 +37,6 @@ conv_params->do_post_rounding = 1; } assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS); -#else - const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1; -#endif /* Note: For this code to work, the left/right frame borders need to be extended by at least 13 pixels each. By the time we get here, other @@ -302,7 +297,6 @@ __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); -#if CONFIG_CONVOLVE_ROUND if (use_conv_params) { __m128i *const p = (__m128i *)&conv_params @@ -324,9 +318,6 @@ _mm_storeu_si128(p + 1, res_hi); } } else { -#else - { -#endif // Round and pack into 8 bits const __m128i round_const = _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c index d30466a..6505d9a 100644 --- a/av1/common/x86/warp_plane_sse2.c +++ b/av1/common/x86/warp_plane_sse2.c
@@ -24,7 +24,6 @@ __m128i tmp[15]; int i, j, k; const int bd = 8; -#if CONFIG_CONVOLVE_ROUND const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; const int reduce_bits_horiz = use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; @@ -34,10 +33,6 @@ conv_params->do_post_rounding = 1; } assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS); -#else - const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1; -#endif /* Note: For this code to work, the left/right frame borders need to be extended by at least 13 pixels each. By the time we get here, other @@ -298,7 +293,6 @@ __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); -#if CONFIG_CONVOLVE_ROUND if (use_conv_params) { __m128i *const p = (__m128i *)&conv_params @@ -320,9 +314,6 @@ _mm_storeu_si128(p + 1, res_hi); } } else { -#else - { -#endif // Round and pack into 8 bits const __m128i round_const = _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
diff --git a/av1/common/x86/warp_plane_sse4.c b/av1/common/x86/warp_plane_sse4.c index 6385c17..e0d6206 100644 --- a/av1/common/x86/warp_plane_sse4.c +++ b/av1/common/x86/warp_plane_sse4.c
@@ -25,7 +25,6 @@ __m128i tmp[15]; int i, j, k; const int bd = 8; -#if CONFIG_CONVOLVE_ROUND const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; const int reduce_bits_horiz = use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; @@ -43,10 +42,6 @@ const int jnt_round_const = 1 << (DIST_PRECISION_BITS - 2); const __m128i jnt_r = _mm_set1_epi32(jnt_round_const); #endif // CONFIG_JNT_COMP -#else - const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1; -#endif /* Note: For this code to work, the left/right frame borders need to be extended by at least 13 pixels each. By the time we get here, other @@ -307,7 +302,6 @@ __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); -#if CONFIG_CONVOLVE_ROUND if (use_conv_params) { __m128i *const p = (__m128i *)&conv_params @@ -364,9 +358,6 @@ #endif // CONFIG_JNT_COMP } } else { -#else - { -#endif // Round and pack into 8 bits const __m128i round_const = _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
diff --git a/av1/common/x86/warp_plane_ssse3.c b/av1/common/x86/warp_plane_ssse3.c index 3986ad3..7bf3253 100644 --- a/av1/common/x86/warp_plane_ssse3.c +++ b/av1/common/x86/warp_plane_ssse3.c
@@ -211,7 +211,6 @@ __m128i tmp[15]; int i, j, k; const int bd = 8; -#if CONFIG_CONVOLVE_ROUND const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND; const int reduce_bits_horiz = use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS; @@ -221,10 +220,6 @@ conv_params->do_post_rounding = 1; } assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS); -#else - const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS; - const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1; -#endif /* Note: For this code to work, the left/right frame borders need to be extended by at least 13 pixels each. By the time we get here, other @@ -474,7 +469,6 @@ __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); -#if CONFIG_CONVOLVE_ROUND if (use_conv_params) { __m128i *const p = (__m128i *)&conv_params @@ -496,9 +490,6 @@ _mm_storeu_si128(p + 1, res_hi); } } else { -#else - { -#endif // Round and pack into 8 bits const __m128i round_const = _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake index 43d9e71..167c203 100644 --- a/build/cmake/aom_config_defaults.cmake +++ b/build/cmake/aom_config_defaults.cmake
@@ -114,8 +114,6 @@ set(CONFIG_CDEF_SINGLEPASS 1 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_CFL 1 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_COLORSPACE_HEADERS 0 CACHE NUMBER "AV1 experiment flag.") -set(CONFIG_COMPOUND_ROUND 0 CACHE NUMBER "AV1 experiment flag.") -set(CONFIG_CONVOLVE_ROUND 1 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_DAALA_TX 0 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_DAALA_TX16 0 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_DAALA_TX32 0 CACHE NUMBER "AV1 experiment flag.")
diff --git a/build/cmake/aom_experiment_deps.cmake b/build/cmake/aom_experiment_deps.cmake index 1a7e563..0555c63 100644 --- a/build/cmake/aom_experiment_deps.cmake +++ b/build/cmake/aom_experiment_deps.cmake
@@ -32,12 +32,6 @@ endif () endif () - if (CONFIG_COMPOUND_ROUND) - if (NOT CONFIG_CONVOLVE_ROUND) - change_config_and_warn(CONVOLVE_ROUND 1 CONFIG_COMPOUND_ROUND) - endif () - endif () - if (CONFIG_EOB_FIRST) if (NOT CONFIG_LV_MAP) change_config_and_warn(CONFIG_LV_MAP 1 CONFIG_EOB_FIRST)
diff --git a/configure b/configure index a13efcf..2c119c4 100755 --- a/configure +++ b/configure
@@ -252,8 +252,6 @@ rect_tx_ext_intra short_filter dual_filter - convolve_round - compound_round tx64x64 ext_intra filter_intra @@ -496,7 +494,6 @@ soft_enable intra_edge soft_enable mv_compress soft_enable dual_filter - soft_enable convolve_round soft_enable aom_qm soft_enable dist_8x8 soft_enable loop_restoration @@ -540,7 +537,6 @@ enabled lv_map_multi && soft_enable lv_map enabled eob_first && enable_feature lv_map enabled txk_sel && soft_enable lv_map - enabled compound_round && soft_enable convolve_round enabled ext_intra_mod && enable_feature intra_edge enabled intra_edge && enable_feature ext_intra enabled mfmv && enable_feature frame_marker
diff --git a/test/test.cmake b/test/test.cmake index a0cce48..e24f7e8 100644 --- a/test/test.cmake +++ b/test/test.cmake
@@ -237,24 +237,21 @@ "${AOM_ROOT}/test/quantize_func_test.cc") endif () - if (CONFIG_CONVOLVE_ROUND) + set(AOM_UNIT_TEST_ENCODER_SOURCES + ${AOM_UNIT_TEST_ENCODER_SOURCES} + "${AOM_ROOT}/test/convolve_round_test.cc") + if (HAVE_SSE2) set(AOM_UNIT_TEST_ENCODER_SOURCES ${AOM_UNIT_TEST_ENCODER_SOURCES} - "${AOM_ROOT}/test/convolve_round_test.cc") - if (HAVE_SSE2) - set(AOM_UNIT_TEST_ENCODER_SOURCES - ${AOM_UNIT_TEST_ENCODER_SOURCES} - "${AOM_ROOT}/test/av1_convolve_2d_test.cc" - "${AOM_ROOT}/test/av1_convolve_2d_test_util.cc" - "${AOM_ROOT}/test/av1_convolve_2d_test_util.h") - endif () - if (NOT CONFIG_COMPOUND_ROUND) - if (HAVE_SSE4_1) - set(AOM_UNIT_TEST_ENCODER_SOURCES - ${AOM_UNIT_TEST_ENCODER_SOURCES} - "${AOM_ROOT}/test/av1_convolve_scale_test.cc") - endif () - endif () + "${AOM_ROOT}/test/av1_convolve_2d_test.cc" + "${AOM_ROOT}/test/av1_convolve_2d_test_util.cc" + "${AOM_ROOT}/test/av1_convolve_2d_test_util.h") + endif () + + if (HAVE_SSE4_1) + set(AOM_UNIT_TEST_ENCODER_SOURCES + ${AOM_UNIT_TEST_ENCODER_SOURCES} + "${AOM_ROOT}/test/av1_convolve_scale_test.cc") endif () set(AOM_UNIT_TEST_ENCODER_SOURCES
diff --git a/test/test.mk b/test/test.mk index 2389a2f..2a7b9ae 100644 --- a/test/test.mk +++ b/test/test.mk
@@ -227,16 +227,11 @@ LIBAOM_TEST_SRCS-$(HAVE_SSE2) += hiprec_convolve_test_util.cc LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += selfguided_filter_test.cc endif -ifeq ($(CONFIG_CONVOLVE_ROUND),yes) LIBAOM_TEST_SRCS-$(HAVE_SSE2) += av1_convolve_2d_test_util.h LIBAOM_TEST_SRCS-$(HAVE_SSE2) += av1_convolve_2d_test.cc LIBAOM_TEST_SRCS-$(HAVE_SSE2) += av1_convolve_2d_test_util.cc LIBAOM_TEST_SRCS-yes += convolve_round_test.cc -endif - -ifeq (yesx,$(CONFIG_CONVOLVE_ROUND)x$(CONFIG_COMPOUND_ROUND)) LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += av1_convolve_scale_test.cc -endif ifeq ($(CONFIG_AV1_ENCODER),yes) LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += corner_match_test.cc
diff --git a/test/warp_filter_test.cc b/test/warp_filter_test.cc index ea052f8..c6fc23b 100644 --- a/test/warp_filter_test.cc +++ b/test/warp_filter_test.cc
@@ -22,7 +22,7 @@ namespace { -#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND && HAVE_SSE4_1 +#if CONFIG_JNT_COMP && HAVE_SSE4_1 TEST_P(AV1WarpFilterTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); } INSTANTIATE_TEST_CASE_P( @@ -38,7 +38,7 @@ libaom_test::AV1HighbdWarpFilter::GetDefaultParams()); #endif -#else // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND && HAVE_SSE4_1 +#else // CONFIG_JNT_COMP && HAVE_SSE4_1 TEST_P(AV1WarpFilterTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); } INSTANTIATE_TEST_CASE_P(
diff --git a/test/warp_filter_test_util.cc b/test/warp_filter_test_util.cc index c815bf6..72b35ba 100644 --- a/test/warp_filter_test_util.cc +++ b/test/warp_filter_test_util.cc
@@ -113,10 +113,8 @@ int32_t mat[8]; int16_t alpha, beta, gamma, delta; ConvolveParams conv_params = get_conv_params(0, 0, 0); -#if CONFIG_CONVOLVE_ROUND int32_t *dsta = new int32_t[output_n]; int32_t *dstb = new int32_t[output_n]; -#endif for (i = 0; i < num_iters; ++i) { // Generate an input block and extend its borders horizontally @@ -126,17 +124,15 @@ memset(input + r * stride - border, input[r * stride], border); memset(input + r * stride + w, input[r * stride + (w - 1)], border); } -#if CONFIG_CONVOLVE_ROUND + const int use_no_round = rnd_.Rand8() & 1; -#endif for (sub_x = 0; sub_x < 2; ++sub_x) for (sub_y = 0; sub_y < 2; ++sub_y) { generate_model(mat, &alpha, &beta, &gamma, &delta); -#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND +#if CONFIG_JNT_COMP for (int ii = 0; ii < 2; ++ii) { for (int jj = 0; jj < 5; ++jj) { -#endif // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND -#if CONFIG_CONVOLVE_ROUND +#endif // CONFIG_JNT_COMP if (use_no_round) { // Prepare two copies of the destination for (j = 0; j < out_w * out_h; ++j) { @@ -148,8 +144,7 @@ } else { conv_params = get_conv_params(0, 0, 0); } -#endif -#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND +#if CONFIG_JNT_COMP if (jj >= 4) { conv_params.use_jnt_comp_avg = 0; } else { @@ -157,17 +152,15 @@ conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; } -#endif // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND +#endif // CONFIG_JNT_COMP av1_warp_affine_c(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w, sub_x, sub_y, &conv_params, alpha, beta, gamma, delta); -#if CONFIG_CONVOLVE_ROUND if (use_no_round) { conv_params = get_conv_params_no_round(0, 0, 0, dstb, out_w); } -#endif -#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND +#if CONFIG_JNT_COMP if (jj >= 4) { conv_params.use_jnt_comp_avg = 0; } else { @@ -175,12 +168,11 @@ conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; } 
-#endif // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND +#endif // CONFIG_JNT_COMP test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h, out_w, sub_x, sub_y, &conv_params, alpha, beta, gamma, delta); -#if CONFIG_CONVOLVE_ROUND if (use_no_round) { for (j = 0; j < out_w * out_h; ++j) ASSERT_EQ(dsta[j], dstb[j]) @@ -192,25 +184,17 @@ << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", " << (j / out_w) << ") on iteration " << i; } -#else - for (j = 0; j < out_w * out_h; ++j) - ASSERT_EQ(output[j], output2[j]) - << "Pixel mismatch at index " << j << " = (" << (j % out_w) - << ", " << (j / out_w) << ") on iteration " << i; -#endif -#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND +#if CONFIG_JNT_COMP } } -#endif // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND +#endif // CONFIG_JNT_COMP } } delete[] input_; delete[] output; delete[] output2; -#if CONFIG_CONVOLVE_ROUND delete[] dsta; delete[] dstb; -#endif } } // namespace AV1WarpFilter @@ -320,10 +304,8 @@ int32_t mat[8]; int16_t alpha, beta, gamma, delta; ConvolveParams conv_params = get_conv_params(0, 0, 0); -#if CONFIG_CONVOLVE_ROUND int32_t *dsta = new int32_t[output_n]; int32_t *dstb = new int32_t[output_n]; -#endif for (i = 0; i < num_iters; ++i) { // Generate an input block and extend its borders horizontally @@ -335,17 +317,14 @@ input[r * stride + w + c] = input[r * stride + (w - 1)]; } } -#if CONFIG_CONVOLVE_ROUND const int use_no_round = rnd_.Rand8() & 1; -#endif for (sub_x = 0; sub_x < 2; ++sub_x) for (sub_y = 0; sub_y < 2; ++sub_y) { generate_model(mat, &alpha, &beta, &gamma, &delta); -#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND +#if CONFIG_JNT_COMP for (int ii = 0; ii < 2; ++ii) { for (int jj = 0; jj < 5; ++jj) { -#endif // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND -#if CONFIG_CONVOLVE_ROUND +#endif // CONFIG_JNT_COMP if (use_no_round) { // Prepare two copies of the destination for (j = 0; j < out_w * out_h; ++j) { @@ -357,8 +336,7 @@ } else { conv_params = get_conv_params(0, 0, 0); } -#endif 
-#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND +#if CONFIG_JNT_COMP if (jj >= 4) { conv_params.use_jnt_comp_avg = 0; } else { @@ -366,18 +344,16 @@ conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; } -#endif // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND +#endif // CONFIG_JNT_COMP av1_highbd_warp_affine_c(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w, sub_x, sub_y, bd, &conv_params, alpha, beta, gamma, delta); -#if CONFIG_CONVOLVE_ROUND if (use_no_round) { // TODO(angiebird): Change this to test_impl once we have SIMD // implementation conv_params = get_conv_params_no_round(0, 0, 0, dstb, out_w); } -#endif -#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND +#if CONFIG_JNT_COMP if (jj >= 4) { conv_params.use_jnt_comp_avg = 0; } else { @@ -385,12 +361,11 @@ conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; } -#endif // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND +#endif // CONFIG_JNT_COMP test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h, out_w, sub_x, sub_y, bd, &conv_params, alpha, beta, gamma, delta); -#if CONFIG_CONVOLVE_ROUND if (use_no_round) { for (j = 0; j < out_w * out_h; ++j) ASSERT_EQ(dsta[j], dstb[j]) @@ -402,26 +377,18 @@ << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", " << (j / out_w) << ") on iteration " << i; } -#else - for (j = 0; j < out_w * out_h; ++j) - ASSERT_EQ(output[j], output2[j]) - << "Pixel mismatch at index " << j << " = (" << (j % out_w) - << ", " << (j / out_w) << ") on iteration " << i; -#endif -#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND +#if CONFIG_JNT_COMP } } -#endif // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND +#endif // CONFIG_JNT_COMP } } delete[] input_; delete[] output; delete[] output2; -#if CONFIG_CONVOLVE_ROUND delete[] dsta; delete[] dstb; -#endif } } // namespace AV1HighbdWarpFilter #endif // CONFIG_HIGHBITDEPTH