Remove convolve_round/compound_round config flags
Merged the convolve_round experiment and removed its config flag from the code.
Removed the compound_round code path.
Change-Id: Ic01856732d75cca65d3866383d3cc1dd572f8863
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 1703ba0..249e1dc 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -735,9 +735,7 @@
#
# Alpha blending with mask
#
- if (aom_config("CONFIG_CONVOLVE_ROUND") eq "yes") {
- add_proto qw/void aom_blend_a64_d32_mask/, "int32_t *dst, uint32_t dst_stride, const int32_t *src0, uint32_t src0_stride, const int32_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
- }
+ add_proto qw/void aom_blend_a64_d32_mask/, "int32_t *dst, uint32_t dst_stride, const int32_t *src0, uint32_t src0_stride, const int32_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
diff --git a/aom_dsp/blend_a64_mask.c b/aom_dsp/blend_a64_mask.c
index 5cd3dac..384e81b 100644
--- a/aom_dsp/blend_a64_mask.c
+++ b/aom_dsp/blend_a64_mask.c
@@ -18,7 +18,6 @@
#include "./aom_dsp_rtcd.h"
-#if CONFIG_CONVOLVE_ROUND
// Blending with alpha mask. Mask values come from the range [0, 64],
// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
// be the same as dst, or dst can be different from both sources.
@@ -79,7 +78,6 @@
}
}
}
-#endif // CONFIG_CONVOLVE_ROUND
// Blending with alpha mask. Mask values come from the range [0, 64],
// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 74990ab..d3aadb4 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -300,53 +300,49 @@
"${AOM_ROOT}/av1/common/clpf_neon.c")
endif ()
-if (CONFIG_CONVOLVE_ROUND)
- set(AOM_AV1_COMMON_INTRIN_SSE2
- ${AOM_AV1_COMMON_INTRIN_SSE2}
- "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c")
+set(AOM_AV1_COMMON_INTRIN_SSE2
+ ${AOM_AV1_COMMON_INTRIN_SSE2}
+ "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c")
+set(AOM_AV1_COMMON_INTRIN_AVX2
+ ${AOM_AV1_COMMON_INTRIN_AVX2}
+ "${AOM_ROOT}/av1/common/x86/convolve_2d_avx2.c")
+
+if (CONFIG_HIGHBITDEPTH)
set(AOM_AV1_COMMON_INTRIN_AVX2
${AOM_AV1_COMMON_INTRIN_AVX2}
- "${AOM_ROOT}/av1/common/x86/convolve_2d_avx2.c")
+ "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c")
- if (CONFIG_HIGHBITDEPTH)
- set(AOM_AV1_COMMON_INTRIN_AVX2
- ${AOM_AV1_COMMON_INTRIN_AVX2}
- "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c")
+ set(AOM_AV1_COMMON_INTRIN_SSSE3
+ ${AOM_AV1_COMMON_INTRIN_SSSE3}
+ "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c")
+endif ()
- set(AOM_AV1_COMMON_INTRIN_SSSE3
- ${AOM_AV1_COMMON_INTRIN_SSSE3}
- "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c")
- endif ()
+if (CONFIG_JNT_COMP)
+ set(AOM_AV1_COMMON_INTRIN_SSE4_1
+ ${AOM_AV1_COMMON_INTRIN_SSE4_1}
+ "${AOM_ROOT}/av1/common/x86/convolve_2d_sse4.c")
+endif ()
- if (CONFIG_JNT_COMP)
- set(AOM_AV1_COMMON_INTRIN_SSE4_1
- ${AOM_AV1_COMMON_INTRIN_SSE4_1}
- "${AOM_ROOT}/av1/common/x86/convolve_2d_sse4.c")
- endif ()
-
- if(NOT CONFIG_COMPOUND_ROUND)
- set(AOM_AV1_COMMON_INTRIN_SSE4_1
- ${AOM_AV1_COMMON_INTRIN_SSE4_1}
- "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c")
- endif()
+set(AOM_AV1_COMMON_INTRIN_SSE4_1
+ ${AOM_AV1_COMMON_INTRIN_SSE4_1}
+ "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c")
set(AOM_AV1_COMMON_INTRIN_SSE2
${AOM_AV1_COMMON_INTRIN_SSE2}
"${AOM_ROOT}/av1/common/x86/convolve_sse2.c")
- set(AOM_AV1_COMMON_INTRIN_AVX2
- ${AOM_AV1_COMMON_INTRIN_AVX2}
- "${AOM_ROOT}/av1/common/x86/convolve_avx2.c")
-endif ()
+set(AOM_AV1_COMMON_INTRIN_AVX2
+ ${AOM_AV1_COMMON_INTRIN_AVX2}
+ "${AOM_ROOT}/av1/common/x86/convolve_avx2.c")
- set(AOM_AV1_ENCODER_SOURCES
- ${AOM_AV1_ENCODER_SOURCES}
- "${AOM_ROOT}/av1/encoder/wedge_utils.c")
+set(AOM_AV1_ENCODER_SOURCES
+ ${AOM_AV1_ENCODER_SOURCES}
+ "${AOM_ROOT}/av1/encoder/wedge_utils.c")
- set(AOM_AV1_ENCODER_INTRIN_SSE2
- ${AOM_AV1_ENCODER_INTRIN_SSE2}
- "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
+set(AOM_AV1_ENCODER_INTRIN_SSE2
+ ${AOM_AV1_ENCODER_INTRIN_SSE2}
+ "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
if (CONFIG_ACCOUNTING)
set(AOM_AV1_DECODER_SOURCES
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index cbff82d..bc3afa1 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -79,9 +79,7 @@
AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d_cfg.h
AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/convolve_avx2.c
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/av1_convolve_ssse3.c
-ifeq ($(CONFIG_CONVOLVE_ROUND)x$(CONFIG_COMPOUND_ROUND),yesx)
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_convolve_scale_sse4.c
-endif
ifeq ($(CONFIG_HIGHBITDEPTH),yes)
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_sse4.c
endif
@@ -161,7 +159,6 @@
endif
endif
-ifeq ($(CONFIG_CONVOLVE_ROUND),yes)
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/convolve_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/convolve_2d_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/convolve_2d_sse4.c
@@ -170,7 +167,6 @@
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/highbd_convolve_2d_ssse3.c
AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/highbd_convolve_2d_avx2.c
endif
-endif
ifeq ($(CONFIG_LV_MAP),yes)
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/txb_sse2.c
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 3676d8f..f76f79a 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -575,48 +575,37 @@
}
# CONVOLVE_ROUND/COMPOUND_ROUND functions
+add_proto qw/void av1_convolve_2d/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+specialize qw/av1_convolve_2d sse2 avx2/;
+add_proto qw/void av1_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits";
+specialize qw/av1_convolve_rounding avx2/;
-if (aom_config("CONFIG_CONVOLVE_ROUND") eq "yes") {
- add_proto qw/void av1_convolve_2d/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
- specialize qw/av1_convolve_2d sse2 avx2/;
- add_proto qw/void av1_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits";
- specialize qw/av1_convolve_rounding avx2/;
+add_proto qw/void av1_convolve_2d_copy/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+specialize qw/av1_convolve_2d_copy sse2/;
+add_proto qw/void av1_convolve_x/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+specialize qw/av1_convolve_x sse2/;
+add_proto qw/void av1_convolve_y/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+specialize qw/av1_convolve_y sse2/;
- if (aom_config("CONFIG_COMPOUND_ROUND") ne "yes") {
- add_proto qw/void av1_convolve_2d_copy/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
- specialize qw/av1_convolve_2d_copy sse2/;
- add_proto qw/void av1_convolve_x/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
- specialize qw/av1_convolve_x sse2/;
- add_proto qw/void av1_convolve_y/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
- specialize qw/av1_convolve_y sse2/;
- }
+add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
+specialize qw/av1_convolve_2d_scale sse4_1/;
- add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
- if (aom_config("CONFIG_COMPOUND_ROUND") ne "yes") {
- specialize qw/av1_convolve_2d_scale sse4_1/;
- }
+if (aom_config("CONFIG_JNT_COMP") eq "yes") {
+ add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+ specialize qw/av1_jnt_convolve_2d sse4_1/;
- if (aom_config("CONFIG_JNT_COMP") eq "yes") {
- add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
- specialize qw/av1_jnt_convolve_2d sse4_1/;
+ add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+ specialize qw/av1_jnt_convolve_2d_copy sse2/;
+}
- if (aom_config("CONFIG_COMPOUND_ROUND") ne "yes") {
- add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
- specialize qw/av1_jnt_convolve_2d_copy sse2/;
- }
- }
+if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_convolve_2d/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+ specialize qw/av1_highbd_convolve_2d ssse3 avx2/;
+ add_proto qw/void av1_highbd_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd";
+ specialize qw/av1_highbd_convolve_rounding avx2/;
- if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
- add_proto qw/void av1_highbd_convolve_2d/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
- specialize qw/av1_highbd_convolve_2d ssse3 avx2/;
- add_proto qw/void av1_highbd_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd";
- specialize qw/av1_highbd_convolve_rounding avx2/;
-
- add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
- if (aom_config("CONFIG_COMPOUND_ROUND") ne "yes") {
- specialize qw/av1_highbd_convolve_2d_scale sse4_1/;
- }
- }
+ add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
+ specialize qw/av1_highbd_convolve_2d_scale sse4_1/;
}
# INTRA_EDGE functions
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 74b7085..d0b747e 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -357,7 +357,6 @@
}
}
-#if CONFIG_CONVOLVE_ROUND
void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h, int bits) {
for (int r = 0; r < h; ++r) {
@@ -368,190 +367,6 @@
}
}
-#if CONFIG_COMPOUND_ROUND
-void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
- int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- uint8_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = w;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
-
- // horizontal filter
- const uint8_t *src_horiz = src - fo_vert * src_stride;
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- for (int y = 0; y < im_h; ++y) {
- for (int x = 0; x < w; ++x) {
- int32_t sum = 0;
- for (int k = 0; k < filter_params_x->taps; ++k) {
- sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
- }
- im_block[y * im_stride + x] =
- clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0));
- }
- }
-
- // vertical filter
- uint8_t *src_vert = im_block + fo_vert * im_stride;
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- for (int y = 0; y < h; ++y) {
- for (int x = 0; x < w; ++x) {
- CONV_BUF_TYPE sum = 0;
- for (int k = 0; k < filter_params_y->taps; ++k) {
- sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
- }
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
- if (conv_params->do_average)
- dst[y * dst_stride + x] += res;
- else
- dst[y * dst_stride + x] = res;
- }
- }
-}
-
-#if CONFIG_JNT_COMP
-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- uint8_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = w;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
-
- // horizontal filter
- const uint8_t *src_horiz = src - fo_vert * src_stride;
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- for (int y = 0; y < im_h; ++y) {
- for (int x = 0; x < w; ++x) {
- int32_t sum = 0;
- for (int k = 0; k < filter_params_x->taps; ++k) {
- sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
- }
- im_block[y * im_stride + x] =
- clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0));
- }
- }
-
- // vertical filter
- uint8_t *src_vert = im_block + fo_vert * im_stride;
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- for (int y = 0; y < h; ++y) {
- for (int x = 0; x < w; ++x) {
- CONV_BUF_TYPE sum = 0;
- for (int k = 0; k < filter_params_y->taps; ++k) {
- sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
- }
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average == 0) {
- dst[y * dst_stride + x] = res * conv_params->fwd_offset;
- } else {
- dst[y * dst_stride + x] += res * conv_params->bck_offset;
-
- dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x],
- DIST_PRECISION_BITS - 1);
- }
- } else {
- if (conv_params->do_average)
- dst[y * dst_stride + x] += res;
- else
- dst[y * dst_stride + x] = res;
- }
- }
- }
-}
-#endif // CONFIG_JNT_COMP
-
-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_qn, const int x_step_qn,
- const int subpel_y_qn, const int y_step_qn,
- ConvolveParams *conv_params) {
- uint8_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
- int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
- filter_params_y->taps;
- int im_stride = w;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
-
- // horizontal filter
- const uint8_t *src_horiz = src - fo_vert * src_stride;
- for (int y = 0; y < im_h; ++y) {
- int x_qn = subpel_x_qn;
- for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
- const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
- const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
- assert(x_filter_idx < SUBPEL_SHIFTS);
- const int16_t *x_filter =
- av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
- int sum = 0;
- for (int k = 0; k < filter_params_x->taps; ++k)
- sum += x_filter[k] * src_x[k - fo_horiz];
- im_block[y * im_stride + x] =
- clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0));
- }
- src_horiz += src_stride;
- }
-
- // vertical filter
- const uint8_t *src_vert = im_block + fo_vert * im_stride;
- for (int x = 0; x < w; ++x) {
- int y_qn = subpel_y_qn;
- for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
- const uint8_t *const src_y =
- &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
- const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
- assert(y_filter_idx < SUBPEL_SHIFTS);
- const int16_t *y_filter =
- av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
- CONV_BUF_TYPE sum = 0;
- for (int k = 0; k < filter_params_y->taps; ++k) {
- sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
- }
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
-#if CONFIG_JNT_COMP
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average == 0) {
- dst[y * dst_stride + x] = res * conv_params->fwd_offset;
- } else {
- dst[y * dst_stride + x] += res * conv_params->bck_offset;
-
- dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x],
- DIST_PRECISION_BITS - 1);
- }
- } else {
- if (conv_params->do_average)
- dst[y * dst_stride + x] += res;
- else
- dst[y * dst_stride + x] = res;
- }
-#else
- if (conv_params->do_average)
- dst[y * dst_stride + x] += res;
- else
- dst[y * dst_stride + x] = res;
-#endif // CONFIG_JNT_COMP
- }
- src_vert++;
- }
-}
-
-#else
-
/* When convolve-round is enabled and compound-round is disabled, we use a
high-precision convolve filter.
Note: For notes on hardware implementations, including the required
@@ -877,7 +692,6 @@
src_vert++;
}
}
-#endif // CONFIG_COMPOUND_ROUND
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
@@ -947,12 +761,6 @@
&filter_params_y, subpel_x_q4, x_step_q4,
subpel_y_q4, y_step_q4, conv_params);
} else {
-#if CONFIG_COMPOUND_ROUND
- av1_jnt_convolve_2d(src, src_stride, conv_params->dst,
- conv_params->dst_stride, w, h, &filter_params_x,
- &filter_params_y, subpel_x_q4, subpel_y_q4,
- conv_params);
-#else
if (subpel_x_q4 == 0 && subpel_y_q4 == 0) {
av1_jnt_convolve_2d_copy(src, src_stride, conv_params->dst,
conv_params->dst_stride, w, h,
@@ -976,7 +784,6 @@
&filter_params_y, subpel_x_q4, subpel_y_q4,
conv_params);
}
-#endif // CONFIG_COMPOUND_ROUND
}
#else
if (scaled) {
@@ -985,11 +792,6 @@
&filter_params_y, subpel_x_q4, x_step_q4,
subpel_y_q4, y_step_q4, conv_params);
} else {
-#if CONFIG_COMPOUND_ROUND
- av1_convolve_2d(src, src_stride, conv_params->dst,
- conv_params->dst_stride, w, h, &filter_params_x,
- &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
-#else
// Special case convolve functions should produce the same result as
// av1_convolve_2d.
if (subpel_x_q4 == 0 && subpel_y_q4 == 0) {
@@ -1012,7 +814,6 @@
&filter_params_y, subpel_x_q4, subpel_y_q4,
conv_params);
}
-#endif // CONFIG_COMPOUND_ROUND
}
#endif // CONFIG_JNT_COMP
}
@@ -1031,114 +832,6 @@
}
}
-#if CONFIG_COMPOUND_ROUND
-void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = w;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
-
- // horizontal filter
- const uint16_t *src_horiz = src - fo_vert * src_stride;
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- for (int y = 0; y < im_h; ++y) {
- for (int x = 0; x < w; ++x) {
- int32_t sum = 0;
- for (int k = 0; k < filter_params_x->taps; ++k) {
- sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
- }
- im_block[y * im_stride + x] =
- clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, conv_params->round_0), bd);
- }
- }
-
- // vertical filter
- uint16_t *src_vert = im_block + fo_vert * im_stride;
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- for (int y = 0; y < h; ++y) {
- for (int x = 0; x < w; ++x) {
- CONV_BUF_TYPE sum = 0;
- for (int k = 0; k < filter_params_y->taps; ++k) {
- sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
- }
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
- if (conv_params->do_average)
- dst[y * dst_stride + x] += res;
- else
- dst[y * dst_stride + x] = res;
- }
- }
-}
-
-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_qn, const int x_step_qn,
- const int subpel_y_qn, const int y_step_qn,
- ConvolveParams *conv_params, int bd) {
- uint16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
- int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
- filter_params_y->taps;
- int im_stride = w;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- (void)bd;
-
- // horizontal filter
- const uint16_t *src_horiz = src - fo_vert * src_stride;
- for (int y = 0; y < im_h; ++y) {
- int x_qn = subpel_x_qn;
- for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
- const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
- const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
- assert(x_filter_idx < SUBPEL_SHIFTS);
- const int16_t *x_filter =
- av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
- int sum = 0;
- for (int k = 0; k < filter_params_x->taps; ++k)
- sum += x_filter[k] * src_x[k - fo_horiz];
- im_block[y * im_stride + x] =
- clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0));
- }
- src_horiz += src_stride;
- }
-
- // vertical filter
- uint16_t *src_vert = im_block + fo_vert * im_stride;
- for (int x = 0; x < w; ++x) {
- int y_qn = subpel_y_qn;
- for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
- const uint16_t *const src_y =
- &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
- const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
- assert(y_filter_idx < SUBPEL_SHIFTS);
- const int16_t *y_filter =
- av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
- CONV_BUF_TYPE sum = 0;
- for (int k = 0; k < filter_params_y->taps; ++k) {
- sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
- }
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
- if (conv_params->do_average)
- dst[y * dst_stride + x] += res;
- else
- dst[y * dst_stride + x] = res;
- }
- src_vert++;
- }
-}
-
-#else
-
void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride,
CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
InterpFilterParams *filter_params_x,
@@ -1253,7 +946,6 @@
src_vert++;
}
}
-#endif // CONFIG_COMPOUND_ROUND
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
@@ -1318,8 +1010,6 @@
}
#endif // CONFIG_HIGHBITDEPTH
-#endif // CONFIG_CONVOLVE_ROUND
-
typedef void (*ConvolveFunc)(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams filter_params,
diff --git a/av1/common/convolve.h b/av1/common/convolve.h
index 607532b..8803ffa 100644
--- a/av1/common/convolve.h
+++ b/av1/common/convolve.h
@@ -99,7 +99,6 @@
struct AV1Common;
void av1_convolve_init(struct AV1Common *cm);
-#if CONFIG_CONVOLVE_ROUND
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
InterpFilters interp_filters, const int subpel_x_q4,
@@ -113,11 +112,7 @@
conv_params.ref = ref;
conv_params.do_average = do_average;
conv_params.round = CONVOLVE_OPT_NO_ROUND;
-#if CONFIG_COMPOUND_ROUND
- conv_params.round_0 = FILTER_BITS;
-#else
conv_params.round_0 = 5;
-#endif
conv_params.round_1 = 0;
conv_params.dst = dst;
conv_params.dst_stride = dst_stride;
@@ -135,7 +130,6 @@
int scaled, ConvolveParams *conv_params,
int bd);
#endif
-#endif // CONFIG_CONVOLVE_ROUND
void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h, InterpFilters interp_filters,
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index 1e3ccaa..c3737ef 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -403,7 +403,6 @@
#elif COMPOUND_SEGMENT_TYPE == 1
#define DIFF_FACTOR 16
-#if CONFIG_CONVOLVE_ROUND
static void diffwtd_mask_d32(uint8_t *mask, int which_inverse, int mask_base,
const int32_t *src0, int src0_stride,
const int32_t *src1, int src1_stride,
@@ -441,7 +440,6 @@
default: assert(0);
}
}
-#endif // CONFIG_CONVOLVE_ROUND
static void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base,
const uint8_t *src0, int src0_stride,
@@ -691,7 +689,6 @@
init_wedge_masks();
}
-#if CONFIG_CONVOLVE_ROUND
static void build_masked_compound_no_round(
CONV_BUF_TYPE *dst, int dst_stride, const CONV_BUF_TYPE *src0,
int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride,
@@ -705,7 +702,7 @@
aom_blend_a64_d32_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
mask, block_size_wide[sb_type], h, w, subh, subw);
}
-#endif // CONFIG_CONVOLVE_ROUND
+
static void build_masked_compound(
uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
const uint8_t *src1, int src1_stride,
@@ -756,17 +753,12 @@
// a temporary buffer, then will blend that temporary buffer with that from
// the other reference.
//
-// With CONFIG_CONVOLVE_ROUND, if the rounding mode is CONVOLVE_OPT_NO_ROUND
+// If the rounding mode is CONVOLVE_OPT_NO_ROUND
// then the predictions are at 32-bits, so we'll need 32 bits per
// pixel. Otherwise, we'll need up to 16 bits per pixel if
// CONFIG_HIGHBITDEPTH or just 8 otherwise.
-#if CONFIG_CONVOLVE_ROUND
#define INTER_PRED_BYTES_PER_PIXEL 4
-#elif CONFIG_HIGHBITDEPTH
-#define INTER_PRED_BYTES_PER_PIXEL 2
-#else
-#define INTER_PRED_BYTES_PER_PIXEL 1
-#endif
+
DECLARE_ALIGNED(16, uint8_t,
tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]);
#undef INTER_PRED_BYTES_PER_PIXEL
@@ -779,7 +771,6 @@
uint8_t *tmp_dst = tmp_buf;
#endif
-#if CONFIG_CONVOLVE_ROUND
const int tmp_buf_stride = MAX_SB_SIZE;
const int is_conv_no_round = conv_params->round == CONVOLVE_OPT_NO_ROUND;
CONV_BUF_TYPE *org_dst = conv_params->dst;
@@ -790,7 +781,6 @@
conv_params->dst_stride = tmp_buf_stride;
assert(conv_params->do_average == 0);
}
-#endif // CONFIG_CONVOLVE_ROUND
// This will generate a prediction in tmp_buf for the second reference
av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_x,
@@ -799,14 +789,12 @@
xd);
if (!plane && comp_data.interinter_compound_type == COMPOUND_SEG) {
-#if CONFIG_CONVOLVE_ROUND
if (is_conv_no_round) {
build_compound_seg_mask_d32(comp_data.seg_mask, comp_data.mask_type,
org_dst, org_dst_stride, tmp_buf32,
tmp_buf_stride, mi->mbmi.sb_type, h, w,
conv_params, xd->bd);
} else {
-#endif // CONFIG_CONVOLVE_ROUND
#if CONFIG_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
build_compound_seg_mask_highbd(comp_data.seg_mask, comp_data.mask_type,
@@ -820,12 +808,9 @@
#if CONFIG_HIGHBITDEPTH
}
#endif
-#if CONFIG_CONVOLVE_ROUND
}
-#endif
}
-#if CONFIG_CONVOLVE_ROUND
if (is_conv_no_round) {
build_masked_compound_no_round(org_dst, org_dst_stride, org_dst,
org_dst_stride, tmp_buf32, tmp_buf_stride,
@@ -844,8 +829,6 @@
conv_params->do_post_rounding = 0;
} else {
-#endif // CONFIG_CONVOLVE_ROUND
-
#if CONFIG_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
build_masked_compound_highbd(dst, dst_stride, dst, dst_stride, tmp_dst,
@@ -855,9 +838,7 @@
#endif // CONFIG_HIGHBITDEPTH
build_masked_compound(dst, dst_stride, dst, dst_stride, tmp_dst,
MAX_SB_SIZE, &comp_data, mi->mbmi.sb_type, h, w);
-#if CONFIG_CONVOLVE_ROUND
}
-#endif // CONFIG_CONVOLVE_ROUND
}
// TODO(sarahparker) av1_highbd_build_inter_predictor and
@@ -1038,17 +1019,11 @@
for (idx = 0; idx < b8_w; idx += b4_w) {
MB_MODE_INFO *this_mbmi = &xd->mi[row * xd->mi_stride + col]->mbmi;
is_compound = has_second_ref(this_mbmi);
-#if CONFIG_CONVOLVE_ROUND
DECLARE_ALIGNED(16, int32_t, tmp_dst[8 * 8]);
int tmp_dst_stride = 8;
assert(w <= 8 && h <= 8);
-#endif // CONFIG_CONVOLVE_ROUND
-#if CONFIG_CONVOLVE_ROUND
ConvolveParams conv_params =
get_conv_params_no_round(0, 0, plane, tmp_dst, tmp_dst_stride);
-#else
- ConvolveParams conv_params = get_conv_params(0, 0, plane);
-#endif
#if CONFIG_JNT_COMP
conv_params.use_jnt_comp_avg = 0;
#endif // CONFIG_JNT_COMP
@@ -1153,7 +1128,6 @@
(mi_y >> pd->subsampling_y) + y, plane, ref, mi, build_for_obmc,
xs, ys, xd);
} // for (ref = 0; ref < 1 + is_compound; ++ref)
-#if CONFIG_CONVOLVE_ROUND
if (conv_params.do_post_rounding) {
#if CONFIG_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
@@ -1169,7 +1143,6 @@
FILTER_BITS * 2 + is_compound - conv_params.round_0 -
conv_params.round_1);
}
-#endif // CONFIG_CONVOLVE_ROUND
++col;
}
++row;
@@ -1184,9 +1157,7 @@
uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
uint8_t *pre[2];
SubpelParams subpel_params[2];
-#if CONFIG_CONVOLVE_ROUND
DECLARE_ALIGNED(16, int32_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
-#endif // CONFIG_CONVOLVE_ROUND
for (ref = 0; ref < 1 + is_compound; ++ref) {
#if CONFIG_INTRABC
@@ -1251,7 +1222,6 @@
}
}
-#if CONFIG_CONVOLVE_ROUND
ConvolveParams conv_params =
get_conv_params_no_round(ref, ref, plane, tmp_dst, MAX_SB_SIZE);
#if CONFIG_JNT_COMP
@@ -1260,10 +1230,6 @@
&conv_params.use_jnt_comp_avg, is_compound);
#endif // CONFIG_JNT_COMP
-#else
- ConvolveParams conv_params = get_conv_params(ref, ref, plane);
-#endif // CONFIG_CONVOLVE_ROUND
-
for (ref = 0; ref < 1 + is_compound; ++ref) {
#if CONFIG_INTRABC
const struct scale_factors *const sf =
@@ -1301,7 +1267,6 @@
subpel_params[ref].ys, xd);
}
-#if CONFIG_CONVOLVE_ROUND
// TODO(angiebird): This part needs optimization
if (conv_params.do_post_rounding) {
#if CONFIG_HIGHBITDEPTH
@@ -1317,7 +1282,6 @@
FILTER_BITS * 2 + is_compound -
conv_params.round_0 - conv_params.round_1);
}
-#endif // CONFIG_CONVOLVE_ROUND
}
}
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index 7c2883d..1790086 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -55,14 +55,10 @@
// TODO(afergs, debargha): Use a different scale convolve function
// that uses higher precision for subpel_x, subpel_y, xs, ys
if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
-#if CONFIG_CONVOLVE_ROUND
av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
interp_filters, subpel_x, xs, subpel_y, ys, 1,
conv_params);
conv_params->do_post_rounding = 1;
-#else
- assert(0);
-#endif // CONFIG_CONVOLVE_ROUND
} else {
assert(conv_params->round == CONVOLVE_OPT_ROUND);
av1_convolve_scale(src, src_stride, dst, dst_stride, w, h, interp_filters,
@@ -78,14 +74,10 @@
assert(xs <= SUBPEL_SHIFTS);
assert(ys <= SUBPEL_SHIFTS);
if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
-#if CONFIG_CONVOLVE_ROUND
av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
interp_filters, subpel_x, xs, subpel_y, ys, 0,
conv_params);
conv_params->do_post_rounding = 1;
-#else
- assert(0);
-#endif // CONFIG_CONVOLVE_ROUND
} else {
assert(conv_params->round == CONVOLVE_OPT_ROUND);
@@ -131,14 +123,10 @@
if (has_scale(xs, ys)) {
if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
-#if CONFIG_CONVOLVE_ROUND
av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
interp_filters, subpel_x, xs, subpel_y, ys,
1, conv_params, bd);
conv_params->do_post_rounding = 1;
-#else
- assert(0);
-#endif // CONFIG_CONVOLVE_ROUND
} else {
av1_highbd_convolve_scale(src, src_stride, dst, dst_stride, w, h,
interp_filters, subpel_x, xs, subpel_y, ys, avg,
@@ -154,14 +142,10 @@
assert(xs <= SUBPEL_SHIFTS);
assert(ys <= SUBPEL_SHIFTS);
if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
-#if CONFIG_CONVOLVE_ROUND
av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
interp_filters, subpel_x, xs, subpel_y, ys,
0, conv_params, bd);
conv_params->do_post_rounding = 1;
-#else
- assert(0);
-#endif // CONFIG_CONVOLVE_ROUND
} else {
InterpFilterParams filter_params_x, filter_params_y;
#if CONFIG_SHORT_FILTER
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 1a9c6dc..962f140 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -427,7 +427,6 @@
int16_t beta, int16_t gamma, int16_t delta) {
int32_t tmp[15 * 8];
int i, j, k, l, m;
-#if CONFIG_CONVOLVE_ROUND
const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
const int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
@@ -445,14 +444,6 @@
conv_params->do_post_rounding = 1;
}
assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
-#else
- const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
- const int max_bits_horiz =
- bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
- const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
- const int offset_bits_vert =
- bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
-#endif
(void)max_bits_horiz;
for (i = p_row; i < p_row + p_height; i += 8) {
@@ -524,7 +515,7 @@
for (m = 0; m < 8; ++m) {
sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
}
-#if CONFIG_CONVOLVE_ROUND
+
if (use_conv_params) {
CONV_BUF_TYPE *p =
&conv_params
@@ -555,9 +546,6 @@
*p = sum;
#endif // CONFIG_JNT_COMP
} else {
-#else
- {
-#endif
uint16_t *p =
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
@@ -748,7 +736,6 @@
int32_t tmp[15 * 8];
int i, j, k, l, m;
const int bd = 8;
-#if CONFIG_CONVOLVE_ROUND
const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
const int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
@@ -766,14 +753,6 @@
conv_params->do_post_rounding = 1;
}
assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
-#else
- const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
- const int max_bits_horiz =
- bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
- const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
- const int offset_bits_vert =
- bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
-#endif
(void)max_bits_horiz;
for (i = p_row; i < p_row + p_height; i += 8) {
@@ -851,7 +830,7 @@
for (m = 0; m < 8; ++m) {
sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
}
-#if CONFIG_CONVOLVE_ROUND
+
if (use_conv_params) {
CONV_BUF_TYPE *p =
&conv_params
@@ -882,9 +861,6 @@
*p = sum;
#endif // CONFIG_JNT_COMP
} else {
-#else
- {
-#endif
uint8_t *p =
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
diff --git a/av1/common/x86/convolve_2d_avx2.c b/av1/common/x86/convolve_2d_avx2.c
index ff8ade8..3c1a24d 100644
--- a/av1/common/x86/convolve_2d_avx2.c
+++ b/av1/common/x86/convolve_2d_avx2.c
@@ -17,239 +17,6 @@
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
-#if CONFIG_COMPOUND_ROUND
-void av1_convolve_2d_avx2(const uint8_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- DECLARE_ALIGNED(32, uint8_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = MAX_SB_SIZE;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const int do_average = conv_params->do_average;
- const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
- const __m256i zero = _mm256_setzero_si256();
-
- /* Horizontal filter */
- {
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- const __m128i coeffs_x8 = _mm_loadu_si128((__m128i *)x_filter);
- // since not all compilers yet support _mm256_set_m128i()
- const __m256i coeffs_x = _mm256_insertf128_si256(
- _mm256_castsi128_si256(coeffs_x8), coeffs_x8, 1);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m256i tmp_0 = _mm256_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs 4 5 4 5 6 7 6 7
- const __m256i tmp_1 = _mm256_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m256i coeff_01 = _mm256_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m256i coeff_23 = _mm256_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m256i coeff_45 = _mm256_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m256i coeff_67 = _mm256_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m256i round_const =
- _mm256_set1_epi32((1 << conv_params->round_0) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
- for (i = 0; i < im_h; ++i) {
- for (j = 0; j < w; j += 16) {
- const __m256i data = _mm256_permute4x64_epi64(
- _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]),
- _MM_SHUFFLE(2, 1, 1, 0));
-
- // Filter even-index pixels
- const __m256i src_0 = _mm256_unpacklo_epi8(data, zero);
- const __m256i res_0 = _mm256_madd_epi16(src_0, coeff_01);
- const __m256i src_2 =
- _mm256_unpacklo_epi8(_mm256_srli_si256(data, 2), zero);
- const __m256i res_2 = _mm256_madd_epi16(src_2, coeff_23);
- const __m256i src_4 =
- _mm256_unpacklo_epi8(_mm256_srli_si256(data, 4), zero);
- const __m256i res_4 = _mm256_madd_epi16(src_4, coeff_45);
- const __m256i src_6 =
- _mm256_unpacklo_epi8(_mm256_srli_si256(data, 6), zero);
- const __m256i res_6 = _mm256_madd_epi16(src_6, coeff_67);
-
- __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_4),
- _mm256_add_epi32(res_2, res_6));
- res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const),
- round_shift);
-
- // Filter odd-index pixels
- const __m256i src_1 =
- _mm256_unpacklo_epi8(_mm256_srli_si256(data, 1), zero);
- const __m256i res_1 = _mm256_madd_epi16(src_1, coeff_01);
- const __m256i src_3 =
- _mm256_unpacklo_epi8(_mm256_srli_si256(data, 3), zero);
- const __m256i res_3 = _mm256_madd_epi16(src_3, coeff_23);
- const __m256i src_5 =
- _mm256_unpacklo_epi8(_mm256_srli_si256(data, 5), zero);
- const __m256i res_5 = _mm256_madd_epi16(src_5, coeff_45);
- const __m256i src_7 =
- _mm256_unpacklo_epi8(_mm256_srli_si256(data, 7), zero);
- const __m256i res_7 = _mm256_madd_epi16(src_7, coeff_67);
-
- __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_5),
- _mm256_add_epi32(res_3, res_7));
- res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const),
- round_shift);
-
- // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
- __m256i res = _mm256_packs_epi32(res_even, res_odd);
- res = _mm256_packus_epi16(res, res);
- _mm_storel_epi64((__m128i *)&im_block[i * im_stride + j],
- _mm256_extractf128_si256(res, 0));
- _mm_storel_epi64((__m128i *)&im_block[i * im_stride + j + 8],
- _mm256_extractf128_si256(res, 1));
- }
- }
- }
-
- /* Vertical filter */
- {
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- const __m128i coeffs_y8 = _mm_loadu_si128((__m128i *)y_filter);
- const __m256i coeffs_y = _mm256_insertf128_si256(
- _mm256_castsi128_si256(coeffs_y8), coeffs_y8, 1);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m256i tmp_0 = _mm256_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m256i tmp_1 = _mm256_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m256i coeff_01 = _mm256_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m256i coeff_23 = _mm256_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m256i coeff_45 = _mm256_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m256i coeff_67 = _mm256_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m256i round_const =
- _mm256_set1_epi32((1 << conv_params->round_1) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- // Filter even-index pixels
- const uint8_t *data = &im_block[i * im_stride + j];
- const __m256i src_01 = _mm256_unpacklo_epi8(
- _mm256_inserti128_si256(
- _mm256_castsi128_si256(
- _mm_loadl_epi64((__m128i *)(data + 0 * im_stride))),
- _mm_loadl_epi64((__m128i *)(data + 0 * im_stride + 8)), 1),
- _mm256_inserti128_si256(
- _mm256_castsi128_si256(
- _mm_loadl_epi64((__m128i *)(data + 1 * im_stride))),
- _mm_loadl_epi64((__m128i *)(data + 1 * im_stride + 8)), 1));
- const __m256i src_23 = _mm256_unpacklo_epi8(
- _mm256_inserti128_si256(
- _mm256_castsi128_si256(
- _mm_loadl_epi64((__m128i *)(data + 2 * im_stride))),
- _mm_loadl_epi64((__m128i *)(data + 2 * im_stride + 8)), 1),
- _mm256_inserti128_si256(
- _mm256_castsi128_si256(
- _mm_loadl_epi64((__m128i *)(data + 3 * im_stride))),
- _mm_loadl_epi64((__m128i *)(data + 3 * im_stride + 8)), 1));
- const __m256i src_45 = _mm256_unpacklo_epi8(
- _mm256_inserti128_si256(
- _mm256_castsi128_si256(
- _mm_loadl_epi64((__m128i *)(data + 4 * im_stride))),
- _mm_loadl_epi64((__m128i *)(data + 4 * im_stride + 8)), 1),
- _mm256_inserti128_si256(
- _mm256_castsi128_si256(
- _mm_loadl_epi64((__m128i *)(data + 5 * im_stride))),
- _mm_loadl_epi64((__m128i *)(data + 5 * im_stride + 8)), 1));
- const __m256i src_67 = _mm256_unpacklo_epi8(
- _mm256_inserti128_si256(
- _mm256_castsi128_si256(
- _mm_loadl_epi64((__m128i *)(data + 6 * im_stride))),
- _mm_loadl_epi64((__m128i *)(data + 6 * im_stride + 8)), 1),
- _mm256_inserti128_si256(
- _mm256_castsi128_si256(
- _mm_loadl_epi64((__m128i *)(data + 7 * im_stride))),
- _mm_loadl_epi64((__m128i *)(data + 7 * im_stride + 8)), 1));
-
- const __m256i src_0 = _mm256_unpacklo_epi8(src_01, zero);
- const __m256i src_2 = _mm256_unpacklo_epi8(src_23, zero);
- const __m256i src_4 = _mm256_unpacklo_epi8(src_45, zero);
- const __m256i src_6 = _mm256_unpacklo_epi8(src_67, zero);
-
- const __m256i res_0 = _mm256_madd_epi16(src_0, coeff_01);
- const __m256i res_2 = _mm256_madd_epi16(src_2, coeff_23);
- const __m256i res_4 = _mm256_madd_epi16(src_4, coeff_45);
- const __m256i res_6 = _mm256_madd_epi16(src_6, coeff_67);
-
- const __m256i res_even = _mm256_add_epi32(
- _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m256i src_1 = _mm256_unpackhi_epi8(src_01, zero);
- const __m256i src_3 = _mm256_unpackhi_epi8(src_23, zero);
- const __m256i src_5 = _mm256_unpackhi_epi8(src_45, zero);
- const __m256i src_7 = _mm256_unpackhi_epi8(src_67, zero);
-
- const __m256i res_1 = _mm256_madd_epi16(src_1, coeff_01);
- const __m256i res_3 = _mm256_madd_epi16(src_3, coeff_23);
- const __m256i res_5 = _mm256_madd_epi16(src_5, coeff_45);
- const __m256i res_7 = _mm256_madd_epi16(src_7, coeff_67);
-
- const __m256i res_odd = _mm256_add_epi32(
- _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
- const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
-
- const __m256i res_lo_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_lo, round_const), round_shift);
- const __m256i res_hi_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_hi, round_const), round_shift);
-
- // Accumulate values into the destination buffer
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- if (do_average) {
- _mm_storeu_si128(
- p + 0, _mm_add_epi32(_mm_loadu_si128(p + 0),
- _mm256_extractf128_si256(res_lo_round, 0)));
- _mm_storeu_si128(
- p + 1, _mm_add_epi32(_mm_loadu_si128(p + 1),
- _mm256_extractf128_si256(res_hi_round, 0)));
- if (w - j > 8) {
- _mm_storeu_si128(p + 2, _mm_add_epi32(_mm_loadu_si128(p + 2),
- _mm256_extractf128_si256(
- res_lo_round, 1)));
- _mm_storeu_si128(p + 3, _mm_add_epi32(_mm_loadu_si128(p + 3),
- _mm256_extractf128_si256(
- res_hi_round, 1)));
- }
- } else {
- _mm_storeu_si128(p + 0, _mm256_extractf128_si256(res_lo_round, 0));
- _mm_storeu_si128(p + 1, _mm256_extractf128_si256(res_hi_round, 0));
- if (w - j > 8) {
- _mm_storeu_si128(p + 2, _mm256_extractf128_si256(res_lo_round, 1));
- _mm_storeu_si128(p + 3, _mm256_extractf128_si256(res_hi_round, 1));
- }
- }
- }
- }
- }
-}
-#else
void av1_convolve_2d_avx2(const uint8_t *src, int src_stride,
CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
InterpFilterParams *filter_params_x,
@@ -459,4 +226,3 @@
}
}
}
-#endif
diff --git a/av1/common/x86/convolve_2d_sse2.c b/av1/common/x86/convolve_2d_sse2.c
index 13275b6..34b7dc7 100644
--- a/av1/common/x86/convolve_2d_sse2.c
+++ b/av1/common/x86/convolve_2d_sse2.c
@@ -17,185 +17,6 @@
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
-#if CONFIG_COMPOUND_ROUND
-void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- DECLARE_ALIGNED(16, uint8_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = MAX_SB_SIZE;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const int do_average = conv_params->do_average;
- const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
- const __m128i zero = _mm_setzero_si128();
-
- /* Horizontal filter */
- {
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const =
- _mm_set1_epi32((1 << conv_params->round_0) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
- for (i = 0; i < im_h; ++i) {
- for (j = 0; j < w; j += 8) {
- const __m128i data =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-
- // Filter even-index pixels
- const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
- _mm_add_epi32(res_2, res_6));
- res_even =
- _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
-
- // Filter odd-index pixels
- const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
- _mm_add_epi32(res_3, res_7));
- res_odd =
- _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
-
- // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
- __m128i res = _mm_packs_epi32(res_even, res_odd);
- res = _mm_packus_epi16(res, res);
- _mm_storel_epi64((__m128i *)&im_block[i * im_stride + j], res);
- }
- }
- }
-
- /* Vertical filter */
- {
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const =
- _mm_set1_epi32((1 << conv_params->round_1) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- // Filter even-index pixels
- const uint8_t *data = &im_block[i * im_stride + j];
- const __m128i src_01 = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 0 * im_stride)),
- _mm_loadl_epi64((__m128i *)(data + 1 * im_stride)));
- const __m128i src_23 = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 2 * im_stride)),
- _mm_loadl_epi64((__m128i *)(data + 3 * im_stride)));
- const __m128i src_45 = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 4 * im_stride)),
- _mm_loadl_epi64((__m128i *)(data + 5 * im_stride)));
- const __m128i src_67 = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 6 * im_stride)),
- _mm_loadl_epi64((__m128i *)(data + 7 * im_stride)));
-
- const __m128i src_0 = _mm_unpacklo_epi8(src_01, zero);
- const __m128i src_2 = _mm_unpacklo_epi8(src_23, zero);
- const __m128i src_4 = _mm_unpacklo_epi8(src_45, zero);
- const __m128i src_6 = _mm_unpacklo_epi8(src_67, zero);
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 = _mm_unpackhi_epi8(src_01, zero);
- const __m128i src_3 = _mm_unpackhi_epi8(src_23, zero);
- const __m128i src_5 = _mm_unpackhi_epi8(src_45, zero);
- const __m128i src_7 = _mm_unpackhi_epi8(src_67, zero);
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
- const __m128i res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- const __m128i res_hi_round =
- _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
- // Accumulate values into the destination buffer
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- if (do_average) {
- _mm_storeu_si128(p + 0,
- _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
- _mm_storeu_si128(p + 1,
- _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
- } else {
- _mm_storeu_si128(p + 0, res_lo_round);
- _mm_storeu_si128(p + 1, res_hi_round);
- }
- }
- }
- }
-}
-#else
void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
InterpFilterParams *filter_params_x,
@@ -698,4 +519,3 @@
}
}
#endif // CONFIG_JNT_COMP
-#endif // CONFIG_COMPOUND_ROUND
diff --git a/av1/common/x86/convolve_2d_sse4.c b/av1/common/x86/convolve_2d_sse4.c
index 71c32e7..ea0811a 100644
--- a/av1/common/x86/convolve_2d_sse4.c
+++ b/av1/common/x86/convolve_2d_sse4.c
@@ -19,221 +19,6 @@
#include "av1/common/convolve.h"
#if CONFIG_JNT_COMP
-#if CONFIG_COMPOUND_ROUND
-void av1_jnt_convolve_2d_sse4_1(const uint8_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- DECLARE_ALIGNED(16, uint8_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = MAX_SB_SIZE;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const int do_average = conv_params->do_average;
- const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
- const __m128i zero = _mm_setzero_si128();
-
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m128i wt0 = _mm_set_epi32(w0, w0, w0, w0);
- const __m128i wt1 = _mm_set_epi32(w1, w1, w1, w1);
- const int jnt_round_const = 1 << (DIST_PRECISION_BITS - 2);
- const __m128i jnt_r = _mm_set_epi32(jnt_round_const, jnt_round_const,
- jnt_round_const, jnt_round_const);
-
- /* Horizontal filter */
- {
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const =
- _mm_set1_epi32((1 << conv_params->round_0) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
- for (i = 0; i < im_h; ++i) {
- for (j = 0; j < w; j += 8) {
- const __m128i data =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-
- // Filter even-index pixels
- const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
- _mm_add_epi32(res_2, res_6));
- res_even =
- _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
-
- // Filter odd-index pixels
- const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
- _mm_add_epi32(res_3, res_7));
- res_odd =
- _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
-
- // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
- __m128i res = _mm_packs_epi32(res_even, res_odd);
- res = _mm_packus_epi16(res, res);
- _mm_storel_epi64((__m128i *)&im_block[i * im_stride + j], res);
- }
- }
- }
-
- /* Vertical filter */
- {
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const =
- _mm_set1_epi32((1 << conv_params->round_1) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- // Filter even-index pixels
- const uint8_t *data = &im_block[i * im_stride + j];
- const __m128i src_01 = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 0 * im_stride)),
- _mm_loadl_epi64((__m128i *)(data + 1 * im_stride)));
- const __m128i src_23 = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 2 * im_stride)),
- _mm_loadl_epi64((__m128i *)(data + 3 * im_stride)));
- const __m128i src_45 = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 4 * im_stride)),
- _mm_loadl_epi64((__m128i *)(data + 5 * im_stride)));
- const __m128i src_67 = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 6 * im_stride)),
- _mm_loadl_epi64((__m128i *)(data + 7 * im_stride)));
-
- const __m128i src_0 = _mm_unpacklo_epi8(src_01, zero);
- const __m128i src_2 = _mm_unpacklo_epi8(src_23, zero);
- const __m128i src_4 = _mm_unpacklo_epi8(src_45, zero);
- const __m128i src_6 = _mm_unpacklo_epi8(src_67, zero);
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 = _mm_unpackhi_epi8(src_01, zero);
- const __m128i src_3 = _mm_unpackhi_epi8(src_23, zero);
- const __m128i src_5 = _mm_unpackhi_epi8(src_45, zero);
- const __m128i src_7 = _mm_unpackhi_epi8(src_67, zero);
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
- const __m128i res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- const __m128i res_hi_round =
- _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
- if (conv_params->use_jnt_comp_avg) {
- // NOTE(chengchen):
- // only this part is different from av1_convolve_2d_sse2
- // original c function at: av1/common/convolve.c:
- // av1_convolve_2d_c() and av1_jnt_convolve_2d_c()
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- if (do_average) {
- _mm_storeu_si128(
- p + 0, _mm_srai_epi32(
- _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128(p + 0),
- _mm_mullo_epi32(
- res_lo_round, wt1)),
- jnt_r),
- DIST_PRECISION_BITS - 1));
-
- _mm_storeu_si128(
- p + 1, _mm_srai_epi32(
- _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128(p + 1),
- _mm_mullo_epi32(
- res_hi_round, wt1)),
- jnt_r),
- DIST_PRECISION_BITS - 1));
- } else {
- _mm_storeu_si128(p + 0, _mm_mullo_epi32(res_lo_round, wt0));
- _mm_storeu_si128(p + 1, _mm_mullo_epi32(res_hi_round, wt0));
- }
- } else {
- // Accumulate values into the destination buffer
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- if (do_average) {
- _mm_storeu_si128(
- p + 0, _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
- _mm_storeu_si128(
- p + 1, _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
- } else {
- _mm_storeu_si128(p + 0, res_lo_round);
- _mm_storeu_si128(p + 1, res_hi_round);
- }
- }
- }
- }
- }
-}
-#else // CONFIG_COMPOUND_ROUND
void av1_jnt_convolve_2d_sse4_1(const uint8_t *src, int src_stride,
CONV_BUF_TYPE *dst, int dst_stride, int w,
int h, InterpFilterParams *filter_params_x,
@@ -450,5 +235,4 @@
}
}
}
-#endif // CONFIG_COMPOUND_ROUND
#endif // CONFIG_JNT_COMP
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index a0e5871..93e7295 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -14,7 +14,6 @@
#include "aom_dsp/aom_dsp_common.h"
#include "./av1_rtcd.h"
-#if CONFIG_CONVOLVE_ROUND
static const uint32_t sindex[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
// 16 epi16 pixels
@@ -339,4 +338,3 @@
}
}
#endif // CONFIG_HIGHBITDEPTH
-#endif // CONFIG_CONVOLVE_ROUND
diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c
index f137ef0..08ee8c3 100644
--- a/av1/common/x86/convolve_sse2.c
+++ b/av1/common/x86/convolve_sse2.c
@@ -17,7 +17,6 @@
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
-#if !CONFIG_COMPOUND_ROUND
void av1_convolve_y_sse2(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
int dst_stride, int w, int h,
InterpFilterParams *filter_params_x,
@@ -207,4 +206,3 @@
}
}
}
-#endif
diff --git a/av1/common/x86/highbd_convolve_2d_avx2.c b/av1/common/x86/highbd_convolve_2d_avx2.c
index 7020763..c28c63d 100644
--- a/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -18,227 +18,6 @@
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
-#if CONFIG_COMPOUND_ROUND
-void av1_highbd_convolve_2d_avx2(const uint16_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- DECLARE_ALIGNED(32, int16_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = MAX_SB_SIZE;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const int do_average = conv_params->do_average;
- const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
- /* Horizontal filter */
- {
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- const __m128i coeffs_x8 = _mm_loadu_si128((__m128i *)x_filter);
- // since not all compilers yet support _mm256_set_m128i()
- const __m256i coeffs_x = _mm256_insertf128_si256(
- _mm256_castsi128_si256(coeffs_x8), coeffs_x8, 1);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m256i tmp_0 = _mm256_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs 4 5 4 5 6 7 6 7
- const __m256i tmp_1 = _mm256_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m256i coeff_01 = _mm256_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m256i coeff_23 = _mm256_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m256i coeff_45 = _mm256_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m256i coeff_67 = _mm256_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m256i round_const =
- _mm256_set1_epi32((1 << conv_params->round_0) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
- for (i = 0; i < im_h; ++i) {
- for (j = 0; j < w; j += 16) {
- const __m256i data =
- _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
- const __m128i data2_1 =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 16]);
- const __m256i data2 = _mm256_insertf128_si256(
- _mm256_castsi128_si256(data2_1), data2_1, 1);
-
- // Filter even-index pixels
- const __m256i res_0 = _mm256_madd_epi16(data, coeff_01);
- const __m256i res_2 = _mm256_madd_epi16(
- _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13),
- data, 4),
- coeff_23);
- const __m256i res_4 = _mm256_madd_epi16(
- _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13),
- data, 8),
- coeff_45);
- const __m256i res_6 = _mm256_madd_epi16(
- _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13),
- data, 12),
- coeff_67);
-
- __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_4),
- _mm256_add_epi32(res_2, res_6));
- res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const),
- round_shift);
-
- // Filter odd-index pixels
- const __m256i res_1 = _mm256_madd_epi16(
- _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13),
- data, 2),
- coeff_01);
- const __m256i res_3 = _mm256_madd_epi16(
- _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13),
- data, 6),
- coeff_23);
- const __m256i res_5 = _mm256_madd_epi16(
- _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13),
- data, 10),
- coeff_45);
- const __m256i res_7 = _mm256_madd_epi16(
- _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13),
- data, 14),
- coeff_67);
-
- __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_5),
- _mm256_add_epi32(res_3, res_7));
- res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const),
- round_shift);
-
- // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
- const __m256i maxval = _mm256_set1_epi16((1 << bd) - 1);
- __m256i res = _mm256_packs_epi32(res_even, res_odd);
- res = _mm256_max_epi16(_mm256_min_epi16(res, maxval),
- _mm256_setzero_si256());
- _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j],
- _mm256_extractf128_si256(res, 0));
- _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j + 8],
- _mm256_extractf128_si256(res, 1));
- }
- }
- }
-
- /* Vertical filter */
- {
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- const __m128i coeffs_y8 = _mm_loadu_si128((__m128i *)y_filter);
- const __m256i coeffs_y = _mm256_insertf128_si256(
- _mm256_castsi128_si256(coeffs_y8), coeffs_y8, 1);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m256i tmp_0 = _mm256_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m256i tmp_1 = _mm256_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m256i coeff_01 = _mm256_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m256i coeff_23 = _mm256_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m256i coeff_45 = _mm256_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m256i coeff_67 = _mm256_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m256i round_const =
- _mm256_set1_epi32((1 << conv_params->round_1) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- // Filter even-index pixels
- const int16_t *data = &im_block[i * im_stride + j];
- const __m256i src_0 =
- _mm256_unpacklo_epi16(*(__m256i *)(data + 0 * im_stride),
- *(__m256i *)(data + 1 * im_stride));
- const __m256i src_2 =
- _mm256_unpacklo_epi16(*(__m256i *)(data + 2 * im_stride),
- *(__m256i *)(data + 3 * im_stride));
- const __m256i src_4 =
- _mm256_unpacklo_epi16(*(__m256i *)(data + 4 * im_stride),
- *(__m256i *)(data + 5 * im_stride));
- const __m256i src_6 =
- _mm256_unpacklo_epi16(*(__m256i *)(data + 6 * im_stride),
- *(__m256i *)(data + 7 * im_stride));
-
- const __m256i res_0 = _mm256_madd_epi16(src_0, coeff_01);
- const __m256i res_2 = _mm256_madd_epi16(src_2, coeff_23);
- const __m256i res_4 = _mm256_madd_epi16(src_4, coeff_45);
- const __m256i res_6 = _mm256_madd_epi16(src_6, coeff_67);
-
- const __m256i res_even = _mm256_add_epi32(
- _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m256i src_1 =
- _mm256_unpackhi_epi16(*(__m256i *)(data + 0 * im_stride),
- *(__m256i *)(data + 1 * im_stride));
- const __m256i src_3 =
- _mm256_unpackhi_epi16(*(__m256i *)(data + 2 * im_stride),
- *(__m256i *)(data + 3 * im_stride));
- const __m256i src_5 =
- _mm256_unpackhi_epi16(*(__m256i *)(data + 4 * im_stride),
- *(__m256i *)(data + 5 * im_stride));
- const __m256i src_7 =
- _mm256_unpackhi_epi16(*(__m256i *)(data + 6 * im_stride),
- *(__m256i *)(data + 7 * im_stride));
-
- const __m256i res_1 = _mm256_madd_epi16(src_1, coeff_01);
- const __m256i res_3 = _mm256_madd_epi16(src_3, coeff_23);
- const __m256i res_5 = _mm256_madd_epi16(src_5, coeff_45);
- const __m256i res_7 = _mm256_madd_epi16(src_7, coeff_67);
-
- const __m256i res_odd = _mm256_add_epi32(
- _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
- const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
-
- const __m256i res_lo_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_lo, round_const), round_shift);
- const __m256i res_hi_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_hi, round_const), round_shift);
-
- // Accumulate values into the destination buffer
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- if (do_average) {
- _mm_storeu_si128(
- p + 0, _mm_add_epi32(_mm_loadu_si128(p + 0),
- _mm256_extractf128_si256(res_lo_round, 0)));
- _mm_storeu_si128(
- p + 1, _mm_add_epi32(_mm_loadu_si128(p + 1),
- _mm256_extractf128_si256(res_hi_round, 0)));
- if (w - j > 8) {
- _mm_storeu_si128(p + 2, _mm_add_epi32(_mm_loadu_si128(p + 2),
- _mm256_extractf128_si256(
- res_lo_round, 1)));
- _mm_storeu_si128(p + 3, _mm_add_epi32(_mm_loadu_si128(p + 3),
- _mm256_extractf128_si256(
- res_hi_round, 1)));
- }
- } else {
- _mm_storeu_si128(p + 0, _mm256_extractf128_si256(res_lo_round, 0));
- _mm_storeu_si128(p + 1, _mm256_extractf128_si256(res_hi_round, 0));
- if (w - j > 8) {
- _mm_storeu_si128(p + 2, _mm256_extractf128_si256(res_lo_round, 1));
- _mm_storeu_si128(p + 3, _mm256_extractf128_si256(res_hi_round, 1));
- }
- }
- }
- }
- }
-}
-#else
void av1_highbd_convolve_2d_avx2(const uint16_t *src, int src_stride,
CONV_BUF_TYPE *dst, int dst_stride, int w,
int h, InterpFilterParams *filter_params_x,
@@ -458,4 +237,3 @@
}
}
}
-#endif
diff --git a/av1/common/x86/highbd_convolve_2d_ssse3.c b/av1/common/x86/highbd_convolve_2d_ssse3.c
index 195f0f5..95055b0 100644
--- a/av1/common/x86/highbd_convolve_2d_ssse3.c
+++ b/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -18,188 +18,6 @@
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
-#if CONFIG_COMPOUND_ROUND
-void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- DECLARE_ALIGNED(16, int16_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = MAX_SB_SIZE;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const int do_average = conv_params->do_average;
- const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
- /* Horizontal filter */
- {
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const =
- _mm_set1_epi32((1 << conv_params->round_0) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
- for (i = 0; i < im_h; ++i) {
- for (j = 0; j < w; j += 8) {
- const __m128i data =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
- const __m128i data2 =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
-
- // Filter even-index pixels
- const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
- const __m128i res_2 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
- const __m128i res_4 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
- const __m128i res_6 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
- _mm_add_epi32(res_2, res_6));
- res_even =
- _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
-
- // Filter odd-index pixels
- const __m128i res_1 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
- const __m128i res_3 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
- const __m128i res_5 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
- const __m128i res_7 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
- _mm_add_epi32(res_3, res_7));
- res_odd =
- _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
-
- // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
- const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
- __m128i res = _mm_packs_epi32(res_even, res_odd);
- res = _mm_max_epi16(_mm_min_epi16(res, maxval), _mm_setzero_si128());
- _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
- }
- }
- }
-
- /* Vertical filter */
- {
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const =
- _mm_set1_epi32((1 << conv_params->round_1) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- // Filter even-index pixels
- const int16_t *data = &im_block[i * im_stride + j];
- const __m128i src_0 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
- *(__m128i *)(data + 1 * im_stride));
- const __m128i src_2 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
- *(__m128i *)(data + 3 * im_stride));
- const __m128i src_4 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
- *(__m128i *)(data + 5 * im_stride));
- const __m128i src_6 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
- *(__m128i *)(data + 7 * im_stride));
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
- *(__m128i *)(data + 1 * im_stride));
- const __m128i src_3 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
- *(__m128i *)(data + 3 * im_stride));
- const __m128i src_5 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
- *(__m128i *)(data + 5 * im_stride));
- const __m128i src_7 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
- *(__m128i *)(data + 7 * im_stride));
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
- const __m128i res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- const __m128i res_hi_round =
- _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
- // Accumulate values into the destination buffer
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- if (do_average) {
- _mm_storeu_si128(p + 0,
- _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
- _mm_storeu_si128(p + 1,
- _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
- } else {
- _mm_storeu_si128(p + 0, res_lo_round);
- _mm_storeu_si128(p + 1, res_hi_round);
- }
- }
- }
- }
-}
-#else
void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride,
CONV_BUF_TYPE *dst, int dst_stride, int w,
int h, InterpFilterParams *filter_params_x,
@@ -383,4 +201,3 @@
}
}
}
-#endif
diff --git a/av1/common/x86/highbd_warp_plane_sse4.c b/av1/common/x86/highbd_warp_plane_sse4.c
index 7c358ec..d40a9696 100644
--- a/av1/common/x86/highbd_warp_plane_sse4.c
+++ b/av1/common/x86/highbd_warp_plane_sse4.c
@@ -28,7 +28,6 @@
#error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter"
#endif
int i, j, k;
-#if CONFIG_CONVOLVE_ROUND
const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
const int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
@@ -46,10 +45,6 @@
const int jnt_round_const = 1 << (DIST_PRECISION_BITS - 2);
const __m128i jnt_r = _mm_set1_epi32(jnt_round_const);
#endif // CONFIG_JNT_COMP
-#else
- const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
- const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
-#endif
/* Note: For this code to work, the left/right frame borders need to be
extended by at least 13 pixels each. By the time we get here, other
@@ -310,7 +305,6 @@
__m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
__m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-#if CONFIG_CONVOLVE_ROUND
if (use_conv_params) {
__m128i *const p =
(__m128i *)&conv_params
@@ -369,9 +363,6 @@
#endif
}
} else {
-#else
- {
-#endif
// Round and pack into 8 bits
const __m128i round_const =
_mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
diff --git a/av1/common/x86/highbd_warp_plane_ssse3.c b/av1/common/x86/highbd_warp_plane_ssse3.c
index 71b0ec7..5eedf9a 100644
--- a/av1/common/x86/highbd_warp_plane_ssse3.c
+++ b/av1/common/x86/highbd_warp_plane_ssse3.c
@@ -28,7 +28,6 @@
#error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter"
#endif
int i, j, k;
-#if CONFIG_CONVOLVE_ROUND
const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
const int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
@@ -38,10 +37,6 @@
conv_params->do_post_rounding = 1;
}
assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
-#else
- const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
- const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
-#endif
/* Note: For this code to work, the left/right frame borders need to be
extended by at least 13 pixels each. By the time we get here, other
@@ -302,7 +297,6 @@
__m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
__m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-#if CONFIG_CONVOLVE_ROUND
if (use_conv_params) {
__m128i *const p =
(__m128i *)&conv_params
@@ -324,9 +318,6 @@
_mm_storeu_si128(p + 1, res_hi);
}
} else {
-#else
- {
-#endif
// Round and pack into 8 bits
const __m128i round_const =
_mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
index d30466a..6505d9a 100644
--- a/av1/common/x86/warp_plane_sse2.c
+++ b/av1/common/x86/warp_plane_sse2.c
@@ -24,7 +24,6 @@
__m128i tmp[15];
int i, j, k;
const int bd = 8;
-#if CONFIG_CONVOLVE_ROUND
const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
const int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
@@ -34,10 +33,6 @@
conv_params->do_post_rounding = 1;
}
assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
-#else
- const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
- const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
-#endif
/* Note: For this code to work, the left/right frame borders need to be
extended by at least 13 pixels each. By the time we get here, other
@@ -298,7 +293,6 @@
__m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
__m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-#if CONFIG_CONVOLVE_ROUND
if (use_conv_params) {
__m128i *const p =
(__m128i *)&conv_params
@@ -320,9 +314,6 @@
_mm_storeu_si128(p + 1, res_hi);
}
} else {
-#else
- {
-#endif
// Round and pack into 8 bits
const __m128i round_const =
_mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
diff --git a/av1/common/x86/warp_plane_sse4.c b/av1/common/x86/warp_plane_sse4.c
index 6385c17..e0d6206 100644
--- a/av1/common/x86/warp_plane_sse4.c
+++ b/av1/common/x86/warp_plane_sse4.c
@@ -25,7 +25,6 @@
__m128i tmp[15];
int i, j, k;
const int bd = 8;
-#if CONFIG_CONVOLVE_ROUND
const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
const int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
@@ -43,10 +42,6 @@
const int jnt_round_const = 1 << (DIST_PRECISION_BITS - 2);
const __m128i jnt_r = _mm_set1_epi32(jnt_round_const);
#endif // CONFIG_JNT_COMP
-#else
- const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
- const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
-#endif
/* Note: For this code to work, the left/right frame borders need to be
extended by at least 13 pixels each. By the time we get here, other
@@ -307,7 +302,6 @@
__m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
__m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-#if CONFIG_CONVOLVE_ROUND
if (use_conv_params) {
__m128i *const p =
(__m128i *)&conv_params
@@ -364,9 +358,6 @@
#endif // CONFIG_JNT_COMP
}
} else {
-#else
- {
-#endif
// Round and pack into 8 bits
const __m128i round_const =
_mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
diff --git a/av1/common/x86/warp_plane_ssse3.c b/av1/common/x86/warp_plane_ssse3.c
index 3986ad3..7bf3253 100644
--- a/av1/common/x86/warp_plane_ssse3.c
+++ b/av1/common/x86/warp_plane_ssse3.c
@@ -211,7 +211,6 @@
__m128i tmp[15];
int i, j, k;
const int bd = 8;
-#if CONFIG_CONVOLVE_ROUND
const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
const int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
@@ -221,10 +220,6 @@
conv_params->do_post_rounding = 1;
}
assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
-#else
- const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
- const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
-#endif
/* Note: For this code to work, the left/right frame borders need to be
extended by at least 13 pixels each. By the time we get here, other
@@ -474,7 +469,6 @@
__m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
__m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-#if CONFIG_CONVOLVE_ROUND
if (use_conv_params) {
__m128i *const p =
(__m128i *)&conv_params
@@ -496,9 +490,6 @@
_mm_storeu_si128(p + 1, res_hi);
}
} else {
-#else
- {
-#endif
// Round and pack into 8 bits
const __m128i round_const =
_mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 43d9e71..167c203 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -114,8 +114,6 @@
set(CONFIG_CDEF_SINGLEPASS 1 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_CFL 1 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_COLORSPACE_HEADERS 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_COMPOUND_ROUND 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_CONVOLVE_ROUND 1 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_DAALA_TX 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_DAALA_TX16 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_DAALA_TX32 0 CACHE NUMBER "AV1 experiment flag.")
diff --git a/build/cmake/aom_experiment_deps.cmake b/build/cmake/aom_experiment_deps.cmake
index 1a7e563..0555c63 100644
--- a/build/cmake/aom_experiment_deps.cmake
+++ b/build/cmake/aom_experiment_deps.cmake
@@ -32,12 +32,6 @@
endif ()
endif ()
- if (CONFIG_COMPOUND_ROUND)
- if (NOT CONFIG_CONVOLVE_ROUND)
- change_config_and_warn(CONVOLVE_ROUND 1 CONFIG_COMPOUND_ROUND)
- endif ()
- endif ()
-
if (CONFIG_EOB_FIRST)
if (NOT CONFIG_LV_MAP)
change_config_and_warn(CONFIG_LV_MAP 1 CONFIG_EOB_FIRST)
diff --git a/configure b/configure
index a13efcf..2c119c4 100755
--- a/configure
+++ b/configure
@@ -252,8 +252,6 @@
rect_tx_ext_intra
short_filter
dual_filter
- convolve_round
- compound_round
tx64x64
ext_intra
filter_intra
@@ -496,7 +494,6 @@
soft_enable intra_edge
soft_enable mv_compress
soft_enable dual_filter
- soft_enable convolve_round
soft_enable aom_qm
soft_enable dist_8x8
soft_enable loop_restoration
@@ -540,7 +537,6 @@
enabled lv_map_multi && soft_enable lv_map
enabled eob_first && enable_feature lv_map
enabled txk_sel && soft_enable lv_map
- enabled compound_round && soft_enable convolve_round
enabled ext_intra_mod && enable_feature intra_edge
enabled intra_edge && enable_feature ext_intra
enabled mfmv && enable_feature frame_marker
diff --git a/test/test.cmake b/test/test.cmake
index a0cce48..e24f7e8 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -237,24 +237,21 @@
"${AOM_ROOT}/test/quantize_func_test.cc")
endif ()
- if (CONFIG_CONVOLVE_ROUND)
+ set(AOM_UNIT_TEST_ENCODER_SOURCES
+ ${AOM_UNIT_TEST_ENCODER_SOURCES}
+ "${AOM_ROOT}/test/convolve_round_test.cc")
+ if (HAVE_SSE2)
set(AOM_UNIT_TEST_ENCODER_SOURCES
${AOM_UNIT_TEST_ENCODER_SOURCES}
- "${AOM_ROOT}/test/convolve_round_test.cc")
- if (HAVE_SSE2)
- set(AOM_UNIT_TEST_ENCODER_SOURCES
- ${AOM_UNIT_TEST_ENCODER_SOURCES}
- "${AOM_ROOT}/test/av1_convolve_2d_test.cc"
- "${AOM_ROOT}/test/av1_convolve_2d_test_util.cc"
- "${AOM_ROOT}/test/av1_convolve_2d_test_util.h")
- endif ()
- if (NOT CONFIG_COMPOUND_ROUND)
- if (HAVE_SSE4_1)
- set(AOM_UNIT_TEST_ENCODER_SOURCES
- ${AOM_UNIT_TEST_ENCODER_SOURCES}
- "${AOM_ROOT}/test/av1_convolve_scale_test.cc")
- endif ()
- endif ()
+ "${AOM_ROOT}/test/av1_convolve_2d_test.cc"
+ "${AOM_ROOT}/test/av1_convolve_2d_test_util.cc"
+ "${AOM_ROOT}/test/av1_convolve_2d_test_util.h")
+ endif ()
+
+ if (HAVE_SSE4_1)
+ set(AOM_UNIT_TEST_ENCODER_SOURCES
+ ${AOM_UNIT_TEST_ENCODER_SOURCES}
+ "${AOM_ROOT}/test/av1_convolve_scale_test.cc")
endif ()
set(AOM_UNIT_TEST_ENCODER_SOURCES
diff --git a/test/test.mk b/test/test.mk
index 2389a2f..2a7b9ae 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -227,16 +227,11 @@
LIBAOM_TEST_SRCS-$(HAVE_SSE2) += hiprec_convolve_test_util.cc
LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += selfguided_filter_test.cc
endif
-ifeq ($(CONFIG_CONVOLVE_ROUND),yes)
LIBAOM_TEST_SRCS-$(HAVE_SSE2) += av1_convolve_2d_test_util.h
LIBAOM_TEST_SRCS-$(HAVE_SSE2) += av1_convolve_2d_test.cc
LIBAOM_TEST_SRCS-$(HAVE_SSE2) += av1_convolve_2d_test_util.cc
LIBAOM_TEST_SRCS-yes += convolve_round_test.cc
-endif
-
-ifeq (yesx,$(CONFIG_CONVOLVE_ROUND)x$(CONFIG_COMPOUND_ROUND))
LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += av1_convolve_scale_test.cc
-endif
ifeq ($(CONFIG_AV1_ENCODER),yes)
LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += corner_match_test.cc
diff --git a/test/warp_filter_test.cc b/test/warp_filter_test.cc
index ea052f8..c6fc23b 100644
--- a/test/warp_filter_test.cc
+++ b/test/warp_filter_test.cc
@@ -22,7 +22,7 @@
namespace {
-#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND && HAVE_SSE4_1
+#if CONFIG_JNT_COMP && HAVE_SSE4_1
TEST_P(AV1WarpFilterTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); }
INSTANTIATE_TEST_CASE_P(
@@ -38,7 +38,7 @@
libaom_test::AV1HighbdWarpFilter::GetDefaultParams());
#endif
-#else // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND && HAVE_SSE4_1
+#else // CONFIG_JNT_COMP && HAVE_SSE4_1
TEST_P(AV1WarpFilterTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); }
INSTANTIATE_TEST_CASE_P(
diff --git a/test/warp_filter_test_util.cc b/test/warp_filter_test_util.cc
index c815bf6..72b35ba 100644
--- a/test/warp_filter_test_util.cc
+++ b/test/warp_filter_test_util.cc
@@ -113,10 +113,8 @@
int32_t mat[8];
int16_t alpha, beta, gamma, delta;
ConvolveParams conv_params = get_conv_params(0, 0, 0);
-#if CONFIG_CONVOLVE_ROUND
int32_t *dsta = new int32_t[output_n];
int32_t *dstb = new int32_t[output_n];
-#endif
for (i = 0; i < num_iters; ++i) {
// Generate an input block and extend its borders horizontally
@@ -126,17 +124,15 @@
memset(input + r * stride - border, input[r * stride], border);
memset(input + r * stride + w, input[r * stride + (w - 1)], border);
}
-#if CONFIG_CONVOLVE_ROUND
+
const int use_no_round = rnd_.Rand8() & 1;
-#endif
for (sub_x = 0; sub_x < 2; ++sub_x)
for (sub_y = 0; sub_y < 2; ++sub_y) {
generate_model(mat, &alpha, &beta, &gamma, &delta);
-#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND
+#if CONFIG_JNT_COMP
for (int ii = 0; ii < 2; ++ii) {
for (int jj = 0; jj < 5; ++jj) {
-#endif // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND
-#if CONFIG_CONVOLVE_ROUND
+#endif // CONFIG_JNT_COMP
if (use_no_round) {
// Prepare two copies of the destination
for (j = 0; j < out_w * out_h; ++j) {
@@ -148,8 +144,7 @@
} else {
conv_params = get_conv_params(0, 0, 0);
}
-#endif
-#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND
+#if CONFIG_JNT_COMP
if (jj >= 4) {
conv_params.use_jnt_comp_avg = 0;
} else {
@@ -157,17 +152,15 @@
conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
}
-#endif // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND
+#endif // CONFIG_JNT_COMP
av1_warp_affine_c(mat, input, w, h, stride, output, 32, 32, out_w,
out_h, out_w, sub_x, sub_y, &conv_params, alpha,
beta, gamma, delta);
-#if CONFIG_CONVOLVE_ROUND
if (use_no_round) {
conv_params = get_conv_params_no_round(0, 0, 0, dstb, out_w);
}
-#endif
-#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND
+#if CONFIG_JNT_COMP
if (jj >= 4) {
conv_params.use_jnt_comp_avg = 0;
} else {
@@ -175,12 +168,11 @@
conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
}
-#endif // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND
+#endif // CONFIG_JNT_COMP
test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
out_w, sub_x, sub_y, &conv_params, alpha, beta, gamma,
delta);
-#if CONFIG_CONVOLVE_ROUND
if (use_no_round) {
for (j = 0; j < out_w * out_h; ++j)
ASSERT_EQ(dsta[j], dstb[j])
@@ -192,25 +184,17 @@
<< "Pixel mismatch at index " << j << " = (" << (j % out_w)
<< ", " << (j / out_w) << ") on iteration " << i;
}
-#else
- for (j = 0; j < out_w * out_h; ++j)
- ASSERT_EQ(output[j], output2[j])
- << "Pixel mismatch at index " << j << " = (" << (j % out_w)
- << ", " << (j / out_w) << ") on iteration " << i;
-#endif
-#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND
+#if CONFIG_JNT_COMP
}
}
-#endif // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND
+#endif // CONFIG_JNT_COMP
}
}
delete[] input_;
delete[] output;
delete[] output2;
-#if CONFIG_CONVOLVE_ROUND
delete[] dsta;
delete[] dstb;
-#endif
}
} // namespace AV1WarpFilter
@@ -320,10 +304,8 @@
int32_t mat[8];
int16_t alpha, beta, gamma, delta;
ConvolveParams conv_params = get_conv_params(0, 0, 0);
-#if CONFIG_CONVOLVE_ROUND
int32_t *dsta = new int32_t[output_n];
int32_t *dstb = new int32_t[output_n];
-#endif
for (i = 0; i < num_iters; ++i) {
// Generate an input block and extend its borders horizontally
@@ -335,17 +317,14 @@
input[r * stride + w + c] = input[r * stride + (w - 1)];
}
}
-#if CONFIG_CONVOLVE_ROUND
const int use_no_round = rnd_.Rand8() & 1;
-#endif
for (sub_x = 0; sub_x < 2; ++sub_x)
for (sub_y = 0; sub_y < 2; ++sub_y) {
generate_model(mat, &alpha, &beta, &gamma, &delta);
-#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND
+#if CONFIG_JNT_COMP
for (int ii = 0; ii < 2; ++ii) {
for (int jj = 0; jj < 5; ++jj) {
-#endif // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND
-#if CONFIG_CONVOLVE_ROUND
+#endif // CONFIG_JNT_COMP
if (use_no_round) {
// Prepare two copies of the destination
for (j = 0; j < out_w * out_h; ++j) {
@@ -357,8 +336,7 @@
} else {
conv_params = get_conv_params(0, 0, 0);
}
-#endif
-#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND
+#if CONFIG_JNT_COMP
if (jj >= 4) {
conv_params.use_jnt_comp_avg = 0;
} else {
@@ -366,18 +344,16 @@
conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
}
-#endif // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND
+#endif // CONFIG_JNT_COMP
av1_highbd_warp_affine_c(mat, input, w, h, stride, output, 32, 32,
out_w, out_h, out_w, sub_x, sub_y, bd,
&conv_params, alpha, beta, gamma, delta);
-#if CONFIG_CONVOLVE_ROUND
if (use_no_round) {
// TODO(angiebird): Change this to test_impl once we have SIMD
// implementation
conv_params = get_conv_params_no_round(0, 0, 0, dstb, out_w);
}
-#endif
-#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND
+#if CONFIG_JNT_COMP
if (jj >= 4) {
conv_params.use_jnt_comp_avg = 0;
} else {
@@ -385,12 +361,11 @@
conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
}
-#endif // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND
+#endif // CONFIG_JNT_COMP
test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
out_w, sub_x, sub_y, bd, &conv_params, alpha, beta, gamma,
delta);
-#if CONFIG_CONVOLVE_ROUND
if (use_no_round) {
for (j = 0; j < out_w * out_h; ++j)
ASSERT_EQ(dsta[j], dstb[j])
@@ -402,26 +377,18 @@
<< "Pixel mismatch at index " << j << " = (" << (j % out_w)
<< ", " << (j / out_w) << ") on iteration " << i;
}
-#else
- for (j = 0; j < out_w * out_h; ++j)
- ASSERT_EQ(output[j], output2[j])
- << "Pixel mismatch at index " << j << " = (" << (j % out_w)
- << ", " << (j / out_w) << ") on iteration " << i;
-#endif
-#if CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND
+#if CONFIG_JNT_COMP
}
}
-#endif // CONFIG_JNT_COMP && CONFIG_CONVOLVE_ROUND
+#endif // CONFIG_JNT_COMP
}
}
delete[] input_;
delete[] output;
delete[] output2;
-#if CONFIG_CONVOLVE_ROUND
delete[] dsta;
delete[] dstb;
-#endif
}
} // namespace AV1HighbdWarpFilter
#endif // CONFIG_HIGHBITDEPTH