Remove CONFIG_LOWPRECISION_BLEND
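The low-precision blend tool is fully adopted, so the CONFIG_LOWPRECISION_BLEND flag and the code paths that were compiled only when it was disabled are removed.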
Change-Id: Id349a3aac5bf125a23d5961c2baacbe3f585eb73
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 9b1a99c..dc81bda 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -473,12 +473,8 @@
#
# Alpha blending with mask
#
- if (aom_config("CONFIG_LOWPRECISION_BLEND") eq "yes") {
- add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, ConvolveParams *conv_params";
- add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, ConvolveParams *conv_params, const int bd";
- } else {
- add_proto qw/void aom_blend_a64_d32_mask/, "int32_t *dst, uint32_t dst_stride, const int32_t *src0, uint32_t src0_stride, const int32_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
- }
+ add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, ConvolveParams *conv_params";
+ add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, ConvolveParams *conv_params, const int bd";
add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
diff --git a/aom_dsp/blend_a64_mask.c b/aom_dsp/blend_a64_mask.c
index 559f44d..99b0d20 100644
--- a/aom_dsp/blend_a64_mask.c
+++ b/aom_dsp/blend_a64_mask.c
@@ -33,7 +33,6 @@
// In contrast, the output of the non-d32 functions will not be further rounded,
// so we *should* use ROUND_POWER_OF_TWO there.
-#if CONFIG_LOWPRECISION_BLEND
void aom_lowbd_blend_a64_d16_mask(uint8_t *dst, uint32_t dst_stride,
const CONV_BUF_TYPE *src0,
uint32_t src0_stride,
@@ -209,72 +208,6 @@
}
}
}
-#else // CONFIG_LOWPRECISION_BLEND
-void aom_blend_a64_d32_mask_c(int32_t *dst, uint32_t dst_stride,
- const int32_t *src0, uint32_t src0_stride,
- const int32_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h,
- int w, int subh, int subw) {
- int i, j;
-
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 1);
- assert(w >= 1);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- if (subw == 0 && subh == 0) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- const int m = mask[i * mask_stride + j];
- dst[i * dst_stride + j] =
- ((m * src0[i * src0_stride + j] +
- (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i * src1_stride + j]) >>
- AOM_BLEND_A64_ROUND_BITS);
- }
- }
- } else if (subw == 1 && subh == 1) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- const int m = ROUND_POWER_OF_TWO(
- mask[(2 * i) * mask_stride + (2 * j)] +
- mask[(2 * i + 1) * mask_stride + (2 * j)] +
- mask[(2 * i) * mask_stride + (2 * j + 1)] +
- mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
- 2);
- dst[i * dst_stride + j] =
- ((m * src0[i * src0_stride + j] +
- (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i * src1_stride + j]) >>
- AOM_BLEND_A64_ROUND_BITS);
- }
- }
- } else if (subw == 1 && subh == 0) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
- mask[i * mask_stride + (2 * j + 1)]);
- dst[i * dst_stride + j] =
- ((m * src0[i * src0_stride + j] +
- (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i * src1_stride + j]) >>
- AOM_BLEND_A64_ROUND_BITS);
- }
- }
- } else {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
- mask[(2 * i + 1) * mask_stride + j]);
- dst[i * dst_stride + j] =
- ((m * src0[i * src0_stride + j] +
- (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i * src1_stride + j]) >>
- AOM_BLEND_A64_ROUND_BITS);
- }
- }
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
// Blending with alpha mask. Mask values come from the range [0, 64],
// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
diff --git a/aom_dsp/x86/convolve_avx2.h b/aom_dsp/x86/convolve_avx2.h
index 5190c99..7790baf 100644
--- a/aom_dsp/x86/convolve_avx2.h
+++ b/aom_dsp/x86/convolve_avx2.h
@@ -134,7 +134,6 @@
_mm256_store_si256((__m256i *)dst, d);
}
-#if CONFIG_LOWPRECISION_BLEND
static INLINE __m256i comp_avg(const __m256i *const data_ref_0,
const __m256i *const res_unsigned,
const __m256i *const wt,
@@ -195,23 +194,5 @@
return res_round;
}
-#else
-static INLINE void mult_add_store_aligned_256(CONV_BUF_TYPE *const dst,
- const __m256i *const res,
- const __m256i *const wt0,
- const __m256i *const wt1,
- const int do_average) {
- __m256i d;
- if (do_average) {
- d = _mm256_load_si256((__m256i *)dst);
- d = _mm256_add_epi32(_mm256_mullo_epi32(d, *wt0),
- _mm256_mullo_epi32(*res, *wt1));
- d = _mm256_srai_epi32(d, DIST_PRECISION_BITS);
- } else {
- d = *res;
- }
- _mm256_store_si256((__m256i *)dst, d);
-}
-#endif
#endif
diff --git a/aom_dsp/x86/convolve_sse2.h b/aom_dsp/x86/convolve_sse2.h
index 9790580..846fe7b 100644
--- a/aom_dsp/x86/convolve_sse2.h
+++ b/aom_dsp/x86/convolve_sse2.h
@@ -75,7 +75,6 @@
return convolve(ss, coeffs);
}
-#if CONFIG_LOWPRECISION_BLEND
static INLINE __m128i comp_avg(const __m128i *const data_ref_0,
const __m128i *const res_unsigned,
const __m128i *const wt,
@@ -120,5 +119,3 @@
}
#endif
-
-#endif
diff --git a/aom_dsp/x86/convolve_sse4_1.h b/aom_dsp/x86/convolve_sse4_1.h
index 4b5a9b8..d48c256 100644
--- a/aom_dsp/x86/convolve_sse4_1.h
+++ b/aom_dsp/x86/convolve_sse4_1.h
@@ -31,7 +31,6 @@
_mm_store_si128((__m128i *)dst, d);
}
-#if CONFIG_LOWPRECISION_BLEND
static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0,
const __m128i *const res_unsigned,
const __m128i *const wt0,
@@ -50,6 +49,5 @@
}
return res;
}
-#endif
#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
diff --git a/av1/av1.cmake b/av1/av1.cmake
index fdad64a..818c5fb 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -286,20 +286,14 @@
${AOM_AV1_COMMON_INTRIN_SSE4_1}
"${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_sse4.c")
-if (CONFIG_LOWPRECISION_BLEND)
- set(AOM_AV1_COMMON_INTRIN_SSE2
- ${AOM_AV1_COMMON_INTRIN_SSE2}
- "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c")
+set(AOM_AV1_COMMON_INTRIN_SSE2
+ ${AOM_AV1_COMMON_INTRIN_SSE2}
+ "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c")
- set(AOM_AV1_COMMON_INTRIN_SSSE3
- ${AOM_AV1_COMMON_INTRIN_SSSE3}
- "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c")
-else()
+set(AOM_AV1_COMMON_INTRIN_SSSE3
+ ${AOM_AV1_COMMON_INTRIN_SSSE3}
+ "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c")
- set(AOM_AV1_COMMON_INTRIN_SSE4_1
- ${AOM_AV1_COMMON_INTRIN_SSE4_1}
- "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse4.c")
-endif()
set(AOM_AV1_COMMON_INTRIN_AVX2
${AOM_AV1_COMMON_INTRIN_AVX2}
"${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c")
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 1960169..cd4ac0a 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -307,7 +307,6 @@
add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-if (aom_config("CONFIG_LOWPRECISION_BLEND") eq "yes") {
add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
specialize qw/av1_convolve_2d_sr sse2 avx2/;
@@ -328,22 +327,6 @@
specialize qw/av1_highbd_jnt_convolve_x sse4_1 avx2/;
specialize qw/av1_highbd_jnt_convolve_y sse4_1 avx2/;
specialize qw/av1_highbd_jnt_convolve_2d_copy sse4_1 avx2/;
-}
-else
-{
- add_proto qw/void av1_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits";
- add_proto qw/void av1_highbd_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd";
- add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
- add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
- add_proto qw/void av1_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
- add_proto qw/void av1_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
- add_proto qw/void av1_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
- add_proto qw/void av1_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
- add_proto qw/void av1_highbd_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
- add_proto qw/void av1_highbd_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
- add_proto qw/void av1_highbd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
- add_proto qw/void av1_highbd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-}
# INTRA_EDGE functions
add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength";
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 855779b..df6eb2c 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -76,190 +76,6 @@
}
}
-#if !CONFIG_LOWPRECISION_BLEND
-void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h, int bits) {
- for (int r = 0; r < h; ++r) {
- for (int c = 0; c < w; ++c) {
- dst[r * dst_stride + c] =
- clip_pixel(ROUND_POWER_OF_TWO(src[r * src_stride + c], bits));
- }
- }
-}
-
-/* Note: For notes on hardware implementations, including the required
- bit widths for various intermediate values, see the comments above
- av1_warp_affine_c.
-*/
-void av1_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = w;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const int bd = 8;
- (void)dst0;
- (void)dst_stride0;
-
- // horizontal filter
- const uint8_t *src_horiz = src - fo_vert * src_stride;
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- for (int y = 0; y < im_h; ++y) {
- for (int x = 0; x < w; ++x) {
- int32_t sum = (1 << (bd + FILTER_BITS - 1));
- for (int k = 0; k < filter_params_x->taps; ++k) {
- sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
- }
- assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
- im_block[y * im_stride + x] =
- (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
- }
- }
-
- // vertical filter
- int16_t *src_vert = im_block + fo_vert * im_stride;
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- for (int y = 0; y < h; ++y) {
- for (int x = 0; x < w; ++x) {
- CONV_BUF_TYPE sum = 1 << offset_bits;
- for (int k = 0; k < filter_params_y->taps; ++k) {
- sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
- }
- assert(0 <= sum && sum < (1 << (offset_bits + 2)));
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
- ((1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1)));
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
- }
-}
-
-void av1_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int bits = FILTER_BITS - conv_params->round_0;
- (void)filter_params_x;
- (void)subpel_x_q4;
- (void)dst0;
- (void)dst_stride0;
-
- assert(bits >= 0);
-
- // vertical filter
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- for (int y = 0; y < h; ++y) {
- for (int x = 0; x < w; ++x) {
- CONV_BUF_TYPE res = 0;
- for (int k = 0; k < filter_params_y->taps; ++k) {
- res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
- }
- res *= (1 << bits);
- res = ROUND_POWER_OF_TWO(res, conv_params->round_1);
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
- }
-}
-
-void av1_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const int bits = FILTER_BITS - conv_params->round_1;
- (void)filter_params_y;
- (void)subpel_y_q4;
- (void)dst0;
- (void)dst_stride0;
-
- assert(bits >= 0);
-
- // horizontal filter
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- for (int y = 0; y < h; ++y) {
- for (int x = 0; x < w; ++x) {
- CONV_BUF_TYPE res = 0;
- for (int k = 0; k < filter_params_x->taps; ++k) {
- res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
- }
- res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
- }
-}
-
-void av1_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- const int bits =
- FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
-
- (void)filter_params_x;
- (void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
- (void)dst0;
- (void)dst_stride0;
-
- for (int y = 0; y < h; ++y) {
- for (int x = 0; x < w; ++x) {
- CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
-
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
InterpFilterParams *filter_params_x,
@@ -298,7 +114,6 @@
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
-#if CONFIG_LOWPRECISION_BLEND
int32_t sum = 1 << offset_bits;
for (int k = 0; k < filter_params_y->taps; ++k) {
sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
@@ -307,16 +122,6 @@
int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
((1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1)));
-#else // CONFIG_LOWPRECISION_BLEND
- CONV_BUF_TYPE sum = 1 << offset_bits;
- for (int k = 0; k < filter_params_y->taps; ++k) {
- sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
- }
- assert(0 <= sum && sum < (1 << (offset_bits + 2)));
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
- ((1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1)));
-#endif // CONFIG_LOWPRECISION_BLEND
dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
}
}
@@ -342,11 +147,7 @@
*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
-#if CONFIG_LOWPRECISION_BLEND
int32_t res = 0;
-#else
- CONV_BUF_TYPE res = 0;
-#endif
for (int k = 0; k < filter_params_y->taps; ++k) {
res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
}
@@ -377,11 +178,7 @@
*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
-#if CONFIG_LOWPRECISION_BLEND
int32_t res = 0;
-#else
- CONV_BUF_TYPE res = 0;
-#endif
for (int k = 0; k < filter_params_x->taps; ++k) {
res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
}
@@ -424,13 +221,8 @@
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int bd = 8;
-#if CONFIG_LOWPRECISION_BLEND
const int round_bits =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-#else
- (void)dst8;
- (void)dst8_stride;
-#endif
// horizontal filter
const uint8_t *src_horiz = src - fo_vert * src_stride;
@@ -455,7 +247,6 @@
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
-#if CONFIG_LOWPRECISION_BLEND
int32_t sum = 1 << offset_bits;
for (int k = 0; k < filter_params_y->taps; ++k) {
sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
@@ -478,33 +269,6 @@
} else {
dst[y * dst_stride + x] = res;
}
-#else // CONFIG_LOWPRECISION_BLEND
- CONV_BUF_TYPE sum = 1 << offset_bits;
- for (int k = 0; k < filter_params_y->taps; ++k) {
- sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
- }
- assert(0 <= sum && sum < (1 << (offset_bits + 2)));
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
- ((1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1)));
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
- dst[y * dst_stride + x] = tmp >> DIST_PRECISION_BITS;
- } else {
- dst[y * dst_stride + x] = res;
- }
- } else {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
-#endif // CONFIG_LOWPRECISION_BLEND
}
}
}
@@ -519,27 +283,20 @@
int dst_stride = conv_params->dst_stride;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int bits = FILTER_BITS - conv_params->round_0;
-#if CONFIG_LOWPRECISION_BLEND
const int bd = 8;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1));
const int round_bits =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-#endif
(void)filter_params_x;
(void)subpel_x_q4;
-#if !CONFIG_LOWPRECISION_BLEND
- (void)dst8;
- (void)dst8_stride;
-#endif
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
-#if CONFIG_LOWPRECISION_BLEND
int32_t res = 0;
for (int k = 0; k < filter_params_y->taps; ++k) {
res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
@@ -562,31 +319,6 @@
} else {
dst[y * dst_stride + x] = res;
}
-#else // CONFIG_LOWPRECISION_BLEND
- CONV_BUF_TYPE res = 0;
- for (int k = 0; k < filter_params_y->taps; ++k) {
- res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
- }
- res *= (1 << bits);
- res = ROUND_POWER_OF_TWO(res, conv_params->round_1);
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
- dst[y * dst_stride + x] = tmp >> DIST_PRECISION_BITS;
- } else {
- dst[y * dst_stride + x] = res;
- }
- } else {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
-#endif // CONFIG_LOWPRECISION_BLEND
}
}
}
@@ -601,27 +333,20 @@
int dst_stride = conv_params->dst_stride;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int bits = FILTER_BITS - conv_params->round_1;
-#if CONFIG_LOWPRECISION_BLEND
const int bd = 8;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1));
const int round_bits =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-#endif
(void)filter_params_y;
(void)subpel_y_q4;
-#if !CONFIG_LOWPRECISION_BLEND
- (void)dst8;
- (void)dst8_stride;
-#endif
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
-#if CONFIG_LOWPRECISION_BLEND
int32_t res = 0;
for (int k = 0; k < filter_params_x->taps; ++k) {
res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
@@ -644,30 +369,6 @@
} else {
dst[y * dst_stride + x] = res;
}
-#else // CONFIG_LOWPRECISION_BLEND
- CONV_BUF_TYPE res = 0;
- for (int k = 0; k < filter_params_x->taps; ++k) {
- res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
- }
- res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
- dst[y * dst_stride + x] = tmp >> DIST_PRECISION_BITS;
- } else {
- dst[y * dst_stride + x] = res;
- }
- } else {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
-#endif // CONFIG_LOWPRECISION_BLEND
}
}
}
@@ -682,25 +383,18 @@
int dst_stride = conv_params->dst_stride;
const int bits =
FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
-#if CONFIG_LOWPRECISION_BLEND
const int bd = 8;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1));
-#endif
(void)filter_params_x;
(void)filter_params_y;
(void)subpel_x_q4;
(void)subpel_y_q4;
-#if !CONFIG_LOWPRECISION_BLEND
- (void)dst8;
- (void)dst8_stride;
-#endif
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
-#if CONFIG_LOWPRECISION_BLEND
res += round_offset;
if (conv_params->do_average) {
@@ -717,57 +411,25 @@
} else {
dst[y * dst_stride + x] = res;
}
-#else // CONFIG_LOWPRECISION_BLEND
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
- dst[y * dst_stride + x] = tmp >> DIST_PRECISION_BITS;
- } else {
- dst[y * dst_stride + x] = res;
- }
- } else {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
-#endif // CONFIG_LOWPRECISION_BLEND
}
}
}
-#if CONFIG_LOWPRECISION_BLEND
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
int dst8_stride, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_qn, const int x_step_qn,
const int subpel_y_qn, const int y_step_qn,
- ConvolveParams *conv_params)
-#else
-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_qn, const int x_step_qn,
- const int subpel_y_qn, const int y_step_qn,
- ConvolveParams *conv_params)
-#endif
-{
+ ConvolveParams *conv_params) {
int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
filter_params_y->taps;
-#if CONFIG_LOWPRECISION_BLEND
CONV_BUF_TYPE *dst16 = conv_params->dst;
const int dst16_stride = conv_params->dst_stride;
const int bits =
FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
assert(bits >= 0);
-#endif
int im_stride = w;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -805,7 +467,6 @@
assert(y_filter_idx < SUBPEL_SHIFTS);
const int16_t *y_filter =
av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
-#if CONFIG_LOWPRECISION_BLEND
int32_t sum = 1 << offset_bits;
for (int k = 0; k < filter_params_y->taps; ++k) {
sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
@@ -835,33 +496,6 @@
(1 << (offset_bits - conv_params->round_1 - 1)));
dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
}
-#else // CONFIG_LOWPRECISION_BLEND
- CONV_BUF_TYPE sum = 1 << offset_bits;
- for (int k = 0; k < filter_params_y->taps; ++k) {
- sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
- }
- assert(0 <= sum && sum < (1 << (offset_bits + 2)));
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
- ((1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1)));
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
- dst[y * dst_stride + x] = tmp >> DIST_PRECISION_BITS;
- } else {
- dst[y * dst_stride + x] = res;
- }
- } else {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
-#endif // CONFIG_LOWPRECISION_BLEND
}
src_vert++;
}
@@ -873,32 +507,12 @@
InterpFilterParams *filter_params_y, const int subpel_x_qn,
const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
ConvolveParams *conv_params) {
-#if CONFIG_LOWPRECISION_BLEND
if (conv_params->is_compound) {
assert(conv_params->dst != NULL);
}
av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
y_step_qn, conv_params);
-#else // CONFIG_LOWPRECISION_BLEND
- if (conv_params->is_compound) {
- assert(conv_params->dst != NULL);
- av1_convolve_2d_scale(src, src_stride, conv_params->dst,
- conv_params->dst_stride, w, h, filter_params_x,
- filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
- y_step_qn, conv_params);
- } else {
- CONV_BUF_TYPE tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE];
- int tmp_dst_stride = MAX_SB_SIZE;
- av1_convolve_2d_scale(src, src_stride, tmp_dst, tmp_dst_stride, w, h,
- filter_params_x, filter_params_y, subpel_x_qn,
- x_step_qn, subpel_y_qn, y_step_qn, conv_params);
- const int rbits =
- 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
- av1_convolve_rounding(tmp_dst, tmp_dst_stride, dst, dst_stride, w, h,
- rbits);
- }
-#endif // CONFIG_LOWPRECISION_BLEND
}
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
@@ -926,191 +540,6 @@
&filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
}
-#if !CONFIG_LOWPRECISION_BLEND
-void av1_highbd_convolve_rounding_c(const int32_t *src, int src_stride,
- uint8_t *dst8, int dst_stride, int w, int h,
- int bits, int bd) {
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- for (int r = 0; r < h; ++r) {
- for (int c = 0; c < w; ++c) {
- dst[r * dst_stride + c] = clip_pixel_highbd(
- ROUND_POWER_OF_TWO(src[r * src_stride + c], bits), bd);
- }
- }
-}
-
-void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride,
- uint16_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = w;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- (void)dst0;
- (void)dst_stride0;
-
- // horizontal filter
- const uint16_t *src_horiz = src - fo_vert * src_stride;
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- for (int y = 0; y < im_h; ++y) {
- for (int x = 0; x < w; ++x) {
- int32_t sum = (1 << (bd + FILTER_BITS - 1));
- for (int k = 0; k < filter_params_x->taps; ++k) {
- sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
- }
- assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
- (void)bd;
- im_block[y * im_stride + x] =
- (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
- }
- }
-
- // vertical filter
- int16_t *src_vert = im_block + fo_vert * im_stride;
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- for (int y = 0; y < h; ++y) {
- for (int x = 0; x < w; ++x) {
- CONV_BUF_TYPE sum = 1 << offset_bits;
- for (int k = 0; k < filter_params_y->taps; ++k) {
- sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
- }
- assert(0 <= sum && sum < (1 << (offset_bits + 2)));
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
- ((1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1)));
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
- }
-}
-
-void av1_highbd_convolve_2d_copy_c(const uint16_t *src, int src_stride,
- uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- const int bits =
- FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
-
- (void)filter_params_x;
- (void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
- (void)dst0;
- (void)dst_stride0;
- (void)bd;
-
- for (int y = 0; y < h; ++y) {
- for (int x = 0; x < w; ++x) {
- CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
- }
-}
-
-void av1_highbd_convolve_x_c(const uint16_t *src, int src_stride,
- uint16_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const int bits = FILTER_BITS - conv_params->round_1;
- (void)filter_params_y;
- (void)subpel_y_q4;
- (void)dst0;
- (void)dst_stride0;
- (void)bd;
-
- assert(bits >= 0);
-
- // horizontal filter
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- for (int y = 0; y < h; ++y) {
- for (int x = 0; x < w; ++x) {
- CONV_BUF_TYPE res = 0;
- for (int k = 0; k < filter_params_x->taps; ++k) {
- res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
- }
- res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
- }
-}
-
-void av1_highbd_convolve_y_c(const uint16_t *src, int src_stride,
- uint16_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int bits = FILTER_BITS - conv_params->round_0;
- (void)filter_params_x;
- (void)subpel_x_q4;
- (void)dst0;
- (void)dst_stride0;
- (void)bd;
-
- assert(bits >= 0);
-
- // vertical filter
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- for (int y = 0; y < h; ++y) {
- for (int x = 0; x < w; ++x) {
- CONV_BUF_TYPE res = 0;
- for (int k = 0; k < filter_params_y->taps; ++k) {
- res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
- }
- res *= (1 << bits);
- res = ROUND_POWER_OF_TWO(res, conv_params->round_1);
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
-
void av1_highbd_convolve_2d_copy_sr_c(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
int h, InterpFilterParams *filter_params_x,
@@ -1150,11 +579,7 @@
*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
-#if CONFIG_LOWPRECISION_BLEND
int32_t res = 0;
-#else
- CONV_BUF_TYPE res = 0;
-#endif
for (int k = 0; k < filter_params_x->taps; ++k) {
res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
}
@@ -1184,11 +609,7 @@
*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
-#if CONFIG_LOWPRECISION_BLEND
int32_t res = 0;
-#else
- CONV_BUF_TYPE res = 0;
-#endif
for (int k = 0; k < filter_params_y->taps; ++k) {
res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
}
@@ -1236,7 +657,6 @@
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
-#if CONFIG_LOWPRECISION_BLEND
int32_t sum = 1 << offset_bits;
for (int k = 0; k < filter_params_y->taps; ++k) {
sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
@@ -1245,38 +665,18 @@
int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
((1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1)));
-#else // CONFIG_LOWPRECISION_BLEND
- CONV_BUF_TYPE sum = 1 << offset_bits;
- for (int k = 0; k < filter_params_y->taps; ++k) {
- sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
- }
- assert(0 <= sum && sum < (1 << (offset_bits + 2)));
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
- ((1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1)));
-#endif // CONFIG_LOWPRECISION_BLEND
dst[y * dst_stride + x] =
clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
}
}
}
-#if CONFIG_LOWPRECISION_BLEND
void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
uint16_t *dst16, int dst16_stride, int w,
int h, InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd)
-#else // CONFIG_LOWPRECISION_BLEND
-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
- uint16_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd)
-#endif // CONFIG_LOWPRECISION_BLEND
-{
+ ConvolveParams *conv_params, int bd) {
int x, y, k;
int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -1285,14 +685,9 @@
int im_stride = w;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
-#if CONFIG_LOWPRECISION_BLEND
const int round_bits =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
assert(round_bits >= 0);
-#else
- (void)dst0;
- (void)dst_stride0;
-#endif
// horizontal filter
const uint16_t *src_horiz = src - fo_vert * src_stride;
@@ -1318,7 +713,6 @@
*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
for (y = 0; y < h; ++y) {
for (x = 0; x < w; ++x) {
-#if CONFIG_LOWPRECISION_BLEND
int32_t sum = 1 << offset_bits;
for (k = 0; k < filter_params_y->taps; ++k) {
sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
@@ -1341,33 +735,6 @@
} else {
dst[y * dst_stride + x] = res;
}
-#else
- CONV_BUF_TYPE sum = 1 << offset_bits;
- for (k = 0; k < filter_params_y->taps; ++k) {
- sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
- }
- assert(0 <= sum && sum < (1 << (offset_bits + 2)));
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
- ((1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1)));
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
- dst[y * dst_stride + x] = tmp >> DIST_PRECISION_BITS;
- } else {
- dst[y * dst_stride + x] = res;
- }
- } else {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
-#endif
}
}
}
@@ -1382,28 +749,20 @@
int dst_stride = conv_params->dst_stride;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int bits = FILTER_BITS - conv_params->round_1;
-#if CONFIG_LOWPRECISION_BLEND
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1));
const int round_bits =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
assert(round_bits >= 0);
-#endif
(void)filter_params_y;
(void)subpel_y_q4;
-#if !CONFIG_LOWPRECISION_BLEND
- (void)dst16;
- (void)dst16_stride;
- (void)bd;
-#endif
assert(bits >= 0);
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
-#if CONFIG_LOWPRECISION_BLEND
int32_t res = 0;
for (int k = 0; k < filter_params_x->taps; ++k) {
res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
@@ -1426,30 +785,6 @@
} else {
dst[y * dst_stride + x] = res;
}
-#else // CONFIG_LOWPRECISION_BLEND
- CONV_BUF_TYPE res = 0;
- for (int k = 0; k < filter_params_x->taps; ++k) {
- res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
- }
- res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
- dst[y * dst_stride + x] = tmp >> DIST_PRECISION_BITS;
- } else {
- dst[y * dst_stride + x] = res;
- }
- } else {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
-#endif // CONFIG_LOWPRECISION_BLEND
}
}
}
@@ -1464,28 +799,20 @@
int dst_stride = conv_params->dst_stride;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int bits = FILTER_BITS - conv_params->round_0;
-#if CONFIG_LOWPRECISION_BLEND
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1));
const int round_bits =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
assert(round_bits >= 0);
-#endif
(void)filter_params_x;
(void)subpel_x_q4;
-#if !CONFIG_LOWPRECISION_BLEND
- (void)dst16;
- (void)dst16_stride;
- (void)bd;
-#endif
assert(bits >= 0);
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
-#if CONFIG_LOWPRECISION_BLEND
int32_t res = 0;
for (int k = 0; k < filter_params_y->taps; ++k) {
res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
@@ -1508,31 +835,6 @@
} else {
dst[y * dst_stride + x] = res;
}
-#else // CONFIG_LOWPRECISION_BLEND
- CONV_BUF_TYPE res = 0;
- for (int k = 0; k < filter_params_y->taps; ++k) {
- res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
- }
- res *= (1 << bits);
- res = ROUND_POWER_OF_TWO(res, conv_params->round_1);
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
- dst[y * dst_stride + x] = tmp >> DIST_PRECISION_BITS;
- } else {
- dst[y * dst_stride + x] = res;
- }
- } else {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
-#endif // CONFIG_LOWPRECISION_BLEND
}
}
}
@@ -1546,26 +848,18 @@
int dst_stride = conv_params->dst_stride;
const int bits =
FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
-#if CONFIG_LOWPRECISION_BLEND
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1));
assert(bits >= 0);
-#endif
(void)filter_params_x;
(void)filter_params_y;
(void)subpel_x_q4;
(void)subpel_y_q4;
-#if !CONFIG_LOWPRECISION_BLEND
- (void)dst16;
- (void)dst16_stride;
- (void)bd;
-#endif
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
-#if CONFIG_LOWPRECISION_BLEND
res += round_offset;
if (conv_params->do_average) {
int32_t tmp = dst[y * dst_stride + x];
@@ -1582,60 +876,28 @@
} else {
dst[y * dst_stride + x] = res;
}
-#else // CONFIG_LOWPRECISION_BLEND
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
- dst[y * dst_stride + x] = tmp >> DIST_PRECISION_BITS;
- } else {
- dst[y * dst_stride + x] = res;
- }
- } else {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
-#endif // CONFIG_LOWPRECISION_BLEND
}
}
}
-#if CONFIG_LOWPRECISION_BLEND
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_qn, const int x_step_qn,
const int subpel_y_qn, const int y_step_qn,
- ConvolveParams *conv_params, int bd)
-#else
-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_qn, const int x_step_qn,
- const int subpel_y_qn, const int y_step_qn,
- ConvolveParams *conv_params, int bd)
-#endif // CONFIG_LOWPRECISION_BLEND
-{
+ ConvolveParams *conv_params, int bd) {
int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
filter_params_y->taps;
int im_stride = w;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
-#if CONFIG_LOWPRECISION_BLEND
CONV_BUF_TYPE *dst16 = conv_params->dst;
const int dst16_stride = conv_params->dst_stride;
const int bits =
FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
assert(bits >= 0);
-#endif
// horizontal filter
const uint16_t *src_horiz = src - fo_vert * src_stride;
for (int y = 0; y < im_h; ++y) {
@@ -1668,7 +930,6 @@
assert(y_filter_idx < SUBPEL_SHIFTS);
const int16_t *y_filter =
av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
-#if CONFIG_LOWPRECISION_BLEND
int32_t sum = 1 << offset_bits;
for (int k = 0; k < filter_params_y->taps; ++k) {
sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
@@ -1700,33 +961,6 @@
dst[y * dst_stride + x] =
clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
}
-#else // CONFIG_LOWPRECISION_BLEND
- CONV_BUF_TYPE sum = 1 << offset_bits;
- for (int k = 0; k < filter_params_y->taps; ++k) {
- sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
- }
- assert(0 <= sum && sum < (1 << (offset_bits + 2)));
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
- ((1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1)));
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
- dst[y * dst_stride + x] = tmp >> DIST_PRECISION_BITS;
- } else {
- dst[y * dst_stride + x] = res;
- }
- } else {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
-#endif // CONFIG_LOWPRECISION_BLEND
}
src_vert++;
}
@@ -1749,7 +983,6 @@
&filter_params_y, w, h);
if (scaled) {
-#if CONFIG_LOWPRECISION_BLEND
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
if (conv_params->is_compound) {
assert(conv_params->dst != NULL);
@@ -1758,28 +991,6 @@
&filter_params_x, &filter_params_y,
subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4,
conv_params, bd);
-#else // CONFIG_LOWPRECISION_BLEND
- if (conv_params->is_compound) {
- av1_highbd_convolve_2d_scale(
- src, src_stride, conv_params->dst, conv_params->dst_stride, w, h,
- &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4,
- subpel_y_q4, y_step_q4, conv_params, bd);
- } else {
- CONV_BUF_TYPE tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE];
- int tmp_dst_stride = MAX_SB_SIZE;
- av1_highbd_convolve_2d_scale(src, src_stride, tmp_dst, tmp_dst_stride, w,
- h, &filter_params_x, &filter_params_y,
- subpel_x_q4, x_step_q4, subpel_y_q4,
- y_step_q4, conv_params, bd);
-
- // 0-bit rounding just to convert from int32 to uint16
- const int rbits =
- 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
- assert(rbits >= 0);
- av1_highbd_convolve_rounding(tmp_dst, tmp_dst_stride, dst8, dst_stride, w,
- h, rbits, bd);
- }
-#endif // CONFIG_LOWPRECISION_BLEND
} else {
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
diff --git a/av1/common/convolve.h b/av1/common/convolve.h
index b388af3..1b2c2d0 100644
--- a/av1/common/convolve.h
+++ b/av1/common/convolve.h
@@ -17,11 +17,7 @@
extern "C" {
#endif
-#if CONFIG_LOWPRECISION_BLEND
typedef uint16_t CONV_BUF_TYPE;
-#else
-typedef int32_t CONV_BUF_TYPE;
-#endif
typedef struct ConvolveParams {
int ref;
int do_average;
@@ -36,15 +32,9 @@
int bck_offset;
} ConvolveParams;
-#if CONFIG_LOWPRECISION_BLEND
#define ROUND0_BITS 3
#define COMPOUND_ROUND1_BITS 7
#define WIENER_ROUND0_BITS 3
-#else
-#define ROUND0_BITS 5
-#define COMPOUND_ROUND1_BITS 0
-#define WIENER_ROUND0_BITS 5
-#endif // CONFIG_LOWPRECISION_BLEND
#define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - r0))
@@ -92,7 +82,6 @@
assert(IMPLIES(do_average, is_compound));
conv_params.is_compound = is_compound;
conv_params.round_0 = ROUND0_BITS;
-#if CONFIG_LOWPRECISION_BLEND
conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS
: 2 * FILTER_BITS - conv_params.round_0;
const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
@@ -101,10 +90,6 @@
conv_params.round_0 += intbufrange - 16;
if (!is_compound) conv_params.round_1 -= intbufrange - 16;
}
-#else
- (void)bd;
- conv_params.round_1 = 0;
-#endif // CONFIG_LOWPRECISION_BLEND
// TODO(yunqing): The following dst should only be valid while
// is_compound = 1;
conv_params.dst = dst;
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index b5c11e4..3caab50 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -32,14 +32,6 @@
#define USE_PRECOMPUTED_WEDGE_MASK 1
#define USE_PRECOMPUTED_WEDGE_SIGN 1
-#if !CONFIG_LOWPRECISION_BLEND
-static INLINE int get_compound_post_rounding_bits(
- const ConvolveParams *conv_params) {
- assert(conv_params->is_compound);
- return 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-}
-#endif
-
// This function will determine whether or not to create a warped
// prediction.
static INLINE int allow_warp(const MODE_INFO *const mi,
@@ -362,20 +354,11 @@
#elif COMPOUND_SEGMENT_TYPE == 1
#define DIFF_FACTOR 16
-#if CONFIG_LOWPRECISION_BLEND
static void diffwtd_mask_d32(uint8_t *mask, int which_inverse, int mask_base,
const CONV_BUF_TYPE *src0, int src0_stride,
const CONV_BUF_TYPE *src1, int src1_stride,
BLOCK_SIZE sb_type, int h, int w,
- ConvolveParams *conv_params, int bd)
-#else
-static void diffwtd_mask_d32(uint8_t *mask, int which_inverse, int mask_base,
- const int32_t *src0, int src0_stride,
- const int32_t *src1, int src1_stride,
- BLOCK_SIZE sb_type, int h, int w,
- ConvolveParams *conv_params, int bd)
-#endif
-{
+ ConvolveParams *conv_params, int bd) {
int round =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
int i, j, m, diff;
@@ -391,19 +374,10 @@
}
}
-#if CONFIG_LOWPRECISION_BLEND
static void build_compound_seg_mask_d16(
uint8_t *mask, SEG_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride,
- BLOCK_SIZE sb_type, int h, int w, ConvolveParams *conv_params, int bd)
-#else
-static void build_compound_seg_mask_d32(uint8_t *mask, SEG_MASK_TYPE mask_type,
- const int32_t *src0, int src0_stride,
- const int32_t *src1, int src1_stride,
- BLOCK_SIZE sb_type, int h, int w,
- ConvolveParams *conv_params, int bd)
-#endif
-{
+ BLOCK_SIZE sb_type, int h, int w, ConvolveParams *conv_params, int bd) {
switch (mask_type) {
case DIFFWTD_38:
diffwtd_mask_d32(mask, 0, 38, src0, src0_stride, src1, src1_stride,
@@ -627,7 +601,6 @@
init_wedge_masks();
}
-#if CONFIG_LOWPRECISION_BLEND
static void build_masked_compound_no_round(
uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
const CONV_BUF_TYPE *src1, int src1_stride,
@@ -647,21 +620,6 @@
src1_stride, mask, block_size_wide[sb_type], h,
w, subh, subw, conv_params);
}
-#else // CONFIG_LOWPRECISION_BLEND
-static void build_masked_compound_no_round(
- CONV_BUF_TYPE *dst, int dst_stride, const CONV_BUF_TYPE *src0,
- int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride,
- const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
- int w) {
- // Derive subsampling from h and w passed in. May be refactored to
- // pass in subsampling factors directly.
- const int subh = (2 << mi_size_high_log2[sb_type]) == h;
- const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
- const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
- aom_blend_a64_d32_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
- mask, block_size_wide[sb_type], h, w, subh, subw);
-}
-#endif // CONFIG_LOWPRECISION_BLEND
static void build_masked_compound(
uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
@@ -713,14 +671,7 @@
// a temporary buffer, then will blend that temporary buffer with that from
// the other reference.
//
-#if CONFIG_LOWPRECISION_BLEND
#define INTER_PRED_BYTES_PER_PIXEL 2
-#else
-// The predictions are at 32-bits, so we'll need 32 bits per
-// pixel. Otherwise, we'll need up to 16 bits per pixel if
-// CONFIG_HIGHBITDEPTH or just 8 otherwise.
-#define INTER_PRED_BYTES_PER_PIXEL 4
-#endif
DECLARE_ALIGNED(32, uint8_t,
tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]);
@@ -733,13 +684,8 @@
const int tmp_buf_stride = MAX_SB_SIZE;
CONV_BUF_TYPE *org_dst = conv_params->dst;
int org_dst_stride = conv_params->dst_stride;
-#if CONFIG_LOWPRECISION_BLEND
CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf;
conv_params->dst = tmp_buf16;
-#else
- CONV_BUF_TYPE *tmp_buf32 = (CONV_BUF_TYPE *)tmp_buf;
- conv_params->dst = tmp_buf32;
-#endif
conv_params->dst_stride = tmp_buf_stride;
assert(conv_params->do_average == 0);
@@ -750,25 +696,13 @@
xd, can_use_previous);
if (!plane && comp_data.interinter_compound_type == COMPOUND_SEG) {
-#if CONFIG_LOWPRECISION_BLEND
build_compound_seg_mask_d16(
comp_data.seg_mask, comp_data.mask_type, org_dst, org_dst_stride,
tmp_buf16, tmp_buf_stride, mi->mbmi.sb_type, h, w, conv_params, xd->bd);
-#else
- build_compound_seg_mask_d32(
- comp_data.seg_mask, comp_data.mask_type, org_dst, org_dst_stride,
- tmp_buf32, tmp_buf_stride, mi->mbmi.sb_type, h, w, conv_params, xd->bd);
-#endif
}
-#if CONFIG_LOWPRECISION_BLEND
build_masked_compound_no_round(dst, dst_stride, org_dst, org_dst_stride,
tmp_buf16, tmp_buf_stride, &comp_data,
mi->mbmi.sb_type, h, w, conv_params, xd);
-#else
- build_masked_compound_no_round(org_dst, org_dst_stride, org_dst,
- org_dst_stride, tmp_buf32, tmp_buf_stride,
- &comp_data, mi->mbmi.sb_type, h, w);
-#endif
}
// TODO(sarahparker) av1_highbd_build_inter_predictor and
@@ -956,11 +890,7 @@
for (idx = 0; idx < b8_w; idx += b4_w) {
MB_MODE_INFO *this_mbmi = &xd->mi[row * xd->mi_stride + col]->mbmi;
is_compound = has_second_ref(this_mbmi);
-#if CONFIG_LOWPRECISION_BLEND
DECLARE_ALIGNED(32, CONV_BUF_TYPE, tmp_dst[8 * 8]);
-#else
- DECLARE_ALIGNED(32, int32_t, tmp_dst[8 * 8]);
-#endif
int tmp_dst_stride = 8;
assert(w < 8 || h < 8);
ConvolveParams conv_params = get_conv_params_no_round(
@@ -1064,11 +994,7 @@
uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
uint8_t *pre[2];
SubpelParams subpel_params[2];
-#if CONFIG_LOWPRECISION_BLEND
DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
-#else
- DECLARE_ALIGNED(32, int32_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
-#endif
for (ref = 0; ref < 1 + is_compound; ++ref) {
const struct scale_factors *const sf =
is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
@@ -1164,20 +1090,6 @@
plane, ref, mi, build_for_obmc, subpel_params[ref].xs,
subpel_params[ref].ys, xd, cm->allow_warped_motion);
}
-
-#if !CONFIG_LOWPRECISION_BLEND
- // TODO(angiebird): This part needs optimization
- if (conv_params.is_compound) {
- assert(conv_params.dst != NULL);
- int round_bits = get_compound_post_rounding_bits(&conv_params);
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- av1_highbd_convolve_rounding(tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride,
- w, h, round_bits, xd->bd);
- else
- av1_convolve_rounding(tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride, w, h,
- round_bits);
- }
-#endif
}
}
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 95ee5d5..0e0068f 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -431,11 +431,9 @@
const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
const int offset_bits_horiz = bd + FILTER_BITS - 1;
const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
-#if CONFIG_LOWPRECISION_BLEND
const int round_bits =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-#endif
(void)max_bits_horiz;
assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
@@ -506,7 +504,6 @@
&conv_params
->dst[(i - p_row + k + 4) * conv_params->dst_stride +
(j - p_col + l + 4)];
-#if CONFIG_LOWPRECISION_BLEND
sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
if (conv_params->do_average) {
uint16_t *dst16 =
@@ -527,30 +524,6 @@
} else {
*p = sum;
}
-#else // CONFIG_LOWPRECISION_BLEND
- sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert) -
- (1 << (offset_bits_horiz + FILTER_BITS - reduce_bits_horiz -
- reduce_bits_vert)) -
- (1 << (offset_bits_vert - reduce_bits_vert));
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average) {
- int32_t tmp32 = *p;
- tmp32 = tmp32 * conv_params->fwd_offset +
- sum * conv_params->bck_offset;
- *p = tmp32 >> DIST_PRECISION_BITS;
- } else {
- *p = sum;
- }
- } else {
- if (conv_params->do_average) {
- int32_t tmp32 = *p;
- tmp32 += sum;
- *p = tmp32 >> 1;
- } else {
- *p = sum;
- }
- }
-#endif // CONFIG_LOWPRECISION_BLEND
} else {
uint16_t *p =
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
@@ -744,11 +717,9 @@
const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
const int offset_bits_horiz = bd + FILTER_BITS - 1;
const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
-#if CONFIG_LOWPRECISION_BLEND
const int round_bits =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-#endif
(void)max_bits_horiz;
assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
@@ -825,7 +796,6 @@
&conv_params
->dst[(i - p_row + k + 4) * conv_params->dst_stride +
(j - p_col + l + 4)];
-#if CONFIG_LOWPRECISION_BLEND
sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
if (conv_params->do_average) {
uint8_t *dst8 =
@@ -845,30 +815,6 @@
} else {
*p = sum;
}
-#else // CONFIG_LOWPRECISION_BLEND
- sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert) -
- (1 << (offset_bits_horiz + FILTER_BITS - reduce_bits_horiz -
- reduce_bits_vert)) -
- (1 << (offset_bits_vert - reduce_bits_vert));
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average) {
- int32_t tmp32 = *p;
- tmp32 = tmp32 * conv_params->fwd_offset +
- sum * conv_params->bck_offset;
- *p = tmp32 >> DIST_PRECISION_BITS;
- } else {
- *p = sum;
- }
- } else {
- if (conv_params->do_average) {
- int32_t tmp32 = *p;
- tmp32 += sum;
- *p = tmp32 >> 1;
- } else {
- *p = sum;
- }
- }
-#endif // CONFIG_LOWPRECISION_BLEND
} else {
uint8_t *p =
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
diff --git a/av1/common/x86/av1_convolve_scale_sse4.c b/av1/common/x86/av1_convolve_scale_sse4.c
index 09acddd..366c664 100644
--- a/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/av1/common/x86/av1_convolve_scale_sse4.c
@@ -42,107 +42,8 @@
return _mm_castpd_si128(_mm_and_pd(ad, bd));
}
-#if !CONFIG_LOWPRECISION_BLEND
-// The horizontal filter for av1_convolve_2d_scale_sse4_1. This is the more
-// general version, supporting 10 and 12 tap filters. For 8-tap filters, use
-// hfilter8.
-static void hfilter(const uint8_t *src, int src_stride, int32_t *dst, int w,
- int h, int subpel_x_qn, int x_step_qn,
- const InterpFilterParams *filter_params, unsigned round) {
- const int bd = 8;
- const int ntaps = filter_params->taps;
- assert(ntaps == 10 || ntaps == 12);
-
- src -= ntaps / 2 - 1;
-
- // Construct a mask with which we'll AND filter coefficients 89ab89ab to zero
- // out the unneeded entries.
- const __m128i hicoeff_mask = make_1012_mask(ntaps);
-
- int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
- const __m128i round_add = _mm_set1_epi32(round_add32);
- const __m128i round_shift = extend_32_to_128(round);
-
- int x_qn = subpel_x_qn;
- for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
- const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
- const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
- assert(filter_idx < SUBPEL_SHIFTS);
- const int16_t *filter =
- av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
-
- // The "lo" coefficients are coefficients 0..7. For a 12-tap filter, the
- // "hi" coefficients are arranged as 89ab89ab. For a 10-tap filter, they
- // are masked out with hicoeff_mask.
- const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
- const __m128i coeffhi = load_and_128i(filter + 8, hicoeff_mask);
- const __m128i zero = _mm_castps_si128(_mm_setzero_ps());
-
- int y;
- for (y = 0; y <= h - 4; y += 4) {
- const uint8_t *const src0 = src_col + y * src_stride;
- const uint8_t *const src1 = src0 + 1 * src_stride;
- const uint8_t *const src2 = src0 + 2 * src_stride;
- const uint8_t *const src3 = src0 + 3 * src_stride;
-
- // Load up source data. This is 8-bit input data, so each load gets 16
- // pixels (we need at most 12)
- const __m128i data08 = _mm_loadu_si128((__m128i *)src0);
- const __m128i data18 = _mm_loadu_si128((__m128i *)src1);
- const __m128i data28 = _mm_loadu_si128((__m128i *)src2);
- const __m128i data38 = _mm_loadu_si128((__m128i *)src3);
-
- // Now zero-extend up to 16-bit precision by interleaving with zeros. For
- // the "high" pixels (8 to 11), interleave first (so that the expansion
- // to 16-bits operates on an entire register).
- const __m128i data0lo = _mm_unpacklo_epi8(data08, zero);
- const __m128i data1lo = _mm_unpacklo_epi8(data18, zero);
- const __m128i data2lo = _mm_unpacklo_epi8(data28, zero);
- const __m128i data3lo = _mm_unpacklo_epi8(data38, zero);
- const __m128i data01hi8 = _mm_unpackhi_epi32(data08, data18);
- const __m128i data23hi8 = _mm_unpackhi_epi32(data28, data38);
- const __m128i data01hi = _mm_unpacklo_epi8(data01hi8, zero);
- const __m128i data23hi = _mm_unpacklo_epi8(data23hi8, zero);
-
- // Multiply by coefficients
- const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
- const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
- const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
- const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
- const __m128i conv01hi = _mm_madd_epi16(data01hi, coeffhi);
- const __m128i conv23hi = _mm_madd_epi16(data23hi, coeffhi);
-
- // Reduce horizontally and add
- const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
- const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
- const __m128i convlo = _mm_hadd_epi32(conv01lo, conv23lo);
- const __m128i convhi = _mm_hadd_epi32(conv01hi, conv23hi);
- const __m128i conv = _mm_add_epi32(convlo, convhi);
-
- // Divide down by (1 << round), rounding to nearest.
- const __m128i shifted =
- _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
-
- // Write transposed to the output
- _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted);
- }
- for (; y < h; ++y) {
- const uint8_t *const src_row = src_col + y * src_stride;
-
- int32_t sum = (1 << (bd + FILTER_BITS - 1));
- for (int k = 0; k < ntaps; ++k) {
- sum += filter[k] * src_row[k];
- }
-
- dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
- }
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
-
// A specialised version of hfilter, the horizontal filter for
// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
-#if CONFIG_LOWPRECISION_BLEND
static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w,
int h, int subpel_x_qn, int x_step_qn,
const InterpFilterParams *filter_params, unsigned round) {
@@ -219,83 +120,6 @@
}
}
}
-#else
-static void hfilter8(const uint8_t *src, int src_stride, int32_t *dst, int w,
- int h, int subpel_x_qn, int x_step_qn,
- const InterpFilterParams *filter_params, unsigned round) {
- const int bd = 8;
- const int ntaps = 8;
-
- src -= ntaps / 2 - 1;
-
- int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
- const __m128i round_add = _mm_set1_epi32(round_add32);
- const __m128i round_shift = extend_32_to_128(round);
-
- int x_qn = subpel_x_qn;
- for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
- const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
- const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
- assert(filter_idx < SUBPEL_SHIFTS);
- const int16_t *filter =
- av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
-
- // Load the filter coefficients
- const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
- const __m128i zero = _mm_castps_si128(_mm_setzero_ps());
-
- int y;
- for (y = 0; y <= h - 4; y += 4) {
- const uint8_t *const src0 = src_col + y * src_stride;
- const uint8_t *const src1 = src0 + 1 * src_stride;
- const uint8_t *const src2 = src0 + 2 * src_stride;
- const uint8_t *const src3 = src0 + 3 * src_stride;
-
- // Load up source data. This is 8-bit input data; each load is just
- // loading the lower half of the register and gets 8 pixels
- const __m128i data08 = _mm_loadl_epi64((__m128i *)src0);
- const __m128i data18 = _mm_loadl_epi64((__m128i *)src1);
- const __m128i data28 = _mm_loadl_epi64((__m128i *)src2);
- const __m128i data38 = _mm_loadl_epi64((__m128i *)src3);
-
- // Now zero-extend up to 16-bit precision by interleaving with
- // zeros. Drop the upper half of each register (which just had zeros)
- const __m128i data0lo = _mm_unpacklo_epi8(data08, zero);
- const __m128i data1lo = _mm_unpacklo_epi8(data18, zero);
- const __m128i data2lo = _mm_unpacklo_epi8(data28, zero);
- const __m128i data3lo = _mm_unpacklo_epi8(data38, zero);
-
- // Multiply by coefficients
- const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
- const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
- const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
- const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
-
- // Reduce horizontally and add
- const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
- const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
- const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
-
- // Divide down by (1 << round), rounding to nearest.
- const __m128i shifted =
- _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
-
- // Write transposed to the output
- _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted);
- }
- for (; y < h; ++y) {
- const uint8_t *const src_row = src_col + y * src_stride;
-
- int32_t sum = (1 << (bd + FILTER_BITS - 1));
- for (int k = 0; k < ntaps; ++k) {
- sum += filter[k] * src_row[k];
- }
-
- dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
- }
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
// Do a 12-tap convolution with the given coefficients, loading data from src.
static __m128i convolve_32(const int32_t *src, __m128i coeff03, __m128i coeff47,
@@ -319,134 +143,13 @@
return _mm_add_epi32(conv03, conv47);
}
-#if CONFIG_LOWPRECISION_BLEND
static __m128i convolve_16_8(const int16_t *src, __m128i coeff) {
__m128i data = _mm_loadu_si128((__m128i *)src);
return _mm_madd_epi16(data, coeff);
}
-#endif
-
-#if !CONFIG_LOWPRECISION_BLEND
-// The vertical filter for av1_convolve_2d_scale_sse4_1. This is the more
-// general version, supporting 10 and 12 tap filters. For 8-tap filters, use
-// vfilter8.
-static void vfilter(const int32_t *src, int src_stride, int32_t *dst,
- int dst_stride, int w, int h, int subpel_y_qn,
- int y_step_qn, const InterpFilterParams *filter_params,
- const ConvolveParams *conv_params, int bd) {
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int ntaps = filter_params->taps;
-
- // Construct a mask with which we'll AND filter coefficients 89ab to zero out
- // the unneeded entries. The upper bits of this mask are unused.
- const __m128i hicoeff_mask = make_1012_mask(ntaps);
-
- int32_t round_add32 = (1 << conv_params->round_1) / 2 + (1 << offset_bits);
- const __m128i round_add = _mm_set1_epi32(round_add32);
- const __m128i round_shift = extend_32_to_128(conv_params->round_1);
-
- const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1)));
- const __m128i sub = _mm_set1_epi32(sub32);
-
- const __m128i fwd_offset = _mm_set1_epi32(conv_params->fwd_offset);
- const __m128i bck_offset = _mm_set1_epi32(conv_params->bck_offset);
-
- int y_qn = subpel_y_qn;
- for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
- const int32_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
- const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
- assert(filter_idx < SUBPEL_SHIFTS);
- const int16_t *filter =
- av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
-
- // Load up coefficients for the filter and sign-extend to 32-bit precision
- // (to do so, calculate sign bits and then interleave)
- const __m128i zero = _mm_castps_si128(_mm_setzero_ps());
- const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
- const __m128i coeffhi16 = load_and_128i(filter + 8, hicoeff_mask);
- const __m128i csign0716 = _mm_cmplt_epi16(coeff0716, zero);
- const __m128i csignhi16 = _mm_cmplt_epi16(coeffhi16, zero);
- const __m128i coeff03 = _mm_unpacklo_epi16(coeff0716, csign0716);
- const __m128i coeff47 = _mm_unpackhi_epi16(coeff0716, csign0716);
- const __m128i coeff8d = _mm_unpacklo_epi16(coeffhi16, csignhi16);
-
- int x;
- for (x = 0; x <= w - 4; x += 4) {
- const int32_t *const src0 = src_y + x * src_stride;
- const int32_t *const src1 = src0 + 1 * src_stride;
- const int32_t *const src2 = src0 + 2 * src_stride;
- const int32_t *const src3 = src0 + 3 * src_stride;
-
- // Load the source data for the three rows, adding the three registers of
- // convolved products to one as we go (conv0..conv3) to avoid the
- // register pressure getting too high.
- const __m128i conv0 = convolve_32(src0, coeff03, coeff47, coeff8d);
- const __m128i conv1 = convolve_32(src1, coeff03, coeff47, coeff8d);
- const __m128i conv2 = convolve_32(src2, coeff03, coeff47, coeff8d);
- const __m128i conv3 = convolve_32(src3, coeff03, coeff47, coeff8d);
-
- // Now reduce horizontally to get one lane for each result
- const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
- const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
- const __m128i conv = _mm_hadd_epi32(conv01, conv23);
-
- // Divide down by (1 << round_1), rounding to nearest and subtract sub32.
- const __m128i shifted =
- _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
- const __m128i subbed = _mm_sub_epi32(shifted, sub);
-
- int32_t *dst_x = dst + y * dst_stride + x;
- __m128i result;
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average) {
- __m128i tmp = _mm_loadu_si128((__m128i *)dst_x);
- tmp = _mm_add_epi32(_mm_mullo_epi32(tmp, fwd_offset),
- _mm_mullo_epi32(subbed, bck_offset));
- result = _mm_srai_epi32(tmp, DIST_PRECISION_BITS);
- } else {
- result = subbed;
- }
- } else {
- result =
- (conv_params->do_average)
- ? _mm_srai_epi32(
- _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x)),
- 1)
- : subbed;
- }
- _mm_storeu_si128((__m128i *)dst_x, result);
- }
- for (; x < w; ++x) {
- const int32_t *src_x = src_y + x * src_stride;
- CONV_BUF_TYPE sum = 1 << offset_bits;
- for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - sub32;
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
- dst[y * dst_stride + x] = tmp >> DIST_PRECISION_BITS;
- } else {
- dst[y * dst_stride + x] = res;
- }
- } else {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
- }
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
// A specialised version of vfilter, the vertical filter for
// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
-#if CONFIG_LOWPRECISION_BLEND
static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h, int subpel_y_qn,
int y_step_qn, const InterpFilterParams *filter_params,
@@ -572,116 +275,6 @@
}
}
}
-#else
-static void vfilter8(const int32_t *src, int src_stride, int32_t *dst,
- int dst_stride, int w, int h, int subpel_y_qn,
- int y_step_qn, const InterpFilterParams *filter_params,
- const ConvolveParams *conv_params, int bd) {
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int ntaps = 8;
-
- int32_t round_add32 = (1 << conv_params->round_1) / 2 + (1 << offset_bits);
- const __m128i round_add = _mm_set1_epi32(round_add32);
- const __m128i round_shift = extend_32_to_128(conv_params->round_1);
-
- const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1)));
- const __m128i sub = _mm_set1_epi32(sub32);
-
- const __m128i fwd_offset = _mm_set1_epi32(conv_params->fwd_offset);
- const __m128i bck_offset = _mm_set1_epi32(conv_params->bck_offset);
-
- int y_qn = subpel_y_qn;
- for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
- const int32_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
- const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
- assert(filter_idx < SUBPEL_SHIFTS);
- const int16_t *filter =
- av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
-
- // Load up coefficients for the filter and sign-extend to 32-bit precision
- // (to do so, calculate sign bits and then interleave)
- const __m128i zero = _mm_castps_si128(_mm_setzero_ps());
- const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
- const __m128i csign0716 = _mm_cmplt_epi16(coeff0716, zero);
- const __m128i coeff03 = _mm_unpacklo_epi16(coeff0716, csign0716);
- const __m128i coeff47 = _mm_unpackhi_epi16(coeff0716, csign0716);
-
- int x;
- for (x = 0; x <= w - 4; x += 4) {
- const int32_t *const src0 = src_y + x * src_stride;
- const int32_t *const src1 = src0 + 1 * src_stride;
- const int32_t *const src2 = src0 + 2 * src_stride;
- const int32_t *const src3 = src0 + 3 * src_stride;
-
- // Load the source data for the three rows, adding the three registers of
- // convolved products to one as we go (conv0..conv3) to avoid the
- // register pressure getting too high.
- const __m128i conv0 = convolve_32_8(src0, coeff03, coeff47);
- const __m128i conv1 = convolve_32_8(src1, coeff03, coeff47);
- const __m128i conv2 = convolve_32_8(src2, coeff03, coeff47);
- const __m128i conv3 = convolve_32_8(src3, coeff03, coeff47);
-
- // Now reduce horizontally to get one lane for each result
- const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
- const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
- const __m128i conv = _mm_hadd_epi32(conv01, conv23);
-
- // Divide down by (1 << round_1), rounding to nearest and subtract sub32.
- const __m128i shifted =
- _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
- const __m128i subbed = _mm_sub_epi32(shifted, sub);
-
- int32_t *dst_x = dst + y * dst_stride + x;
- __m128i result;
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average) {
- __m128i tmp = _mm_loadu_si128((__m128i *)dst_x);
- tmp = _mm_add_epi32(_mm_mullo_epi32(tmp, fwd_offset),
- _mm_mullo_epi32(subbed, bck_offset));
- result = _mm_srai_epi32(tmp, DIST_PRECISION_BITS);
- } else {
- result = subbed;
- }
- } else {
- result =
- (conv_params->do_average)
- ? _mm_srai_epi32(
- _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x)),
- 1)
- : subbed;
- }
-
- _mm_storeu_si128((__m128i *)dst_x, result);
- }
- for (; x < w; ++x) {
- const int32_t *src_x = src_y + x * src_stride;
- CONV_BUF_TYPE sum = 1 << offset_bits;
- for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - sub32;
- if (conv_params->use_jnt_comp_avg) {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
- dst[y * dst_stride + x] = tmp >> DIST_PRECISION_BITS;
- } else {
- dst[y * dst_stride + x] = res;
- }
- } else {
- if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- tmp += res;
- dst[y * dst_stride + x] = tmp >> 1;
- } else {
- dst[y * dst_stride + x] = res;
- }
- }
- }
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
-
-#if CONFIG_LOWPRECISION_BLEND
void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride,
uint8_t *dst8, int dst8_stride, int w, int h,
InterpFilterParams *filter_params_x,
@@ -708,41 +301,6 @@
vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn,
filter_params_y, conv_params, 8);
}
-#else
-void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_qn, const int x_step_qn,
- const int subpel_y_qn, const int y_step_qn,
- ConvolveParams *conv_params) {
- // TODO(yaowu): remove unnecessary initializations
- int32_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE] = { 0 };
- int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
- filter_params_y->taps;
-
- const int xtaps = filter_params_x->taps;
- const int ytaps = filter_params_y->taps;
-
- const int fo_vert = ytaps / 2 - 1;
-
- // horizontal filter
- if (xtaps == 8)
- hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn,
- x_step_qn, filter_params_x, conv_params->round_0);
- else
- hfilter(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn,
- x_step_qn, filter_params_x, conv_params->round_0);
-
- // vertical filter (input is transposed)
- if (ytaps == 8)
- vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
- filter_params_y, conv_params, 8);
- else
- vfilter(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
- filter_params_y, conv_params, 8);
-}
-#endif
// An wrapper to generate the SHUFPD instruction with __m128i types (just
// writing _mm_shuffle_pd at the callsites gets a bit ugly because of the
@@ -753,104 +311,9 @@
return _mm_castpd_si128(_mm_shuffle_pd(ad, bd, 0));
}
-#if !CONFIG_LOWPRECISION_BLEND
-// The horizontal filter for av1_highbd_convolve_2d_scale_sse4_1. This
-// is the more general version, supporting 10 and 12 tap filters. For
-// 8-tap filters, use hfilter8.
-static void highbd_hfilter(const uint16_t *src, int src_stride, int32_t *dst,
- int w, int h, int subpel_x_qn, int x_step_qn,
- const InterpFilterParams *filter_params,
- unsigned round, int bd) {
- const int ntaps = filter_params->taps;
- assert(ntaps == 10 || ntaps == 12);
-
- src -= ntaps / 2 - 1;
-
- // Construct a mask with which we'll AND filter coefficients 89ab89ab to zero
- // out the unneeded entries.
- const __m128i hicoeff_mask = make_1012_mask(ntaps);
-
- int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
- const __m128i round_add = _mm_set1_epi32(round_add32);
- const __m128i round_shift = extend_32_to_128(round);
-
- int x_qn = subpel_x_qn;
- for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
- const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
- const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
- assert(filter_idx < SUBPEL_SHIFTS);
- const int16_t *filter =
- av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
-
- // The "lo" coefficients are coefficients 0..7. For a 12-tap filter, the
- // "hi" coefficients are arranged as 89ab89ab. For a 10-tap filter, they
- // are masked out with hicoeff_mask.
- const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
- const __m128i coeffhi = load_and_128i(filter + 8, hicoeff_mask);
-
- int y;
- for (y = 0; y <= h - 4; y += 4) {
- const uint16_t *const src0 = src_col + y * src_stride;
- const uint16_t *const src1 = src0 + 1 * src_stride;
- const uint16_t *const src2 = src0 + 2 * src_stride;
- const uint16_t *const src3 = src0 + 3 * src_stride;
-
- // Load up source data. This is 16-bit input data, so each load gets 8
- // pixels (we need at most 12)
- const __m128i data0lo = _mm_loadu_si128((__m128i *)src0);
- const __m128i data1lo = _mm_loadu_si128((__m128i *)src1);
- const __m128i data2lo = _mm_loadu_si128((__m128i *)src2);
- const __m128i data3lo = _mm_loadu_si128((__m128i *)src3);
- const __m128i data0hi = _mm_loadu_si128((__m128i *)(src0 + 8));
- const __m128i data1hi = _mm_loadu_si128((__m128i *)(src1 + 8));
- const __m128i data2hi = _mm_loadu_si128((__m128i *)(src2 + 8));
- const __m128i data3hi = _mm_loadu_si128((__m128i *)(src3 + 8));
-
- // The "hi" data has rubbish in the top half so interleave pairs together
- // to minimise the calculation we need to do.
- const __m128i data01hi = mm_shuffle0_si128(data0hi, data1hi);
- const __m128i data23hi = mm_shuffle0_si128(data2hi, data3hi);
-
- // Multiply by coefficients
- const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
- const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
- const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
- const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
- const __m128i conv01hi = _mm_madd_epi16(data01hi, coeffhi);
- const __m128i conv23hi = _mm_madd_epi16(data23hi, coeffhi);
-
- // Reduce horizontally and add
- const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
- const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
- const __m128i convlo = _mm_hadd_epi32(conv01lo, conv23lo);
- const __m128i convhi = _mm_hadd_epi32(conv01hi, conv23hi);
- const __m128i conv = _mm_add_epi32(convlo, convhi);
-
- // Divide down by (1 << round), rounding to nearest.
- const __m128i shifted =
- _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
-
- // Write transposed to the output
- _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted);
- }
- for (; y < h; ++y) {
- const uint16_t *const src_row = src_col + y * src_stride;
-
- int32_t sum = (1 << (bd + FILTER_BITS - 1));
- for (int k = 0; k < ntaps; ++k) {
- sum += filter[k] * src_row[k];
- }
-
- dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
- }
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
-
// A specialised version of hfilter, the horizontal filter for
// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
// filters.
-#if CONFIG_LOWPRECISION_BLEND
static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst,
int w, int h, int subpel_x_qn, int x_step_qn,
const InterpFilterParams *filter_params,
@@ -919,77 +382,6 @@
}
}
}
-#else // CONFIG_LOWPRECISION_BLEND
-static void highbd_hfilter8(const uint16_t *src, int src_stride, int32_t *dst,
- int w, int h, int subpel_x_qn, int x_step_qn,
- const InterpFilterParams *filter_params,
- unsigned round, int bd) {
- const int ntaps = 8;
-
- src -= ntaps / 2 - 1;
-
- int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
- const __m128i round_add = _mm_set1_epi32(round_add32);
- const __m128i round_shift = extend_32_to_128(round);
-
- int x_qn = subpel_x_qn;
- for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
- const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
- const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
- assert(filter_idx < SUBPEL_SHIFTS);
- const int16_t *filter =
- av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
-
- // Load the filter coefficients
- const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
-
- int y;
- for (y = 0; y <= h - 4; y += 4) {
- const uint16_t *const src0 = src_col + y * src_stride;
- const uint16_t *const src1 = src0 + 1 * src_stride;
- const uint16_t *const src2 = src0 + 2 * src_stride;
- const uint16_t *const src3 = src0 + 3 * src_stride;
-
- // Load up source data. This is 16-bit input data, so each load gets the 8
- // pixels we need.
- const __m128i data0lo = _mm_loadu_si128((__m128i *)src0);
- const __m128i data1lo = _mm_loadu_si128((__m128i *)src1);
- const __m128i data2lo = _mm_loadu_si128((__m128i *)src2);
- const __m128i data3lo = _mm_loadu_si128((__m128i *)src3);
-
- // Multiply by coefficients
- const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
- const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
- const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
- const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
-
- // Reduce horizontally and add
- const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
- const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
- const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
-
- // Divide down by (1 << round), rounding to nearest.
- const __m128i shifted =
- _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
-
- // Write transposed to the output
- _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted);
- }
- for (; y < h; ++y) {
- const uint16_t *const src_row = src_col + y * src_stride;
-
- int32_t sum = (1 << (bd + FILTER_BITS - 1));
- for (int k = 0; k < ntaps; ++k) {
- sum += filter[k] * src_row[k];
- }
-
- dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
- }
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
-
-#if CONFIG_LOWPRECISION_BLEND
// A specialised version of vfilter, the vertical filter for
// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
// filters.
@@ -1160,39 +552,3 @@
highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
filter_params_y, conv_params, bd);
}
-#else // CONFIG_LOWPRECISION_BLEND
-void av1_highbd_convolve_2d_scale_sse4_1(
- const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride,
- int w, int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_qn,
- const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
- ConvolveParams *conv_params, int bd) {
- // TODO(yaowu): Move this out of stack
- DECLARE_ALIGNED(16, int32_t,
- tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
- int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
- filter_params_y->taps;
- const int xtaps = filter_params_x->taps;
- const int ytaps = filter_params_y->taps;
- const int fo_vert = ytaps / 2 - 1;
-
- memset(tmp, 0, sizeof(tmp));
- // horizontal filter
- if (xtaps == 8)
- highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h,
- subpel_x_qn, x_step_qn, filter_params_x,
- conv_params->round_0, bd);
- else
- highbd_hfilter(src - fo_vert * src_stride, src_stride, tmp, w, im_h,
- subpel_x_qn, x_step_qn, filter_params_x,
- conv_params->round_0, bd);
-
- // vertical filter (input is transposed)
- if (ytaps == 8)
- vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
- filter_params_y, conv_params, bd);
- else
- vfilter(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
- filter_params_y, conv_params, bd);
-}
-#endif // CONFIG_LOWPRECISION_BLEND
diff --git a/av1/common/x86/convolve_2d_avx2.c b/av1/common/x86/convolve_2d_avx2.c
index 168e699..3e523ac 100644
--- a/av1/common/x86/convolve_2d_avx2.c
+++ b/av1/common/x86/convolve_2d_avx2.c
@@ -21,137 +21,6 @@
#include "aom_dsp/x86/synonyms.h"
#include "av1/common/convolve.h"
-#if !CONFIG_LOWPRECISION_BLEND
-void av1_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- const int bd = 8;
- (void)dst0;
- (void)dst_stride0;
-
- DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = 8;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
- const int do_average = conv_params->do_average;
-
- __m256i filt[4], s[8], coeffs_x[4], coeffs_y[4];
-
- assert(conv_params->round_0 > 0);
-
- filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
-
- prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
-
- for (j = 0; j < w; j += 8) {
- /* Horizontal filter */
- {
- const __m256i round_const =
- _mm256_set1_epi16(((1 << (conv_params->round_0 - 1)) >> 1) +
- (1 << (bd + FILTER_BITS - 2)));
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
-
- for (i = 0; i < im_h; i += 2) {
- __m256i data = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
- if (i + 1 < im_h)
- data = _mm256_inserti128_si256(
- data,
- _mm_loadu_si128(
- (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
- 1);
- __m256i res = convolve_lowbd_x(data, coeffs_x, filt);
-
- res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
-
- // 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15
- _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
- }
- }
-
- /* Vertical filter */
- {
- const __m256i round_const = _mm256_set1_epi32(
- ((1 << conv_params->round_1) >> 1) -
- (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
- __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
- __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
- __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
- __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
- __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
- __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
-
- s[0] = _mm256_unpacklo_epi16(s0, s1);
- s[1] = _mm256_unpacklo_epi16(s2, s3);
- s[2] = _mm256_unpacklo_epi16(s4, s5);
-
- s[4] = _mm256_unpackhi_epi16(s0, s1);
- s[5] = _mm256_unpackhi_epi16(s2, s3);
- s[6] = _mm256_unpackhi_epi16(s4, s5);
-
- for (i = 0; i < h; i += 2) {
- const int16_t *data = &im_block[i * im_stride];
-
- const __m256i s6 =
- _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
- const __m256i s7 =
- _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
-
- s[3] = _mm256_unpacklo_epi16(s6, s7);
- s[7] = _mm256_unpackhi_epi16(s6, s7);
-
- const __m256i res_a = convolve(s, coeffs_y);
- const __m256i res_b = convolve(s + 4, coeffs_y);
-
- const __m256i res_a_round =
- _mm256_sra_epi32(_mm256_add_epi32(res_a, round_const), round_shift);
- const __m256i res_b_round =
- _mm256_sra_epi32(_mm256_add_epi32(res_b, round_const), round_shift);
-
- if (w - j > 4) {
- const __m256i res_ax =
- _mm256_permute2x128_si256(res_a_round, res_b_round, 0x20);
- const __m256i res_bx =
- _mm256_permute2x128_si256(res_a_round, res_b_round, 0x31);
-
- add_store_aligned_256(&dst[i * dst_stride + j], &res_ax, do_average);
- add_store_aligned_256(&dst[i * dst_stride + j + dst_stride], &res_bx,
- do_average);
- } else {
- const __m128i res_ax = _mm256_extracti128_si256(res_a_round, 0);
- const __m128i res_bx = _mm256_extracti128_si256(res_a_round, 1);
-
- add_store(&dst[i * dst_stride + j], &res_ax, do_average);
- add_store(&dst[i * dst_stride + j + dst_stride], &res_bx, do_average);
- }
-
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
-
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
- }
- }
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
-
void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
InterpFilterParams *filter_params_x,
diff --git a/av1/common/x86/convolve_2d_sse2.c b/av1/common/x86/convolve_2d_sse2.c
index e814675..174b6dd 100644
--- a/av1/common/x86/convolve_2d_sse2.c
+++ b/av1/common/x86/convolve_2d_sse2.c
@@ -18,201 +18,6 @@
#include "aom_dsp/x86/convolve_sse2.h"
#include "av1/common/convolve.h"
-#if !CONFIG_LOWPRECISION_BLEND
-void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- const int bd = 8;
- (void)dst0;
- (void)dst_stride0;
-
- DECLARE_ALIGNED(16, int16_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = MAX_SB_SIZE;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const int do_average = conv_params->do_average;
- const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
- const __m128i zero = _mm_setzero_si128();
-
- assert(conv_params->round_0 > 0);
-
- /* Horizontal filter */
- {
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const = _mm_set1_epi32(
- ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
- for (i = 0; i < im_h; ++i) {
- for (j = 0; j < w; j += 8) {
- const __m128i data =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-
- // Filter even-index pixels
- const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
- _mm_add_epi32(res_2, res_6));
- res_even =
- _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
-
- // Filter odd-index pixels
- const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
- _mm_add_epi32(res_3, res_7));
- res_odd =
- _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
-
- // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
- __m128i res = _mm_packs_epi32(res_even, res_odd);
- _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
- }
- }
- }
-
- /* Vertical filter */
- {
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const = _mm_set1_epi32(
- ((1 << conv_params->round_1) >> 1) -
- (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- // Filter even-index pixels
- const int16_t *data = &im_block[i * im_stride + j];
- const __m128i src_0 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
- *(__m128i *)(data + 1 * im_stride));
- const __m128i src_2 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
- *(__m128i *)(data + 3 * im_stride));
- const __m128i src_4 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
- *(__m128i *)(data + 5 * im_stride));
- const __m128i src_6 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
- *(__m128i *)(data + 7 * im_stride));
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
- *(__m128i *)(data + 1 * im_stride));
- const __m128i src_3 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
- *(__m128i *)(data + 3 * im_stride));
- const __m128i src_5 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
- *(__m128i *)(data + 5 * im_stride));
- const __m128i src_7 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
- *(__m128i *)(data + 7 * im_stride));
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
- const __m128i res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- const __m128i res_hi_round =
- _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
- // Accumulate values into the destination buffer
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- if (do_average) {
- _mm_storeu_si128(
- p + 0,
- _mm_srai_epi32(
- _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round), 1));
- _mm_storeu_si128(
- p + 1,
- _mm_srai_epi32(
- _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round), 1));
- } else {
- _mm_storeu_si128(p + 0, res_lo_round);
- _mm_storeu_si128(p + 1, res_hi_round);
- }
- }
- }
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
-
void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
InterpFilterParams *filter_params_x,
@@ -411,138 +216,6 @@
}
}
-#if !CONFIG_LOWPRECISION_BLEND
-void av1_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
- uint8_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- (void)filter_params_x;
- (void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
- (void)dst0;
- (void)dst_stride0;
-
- const int bits =
- FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
- const int do_average = conv_params->do_average;
- const __m128i zero = _mm_setzero_si128();
- const __m128i left_shift = _mm_cvtsi32_si128(bits);
- int i, j;
-
- if (!(w % 16)) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]);
- const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
- const __m128i d16_1 = _mm_unpackhi_epi8(d8, zero);
- __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero);
- __m128i d32_1 = _mm_unpackhi_epi16(d16_0, zero);
- __m128i d32_2 = _mm_unpacklo_epi16(d16_1, zero);
- __m128i d32_3 = _mm_unpackhi_epi16(d16_1, zero);
-
- d32_0 = _mm_sll_epi32(d32_0, left_shift);
- d32_1 = _mm_sll_epi32(d32_1, left_shift);
- d32_2 = _mm_sll_epi32(d32_2, left_shift);
- d32_3 = _mm_sll_epi32(d32_3, left_shift);
-
- __m128i *const p = (__m128i *)&dst[j];
- if (do_average) {
- _mm_storeu_si128(
- p + 0,
- _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 0), d32_0), 1));
- _mm_storeu_si128(
- p + 1,
- _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 1), d32_1), 1));
- _mm_storeu_si128(
- p + 2,
- _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 2), d32_2), 1));
- _mm_storeu_si128(
- p + 3,
- _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 3), d32_3), 1));
- } else {
- _mm_storeu_si128(p + 0, d32_0);
- _mm_storeu_si128(p + 1, d32_1);
- _mm_storeu_si128(p + 2, d32_2);
- _mm_storeu_si128(p + 3, d32_3);
- }
- }
- src += src_stride;
- dst += dst_stride;
- }
- } else if (!(w % 8)) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
- const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
- __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero);
- __m128i d32_1 = _mm_unpackhi_epi16(d16_0, zero);
-
- d32_0 = _mm_sll_epi32(d32_0, left_shift);
- d32_1 = _mm_sll_epi32(d32_1, left_shift);
-
- __m128i *const p = (__m128i *)&dst[j];
- if (do_average) {
- _mm_storeu_si128(
- p + 0,
- _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 0), d32_0), 1));
- _mm_storeu_si128(
- p + 1,
- _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 1), d32_1), 1));
- } else {
- _mm_storeu_si128(p + 0, d32_0);
- _mm_storeu_si128(p + 1, d32_1);
- }
- }
- src += src_stride;
- dst += dst_stride;
- }
- } else if (!(w % 4)) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 4) {
- const __m128i d8 = _mm_cvtsi32_si128(*(const int *)&src[j]);
- const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
- __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero);
-
- d32_0 = _mm_sll_epi32(d32_0, left_shift);
- __m128i *const p = (__m128i *)&dst[j];
- if (do_average) {
- _mm_storeu_si128(
- p, _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p), d32_0), 1));
- } else {
- _mm_storeu_si128(p, d32_0);
- }
- }
- src += src_stride;
- dst += dst_stride;
- }
- } else {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 2) {
- const __m128i d8 = _mm_cvtsi32_si128(*(const int *)&src[j]);
- const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
- __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero);
-
- d32_0 = _mm_sll_epi32(d32_0, left_shift);
- __m128i *const p = (__m128i *)&dst[j];
- if (do_average) {
- _mm_storel_epi64(
- p, _mm_srai_epi32(_mm_add_epi32(_mm_loadl_epi64(p), d32_0), 1));
- } else {
- _mm_storel_epi64(p, d32_0);
- }
- }
- src += src_stride;
- dst += dst_stride;
- }
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
-
static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
__m128i s[8];
s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
@@ -685,7 +358,6 @@
}
}
-#if CONFIG_LOWPRECISION_BLEND
void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
uint8_t *dst0, int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
@@ -802,228 +474,3 @@
}
}
}
-#else
-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
- uint8_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- (void)filter_params_x;
- (void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
- (void)dst0;
- (void)dst_stride0;
-
- const int bits =
- FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
- const int do_average = conv_params->do_average;
- const __m128i zero = _mm_setzero_si128();
- const __m128i left_shift = _mm_cvtsi32_si128(bits);
- int i, j;
-
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m128i wt0 = _mm_set1_epi32(w0);
- const __m128i wt1 = _mm_set1_epi32(w1);
-
- if (!(w % 16)) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]);
- const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
- const __m128i d16_1 = _mm_unpackhi_epi8(d8, zero);
- __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero);
- __m128i d32_1 = _mm_unpackhi_epi16(d16_0, zero);
- __m128i d32_2 = _mm_unpacklo_epi16(d16_1, zero);
- __m128i d32_3 = _mm_unpackhi_epi16(d16_1, zero);
-
- __m128i *const p = (__m128i *)&dst[j];
-
- if (conv_params->use_jnt_comp_avg) {
- if (do_average) {
- __m128i mul = _mm_madd_epi16(d32_0, wt1);
- __m128i weighted_res = _mm_sll_epi32(mul, left_shift);
- __m128i tmp = _mm_loadu_si128(p + 0);
- __m128i sum = _mm_add_epi32(_mm_madd_epi16(tmp, wt0), weighted_res);
- d32_0 = _mm_srai_epi32(sum, DIST_PRECISION_BITS);
-
- mul = _mm_madd_epi16(d32_1, wt1);
- weighted_res = _mm_sll_epi32(mul, left_shift);
- tmp = _mm_loadu_si128(p + 1);
- sum = _mm_add_epi32(_mm_madd_epi16(tmp, wt0), weighted_res);
- d32_1 = _mm_srai_epi32(sum, DIST_PRECISION_BITS);
-
- mul = _mm_madd_epi16(d32_2, wt1);
- weighted_res = _mm_sll_epi32(mul, left_shift);
- tmp = _mm_loadu_si128(p + 2);
- sum = _mm_add_epi32(_mm_madd_epi16(tmp, wt0), weighted_res);
- d32_2 = _mm_srai_epi32(sum, DIST_PRECISION_BITS);
-
- mul = _mm_madd_epi16(d32_3, wt1);
- weighted_res = _mm_sll_epi32(mul, left_shift);
- tmp = _mm_loadu_si128(p + 3);
- sum = _mm_add_epi32(_mm_madd_epi16(tmp, wt0), weighted_res);
- d32_3 = _mm_srai_epi32(sum, DIST_PRECISION_BITS);
- } else {
- d32_0 = _mm_sll_epi32(d32_0, left_shift);
- d32_1 = _mm_sll_epi32(d32_1, left_shift);
- d32_2 = _mm_sll_epi32(d32_2, left_shift);
- d32_3 = _mm_sll_epi32(d32_3, left_shift);
- }
- } else {
- if (do_average) {
- d32_0 =
- _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 0),
- _mm_sll_epi32(d32_0, left_shift)),
- 1);
- d32_1 =
- _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 1),
- _mm_sll_epi32(d32_1, left_shift)),
- 1);
- d32_2 =
- _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 2),
- _mm_sll_epi32(d32_2, left_shift)),
- 1);
- d32_3 =
- _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 3),
- _mm_sll_epi32(d32_3, left_shift)),
- 1);
- } else {
- d32_0 = _mm_sll_epi32(d32_0, left_shift);
- d32_1 = _mm_sll_epi32(d32_1, left_shift);
- d32_2 = _mm_sll_epi32(d32_2, left_shift);
- d32_3 = _mm_sll_epi32(d32_3, left_shift);
- }
- }
-
- _mm_storeu_si128(p + 0, d32_0);
- _mm_storeu_si128(p + 1, d32_1);
- _mm_storeu_si128(p + 2, d32_2);
- _mm_storeu_si128(p + 3, d32_3);
- }
- src += src_stride;
- dst += dst_stride;
- }
- } else if (!(w % 8)) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
- const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
- __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero);
- __m128i d32_1 = _mm_unpackhi_epi16(d16_0, zero);
-
- __m128i *const p = (__m128i *)&dst[j];
- if (conv_params->use_jnt_comp_avg) {
- if (do_average) {
- __m128i mul = _mm_madd_epi16(d32_0, wt1);
- __m128i weighted_res = _mm_sll_epi32(mul, left_shift);
- __m128i tmp = _mm_loadu_si128(p + 0);
- __m128i sum = _mm_add_epi32(_mm_madd_epi16(tmp, wt0), weighted_res);
- d32_0 = _mm_srai_epi32(sum, DIST_PRECISION_BITS);
-
- mul = _mm_madd_epi16(d32_1, wt1);
- weighted_res = _mm_sll_epi32(mul, left_shift);
- tmp = _mm_loadu_si128(p + 1);
- sum = _mm_add_epi32(_mm_madd_epi16(tmp, wt0), weighted_res);
- d32_1 = _mm_srai_epi32(sum, DIST_PRECISION_BITS);
- } else {
- d32_0 = _mm_sll_epi32(d32_0, left_shift);
- d32_1 = _mm_sll_epi32(d32_1, left_shift);
- }
- } else {
- if (do_average) {
- d32_0 =
- _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 0),
- _mm_sll_epi32(d32_0, left_shift)),
- 1);
- d32_1 =
- _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 1),
- _mm_sll_epi32(d32_1, left_shift)),
- 1);
- } else {
- d32_0 = _mm_sll_epi32(d32_0, left_shift);
- d32_1 = _mm_sll_epi32(d32_1, left_shift);
- }
- }
-
- _mm_storeu_si128(p + 0, d32_0);
- _mm_storeu_si128(p + 1, d32_1);
- }
- src += src_stride;
- dst += dst_stride;
- }
- } else if (!(w % 4)) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 4) {
- const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
- const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
- __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero);
-
- __m128i *const p = (__m128i *)&dst[j];
- if (conv_params->use_jnt_comp_avg) {
- if (do_average) {
- __m128i mul = _mm_madd_epi16(d32_0, wt1);
- __m128i weighted_res = _mm_sll_epi32(mul, left_shift);
- __m128i tmp = _mm_loadu_si128(p + 0);
- __m128i sum = _mm_add_epi32(_mm_madd_epi16(tmp, wt0), weighted_res);
- d32_0 = _mm_srai_epi32(sum, DIST_PRECISION_BITS);
- } else {
- d32_0 = _mm_sll_epi32(d32_0, left_shift);
- }
- } else {
- if (do_average) {
- d32_0 =
- _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 0),
- _mm_sll_epi32(d32_0, left_shift)),
- 1);
- } else {
- d32_0 = _mm_sll_epi32(d32_0, left_shift);
- }
- }
-
- _mm_storeu_si128(p, d32_0);
- }
- src += src_stride;
- dst += dst_stride;
- }
- } else {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 2) {
- const __m128i d8 = _mm_cvtsi32_si128(*(const int *)&src[j]);
- const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
- __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero);
-
- __m128i *const p = (__m128i *)&dst[j];
- if (conv_params->use_jnt_comp_avg) {
- if (do_average) {
- __m128i mul = _mm_madd_epi16(d32_0, wt1);
- __m128i weighted_res = _mm_sll_epi32(mul, left_shift);
- __m128i tmp = _mm_loadl_epi64(p);
- __m128i sum = _mm_add_epi32(_mm_madd_epi16(tmp, wt0), weighted_res);
- d32_0 = _mm_srai_epi32(sum, DIST_PRECISION_BITS);
- } else {
- d32_0 = _mm_sll_epi32(d32_0, left_shift);
- }
- } else {
- if (do_average) {
- d32_0 =
- _mm_srai_epi32(_mm_add_epi32(_mm_loadl_epi64(p),
- _mm_sll_epi32(d32_0, left_shift)),
- 1);
- } else {
- d32_0 = _mm_sll_epi32(d32_0, left_shift);
- }
- }
-
- _mm_storel_epi64(p, d32_0);
- }
- src += src_stride;
- dst += dst_stride;
- }
- }
-}
-#endif
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index 7d5b2c7..7e5320b 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -137,67 +137,6 @@
*(uint32_t *)dst = _mm_cvtsi128_si32(x);
}
-#if !CONFIG_LOWPRECISION_BLEND
-void av1_convolve_rounding_avx2(const int32_t *src, int src_stride,
- uint8_t *dst, int dst_stride, int w, int h,
- int bits) {
- const __m256i rnd_num = _mm256_set1_epi32((int32_t)((1 << bits) >> 1));
- const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
-
- if (w > 64) { // width = 128
- do {
- cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 4);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 32) { // width = 64
- do {
- cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 2);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 16) { // width = 32
- do {
- cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 1);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 8) { // width = 16
- do {
- cal_rounding_16_avx2(src, dst, &rnd_num, bits);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 4) { // width = 8
- do {
- cal_rounding_8_avx2(src, dst, &rnd_num, bits);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 2) { // width = 4
- do {
- cal_rounding_4_sse2(src, dst, &rnd_num_sse2, bits);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else { // width = 2
- do {
- dst[0] = clip_pixel(ROUND_POWER_OF_TWO(src[0], bits));
- dst[1] = clip_pixel(ROUND_POWER_OF_TWO(src[1], bits));
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
-
static INLINE void cal_highbd_rounding_32xn_avx2(const int32_t *src,
uint16_t *dst,
const __m256i *rnd, int shift,
@@ -281,227 +220,6 @@
_mm_storel_epi64((__m128i *)dst, x);
}
-#if !CONFIG_LOWPRECISION_BLEND
-void av1_highbd_convolve_rounding_avx2(const int32_t *src, int src_stride,
- uint8_t *dst8, int dst_stride, int w,
- int h, int bits, int bd) {
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- const __m256i rnd_num = _mm256_set1_epi32((int32_t)((1 << bits) >> 1));
- const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
-
- if (w > 64) { // width = 128
- do {
- cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 4, bd);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 32) { // width = 64
- do {
- cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 2, bd);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 16) { // width = 32
- do {
- cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 1, bd);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 8) { // width = 16
- do {
- cal_highbd_rounding_16_avx2(src, dst, &rnd_num, bits, bd);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 4) { // width = 8
- do {
- cal_highbd_rounding_8_avx2(src, dst, &rnd_num, bits, bd);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 2) { // width = 4
- do {
- cal_highbd_rounding_4_sse2(src, dst, &rnd_num_sse2, bits, bd);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else { // width = 2
- do {
- dst[0] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[0], bits), bd);
- dst[1] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[1], bits), bd);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- }
-}
-
-void av1_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride;
- // +1 to compensate for dividing the filter coeffs by 2
- const int left_shift = FILTER_BITS - conv_params->round_0 + 1;
- const __m256i round_const =
- _mm256_set1_epi32((1 << conv_params->round_1) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
- const int do_average = conv_params->do_average;
- __m256i coeffs[4], s[8];
-
- assert((FILTER_BITS - conv_params->round_0) >= 0);
-
- prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
-
- (void)conv_params;
- (void)filter_params_x;
- (void)subpel_x_q4;
- (void)dst0;
- (void)dst_stride0;
-
- for (j = 0; j < w; j += 16) {
- const uint8_t *data = &src_ptr[j];
- __m256i src6;
-
- // Load lines a and b. Line a to lower 128, line b to upper 128
- const __m256i src_01a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- 0x20);
-
- const __m256i src_12a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- 0x20);
-
- const __m256i src_23a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- 0x20);
-
- const __m256i src_34a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- 0x20);
-
- const __m256i src_45a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- 0x20);
-
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
- const __m256i src_56a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- src6, 0x20);
-
- s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
- s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
- s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-
- s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
- s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
- s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
-
- for (i = 0; i < h; i += 2) {
- data = &src_ptr[i * src_stride + j];
- const __m256i src_67a = _mm256_permute2x128_si256(
- src6,
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- 0x20);
-
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
- const __m256i src_78a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- src6, 0x20);
-
- s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
- s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
-
- const __m256i res_lo = convolve_lowbd(s, coeffs);
-
- const __m256i res_lo_0_32b =
- _mm256_cvtepi16_epi32(_mm256_castsi256_si128(res_lo));
- const __m256i res_lo_0_shift =
- _mm256_slli_epi32(res_lo_0_32b, left_shift);
- const __m256i res_lo_0_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
-
- // Accumulate values into the destination buffer
- add_store_aligned_256(&dst[i * dst_stride + j], &res_lo_0_round,
- do_average);
-
- const __m256i res_lo_1_32b =
- _mm256_cvtepi16_epi32(_mm256_extracti128_si256(res_lo, 1));
- const __m256i res_lo_1_shift =
- _mm256_slli_epi32(res_lo_1_32b, left_shift);
- const __m256i res_lo_1_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
-
- add_store_aligned_256(&dst[i * dst_stride + j + dst_stride],
- &res_lo_1_round, do_average);
-
- if (w - j > 8) {
- const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
-
- const __m256i res_hi_0_32b =
- _mm256_cvtepi16_epi32(_mm256_castsi256_si128(res_hi));
- const __m256i res_hi_0_shift =
- _mm256_slli_epi32(res_hi_0_32b, left_shift);
- const __m256i res_hi_0_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
-
- add_store_aligned_256(&dst[i * dst_stride + j + 8], &res_hi_0_round,
- do_average);
-
- const __m256i res_hi_1_32b =
- _mm256_cvtepi16_epi32(_mm256_extracti128_si256(res_hi, 1));
- const __m256i res_hi_1_shift =
- _mm256_slli_epi32(res_hi_1_32b, left_shift);
- const __m256i res_hi_1_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
-
- add_store_aligned_256(&dst[i * dst_stride + j + 8 + dst_stride],
- &res_hi_1_round, do_average);
- }
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
-
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
- }
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
-
void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
InterpFilterParams *filter_params_x,
@@ -658,72 +376,6 @@
}
}
}
-#if !CONFIG_LOWPRECISION_BLEND
-void av1_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- int i, j;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_horiz;
- const int bits = FILTER_BITS - conv_params->round_1;
- const int do_average = conv_params->do_average;
- __m256i filt[4], coeffs[4];
-
- assert(bits >= 0);
- assert(conv_params->round_0 > 0);
-
- filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
-
- prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
-
- const __m256i round_const =
- _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
-
- (void)filter_params_y;
- (void)subpel_y_q4;
- (void)dst0;
- (void)dst_stride0;
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18 19
- // 20 21 22 23
- const __m256i data = _mm256_permute4x64_epi64(
- _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]),
- _MM_SHUFFLE(2, 1, 1, 0));
-
- __m256i res = convolve_lowbd_x(data, coeffs, filt);
-
- res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
-
- const __m256i res_lo_round =
- _mm256_cvtepi16_epi32(_mm256_castsi256_si128(res));
- const __m256i res_hi_round =
- _mm256_cvtepi16_epi32(_mm256_extracti128_si256(res, 1));
-
- const __m256i res_lo_shift = _mm256_slli_epi32(res_lo_round, bits);
- const __m256i res_hi_shift = _mm256_slli_epi32(res_hi_round, bits);
-
- // Accumulate values into the destination buffer
- add_store_aligned_256(&dst[i * dst_stride + j], &res_lo_shift,
- do_average);
- if (w - j > 8) {
- add_store_aligned_256(&dst[i * dst_stride + j + 8], &res_hi_shift,
- do_average);
- }
- }
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c
index 87a114a..d8b4425 100644
--- a/av1/common/x86/convolve_sse2.c
+++ b/av1/common/x86/convolve_sse2.c
@@ -75,252 +75,6 @@
return convolve(ss, coeffs);
}
-#if !CONFIG_LOWPRECISION_BLEND
-void av1_convolve_y_sse2(const uint8_t *src, int src_stride,
- const uint8_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const uint8_t *src_ptr = src - fo_vert * src_stride;
- const int bits = FILTER_BITS - conv_params->round_0;
- const __m128i left_shift = _mm_cvtsi32_si128(bits);
- const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
- __m128i coeffs[4];
-
- (void)filter_params_x;
- (void)subpel_x_q4;
- (void)dst0;
- (void)dst_stride0;
-
- assert(bits >= 0);
-
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
-
- if (w == 4) {
- __m128i s[8], src6, res, res_shift;
- src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
- s[0] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
- s[1] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
- s[2] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
- s[3] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
- s[4] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
- s[5] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
-
- do {
- s[6] = _mm_unpacklo_epi8(
- src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
- src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
- s[7] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
-
- res = convolve_lo_y(s + 0, coeffs);
- res_shift = _mm_sll_epi32(res, left_shift);
- res_shift =
- _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
- add_store(dst, &res_shift, conv_params->do_average);
- src_ptr += src_stride;
- dst += dst_stride;
-
- res = convolve_lo_y(s + 1, coeffs);
- res_shift = _mm_sll_epi32(res, left_shift);
- res_shift =
- _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
- add_store(dst, &res_shift, conv_params->do_average);
- src_ptr += src_stride;
- dst += dst_stride;
-
- s[0] = s[2];
- s[1] = s[3];
- s[2] = s[4];
- s[3] = s[5];
- s[4] = s[6];
- s[5] = s[7];
- h -= 2;
- } while (h);
- } else {
- assert(!(w % 8));
- int j = 0;
- do {
- __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift;
- const uint8_t *data = &src_ptr[j];
-
- src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
- s[0] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
- _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
- s[1] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
- _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
- s[2] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
- _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
- s[3] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
- _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
- s[4] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
- _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
- s[5] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
-
- int i = 0;
- do {
- data = &src_ptr[i * src_stride + j];
- s[6] = _mm_unpacklo_epi8(
- src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
- src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
- s[7] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
-
- res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels
- res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels
- res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
- res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
- res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
- round_shift);
- res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
- round_shift);
- add_store(dst + i * dst_stride + j + 0, &res_lo_shift,
- conv_params->do_average);
- add_store(dst + i * dst_stride + j + 4, &res_hi_shift,
- conv_params->do_average);
- i++;
-
- res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels
- res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels
- res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
- res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
- res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
- round_shift);
- res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
- round_shift);
- add_store(dst + i * dst_stride + j + 0, &res_lo_shift,
- conv_params->do_average);
- add_store(dst + i * dst_stride + j + 4, &res_hi_shift,
- conv_params->do_average);
- i++;
-
- s[0] = s[2];
- s[1] = s[3];
- s[2] = s[4];
- s[3] = s[5];
- s[4] = s[6];
- s[5] = s[7];
- } while (i < h);
- j += 8;
- } while (j < w);
- }
-}
-
-void av1_convolve_x_sse2(const uint8_t *src, int src_stride,
- const uint8_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *src_ptr = src - fo_horiz;
- const int bits = FILTER_BITS - conv_params->round_1;
- const __m128i left_shift = _mm_cvtsi32_si128(bits);
- const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
- __m128i coeffs[4];
-
- (void)filter_params_y;
- (void)subpel_y_q4;
- (void)dst0;
- (void)dst_stride0;
-
- assert(bits >= 0);
-
- prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);
-
- if (w == 4) {
- do {
- const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
- __m128i s[4];
-
- s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
- s[1] =
- _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
- s[2] =
- _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
- s[3] =
- _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
- const __m128i res_lo = convolve_lo_x(s, coeffs);
- const __m128i res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
-
- // Accumulate values into the destination buffer
- add_store(dst, &res_lo_shift, conv_params->do_average);
- src_ptr += src_stride;
- dst += dst_stride;
- } while (--h);
- } else {
- assert(!(w % 8));
- int i = 0;
- do {
- int j = 0;
- do {
- const __m128i data =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
- __m128i s[4];
-
- // Filter even-index pixels
- s[0] = data;
- s[1] = _mm_srli_si128(data, 2);
- s[2] = _mm_srli_si128(data, 4);
- s[3] = _mm_srli_si128(data, 6);
- const __m128i res_even = convolve_lo_x(s, coeffs);
-
- // Filter odd-index pixels
- s[0] = _mm_srli_si128(data, 1);
- s[1] = _mm_srli_si128(data, 3);
- s[2] = _mm_srli_si128(data, 5);
- s[3] = _mm_srli_si128(data, 7);
- const __m128i res_odd = convolve_lo_x(s, coeffs);
-
- // Rearrange pixels back into the order 0 ... 7
- const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
- const __m128i res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- const __m128i res_hi_round =
- _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
- const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
- const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift);
-
- // Accumulate values into the destination buffer
- add_store(dst + i * dst_stride + j + 0, &res_lo_shift,
- conv_params->do_average);
- add_store(dst + i * dst_stride + j + 4, &res_hi_shift,
- conv_params->do_average);
- j += 8;
- } while (j < w);
- } while (++i < h);
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
-
void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
const uint8_t *dst, int dst_stride, int w, int h,
InterpFilterParams *filter_params_x,
diff --git a/av1/common/x86/highbd_convolve_2d_avx2.c b/av1/common/x86/highbd_convolve_2d_avx2.c
index bb2ccdd..63c1cf1 100644
--- a/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -186,239 +186,6 @@
}
}
-#if !CONFIG_LOWPRECISION_BLEND
-void av1_highbd_convolve_2d_avx2(const uint16_t *src, int src_stride,
- uint16_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- DECLARE_ALIGNED(32, int16_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = MAX_SB_SIZE;
- int i, j;
- const int do_average = conv_params->do_average;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
- (void)dst0;
- (void)dst_stride0;
- // Check that, even with 12-bit input, the intermediate values will fit
- // into an unsigned 16-bit intermediate array.
- assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
-
- /* Horizontal filter */
- {
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
-
- const __m128i coeffs_x8 = _mm_loadu_si128((__m128i *)x_filter);
- // since not all compilers yet support _mm256_set_m128i()
- const __m256i coeffs_x = _mm256_insertf128_si256(
- _mm256_castsi128_si256(coeffs_x8), coeffs_x8, 1);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m256i tmp_0 = _mm256_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs 4 5 4 5 6 7 6 7
- const __m256i tmp_1 = _mm256_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m256i coeff_01 = _mm256_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m256i coeff_23 = _mm256_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m256i coeff_45 = _mm256_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m256i coeff_67 = _mm256_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m256i round_const = _mm256_set1_epi32(
- ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
- for (i = 0; i < im_h; ++i) {
- for (j = 0; j < w; j += 16) {
- const __m256i data =
- _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
- const __m128i data2_1 =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 16]);
- const __m256i data2 = _mm256_insertf128_si256(
- _mm256_castsi128_si256(data2_1), data2_1, 1);
-
- // Filter even-index pixels
- const __m256i res_0 = _mm256_madd_epi16(data, coeff_01);
- const __m256i res_2 = _mm256_madd_epi16(
- _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13),
- data, 4),
- coeff_23);
- const __m256i res_4 = _mm256_madd_epi16(
- _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13),
- data, 8),
- coeff_45);
- const __m256i res_6 = _mm256_madd_epi16(
- _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13),
- data, 12),
- coeff_67);
-
- __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_4),
- _mm256_add_epi32(res_2, res_6));
- res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const),
- round_shift);
-
- // Filter odd-index pixels
- const __m256i res_1 = _mm256_madd_epi16(
- _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13),
- data, 2),
- coeff_01);
- const __m256i res_3 = _mm256_madd_epi16(
- _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13),
- data, 6),
- coeff_23);
- const __m256i res_5 = _mm256_madd_epi16(
- _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13),
- data, 10),
- coeff_45);
- const __m256i res_7 = _mm256_madd_epi16(
- _mm256_alignr_epi8(_mm256_permute2x128_si256(data2, data, 0x13),
- data, 14),
- coeff_67);
-
- __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_5),
- _mm256_add_epi32(res_3, res_7));
- res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const),
- round_shift);
-
- __m256i res = _mm256_packs_epi32(res_even, res_odd);
- _mm256_storeu_si256((__m256i *)&im_block[i * im_stride + j], res);
- }
- }
- }
-
- /* Vertical filter */
- {
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
-
- const __m128i coeffs_y8 = _mm_loadu_si128((__m128i *)y_filter);
- const __m256i coeffs_y = _mm256_insertf128_si256(
- _mm256_castsi128_si256(coeffs_y8), coeffs_y8, 1);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m256i tmp_0 = _mm256_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m256i tmp_1 = _mm256_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m256i coeff_01 = _mm256_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m256i coeff_23 = _mm256_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m256i coeff_45 = _mm256_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m256i coeff_67 = _mm256_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m256i round_const = _mm256_set1_epi32(
- ((1 << conv_params->round_1) >> 1) -
- (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- // Filter even-index pixels
- const int16_t *data = &im_block[i * im_stride + j];
- const __m256i src_0 =
- _mm256_unpacklo_epi16(*(__m256i *)(data + 0 * im_stride),
- *(__m256i *)(data + 1 * im_stride));
- const __m256i src_2 =
- _mm256_unpacklo_epi16(*(__m256i *)(data + 2 * im_stride),
- *(__m256i *)(data + 3 * im_stride));
- const __m256i src_4 =
- _mm256_unpacklo_epi16(*(__m256i *)(data + 4 * im_stride),
- *(__m256i *)(data + 5 * im_stride));
- const __m256i src_6 =
- _mm256_unpacklo_epi16(*(__m256i *)(data + 6 * im_stride),
- *(__m256i *)(data + 7 * im_stride));
-
- const __m256i res_0 = _mm256_madd_epi16(src_0, coeff_01);
- const __m256i res_2 = _mm256_madd_epi16(src_2, coeff_23);
- const __m256i res_4 = _mm256_madd_epi16(src_4, coeff_45);
- const __m256i res_6 = _mm256_madd_epi16(src_6, coeff_67);
-
- const __m256i res_even = _mm256_add_epi32(
- _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m256i src_1 =
- _mm256_unpackhi_epi16(*(__m256i *)(data + 0 * im_stride),
- *(__m256i *)(data + 1 * im_stride));
- const __m256i src_3 =
- _mm256_unpackhi_epi16(*(__m256i *)(data + 2 * im_stride),
- *(__m256i *)(data + 3 * im_stride));
- const __m256i src_5 =
- _mm256_unpackhi_epi16(*(__m256i *)(data + 4 * im_stride),
- *(__m256i *)(data + 5 * im_stride));
- const __m256i src_7 =
- _mm256_unpackhi_epi16(*(__m256i *)(data + 6 * im_stride),
- *(__m256i *)(data + 7 * im_stride));
-
- const __m256i res_1 = _mm256_madd_epi16(src_1, coeff_01);
- const __m256i res_3 = _mm256_madd_epi16(src_3, coeff_23);
- const __m256i res_5 = _mm256_madd_epi16(src_5, coeff_45);
- const __m256i res_7 = _mm256_madd_epi16(src_7, coeff_67);
-
- const __m256i res_odd = _mm256_add_epi32(
- _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
- const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
-
- const __m256i res_lo_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_lo, round_const), round_shift);
- const __m256i res_hi_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_hi, round_const), round_shift);
-
- // Accumulate values into the destination buffer
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- if (do_average) {
- _mm_storeu_si128(
- p + 0, _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 0),
- _mm256_extractf128_si256(
- res_lo_round, 0)),
- 1));
- _mm_storeu_si128(
- p + 1, _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 1),
- _mm256_extractf128_si256(
- res_hi_round, 0)),
- 1));
- if (w - j > 8) {
- _mm_storeu_si128(
- p + 2, _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 2),
- _mm256_extractf128_si256(
- res_lo_round, 1)),
- 1));
- _mm_storeu_si128(
- p + 3, _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p + 3),
- _mm256_extractf128_si256(
- res_hi_round, 1)),
- 1));
- }
- } else {
- _mm_storeu_si128(p + 0, _mm256_extractf128_si256(res_lo_round, 0));
- _mm_storeu_si128(p + 1, _mm256_extractf128_si256(res_hi_round, 0));
- if (w - j > 8) {
- _mm_storeu_si128(p + 2, _mm256_extractf128_si256(res_lo_round, 1));
- _mm_storeu_si128(p + 3, _mm256_extractf128_si256(res_hi_round, 1));
- }
- }
- }
- }
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
-
static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
__m256i s[4];
s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
diff --git a/av1/common/x86/highbd_convolve_2d_sse4.c b/av1/common/x86/highbd_convolve_2d_sse4.c
index 047f214..428c2c7 100644
--- a/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -21,7 +21,6 @@
#include "aom_dsp/x86/convolve_sse4_1.h"
#include "av1/common/convolve.h"
-#if CONFIG_LOWPRECISION_BLEND
void av1_highbd_jnt_convolve_2d_copy_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, InterpFilterParams *filter_params_x,
@@ -419,224 +418,3 @@
}
}
}
-#else
-void av1_highbd_jnt_convolve_2d_sse4_1(
- const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
- DECLARE_ALIGNED(16, int16_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = MAX_SB_SIZE;
- int i, j;
- const int do_average = conv_params->do_average;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m128i wt0 = _mm_set1_epi32(w0);
- const __m128i wt1 = _mm_set1_epi32(w1);
-
- (void)dst0;
- (void)dst_stride0;
-
- // Check that, even with 12-bit input, the intermediate values will fit
- // into an unsigned 16-bit intermediate array.
- assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
-
- /* Horizontal filter */
- {
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const = _mm_set1_epi32(
- ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
- for (i = 0; i < im_h; ++i) {
- for (j = 0; j < w; j += 8) {
- const __m128i data =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
- const __m128i data2 =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
-
- // Filter even-index pixels
- const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
- const __m128i res_2 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
- const __m128i res_4 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
- const __m128i res_6 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
- _mm_add_epi32(res_2, res_6));
- res_even =
- _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
-
- // Filter odd-index pixels
- const __m128i res_1 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
- const __m128i res_3 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
- const __m128i res_5 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
- const __m128i res_7 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
- _mm_add_epi32(res_3, res_7));
- res_odd =
- _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
-
- // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
- __m128i res = _mm_packs_epi32(res_even, res_odd);
- _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
- }
- }
- }
-
- /* Vertical filter */
- {
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const = _mm_set1_epi32(
- ((1 << conv_params->round_1) >> 1) -
- (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- // Filter even-index pixels
- const int16_t *data = &im_block[i * im_stride + j];
- const __m128i src_0 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
- *(__m128i *)(data + 1 * im_stride));
- const __m128i src_2 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
- *(__m128i *)(data + 3 * im_stride));
- const __m128i src_4 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
- *(__m128i *)(data + 5 * im_stride));
- const __m128i src_6 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
- *(__m128i *)(data + 7 * im_stride));
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
- *(__m128i *)(data + 1 * im_stride));
- const __m128i src_3 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
- *(__m128i *)(data + 3 * im_stride));
- const __m128i src_5 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
- *(__m128i *)(data + 5 * im_stride));
- const __m128i src_7 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
- *(__m128i *)(data + 7 * im_stride));
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
- const __m128i res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- const __m128i res_hi_round =
- _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
- // Accumulate values into the destination buffer
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- if (conv_params->use_jnt_comp_avg) {
- if (do_average) {
- const __m128i tmp_lo = _mm_loadu_si128(p + 0);
- const __m128i tmp_hi = _mm_loadu_si128(p + 1);
- const __m128i jnt_sum_lo =
- _mm_add_epi32(_mm_mullo_epi32(tmp_lo, wt0),
- _mm_mullo_epi32(res_lo_round, wt1));
- const __m128i jnt_sum_hi =
- _mm_add_epi32(_mm_mullo_epi32(tmp_hi, wt0),
- _mm_mullo_epi32(res_hi_round, wt1));
- const __m128i final_lo =
- _mm_srai_epi32(jnt_sum_lo, DIST_PRECISION_BITS);
- const __m128i final_hi =
- _mm_srai_epi32(jnt_sum_hi, DIST_PRECISION_BITS);
-
- _mm_storeu_si128(p + 0, final_lo);
- _mm_storeu_si128(p + 1, final_hi);
- } else {
- _mm_storeu_si128(p + 0, res_lo_round);
- _mm_storeu_si128(p + 1, res_hi_round);
- }
- } else {
- if (do_average) {
- _mm_storeu_si128(
- p + 0,
- _mm_srai_epi32(
- _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round), 1));
- _mm_storeu_si128(
- p + 1,
- _mm_srai_epi32(
- _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round), 1));
- } else {
- _mm_storeu_si128(p + 0, res_lo_round);
- _mm_storeu_si128(p + 1, res_hi_round);
- }
- }
- }
- }
- }
-}
-#endif
diff --git a/av1/common/x86/highbd_convolve_2d_ssse3.c b/av1/common/x86/highbd_convolve_2d_ssse3.c
index 125e464..d04fe84 100644
--- a/av1/common/x86/highbd_convolve_2d_ssse3.c
+++ b/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -19,199 +19,6 @@
#include "aom_dsp/x86/convolve_sse2.h"
#include "av1/common/convolve.h"
-#if !CONFIG_LOWPRECISION_BLEND
-void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride,
- uint16_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- DECLARE_ALIGNED(16, int16_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = MAX_SB_SIZE;
- int i, j;
- const int do_average = conv_params->do_average;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
- (void)dst0;
- (void)dst_stride0;
- // Check that, even with 12-bit input, the intermediate values will fit
- // into an unsigned 16-bit intermediate array.
- assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
-
- /* Horizontal filter */
- {
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const = _mm_set1_epi32(
- ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
- for (i = 0; i < im_h; ++i) {
- for (j = 0; j < w; j += 8) {
- const __m128i data =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
- const __m128i data2 =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
-
- // Filter even-index pixels
- const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
- const __m128i res_2 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
- const __m128i res_4 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
- const __m128i res_6 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
- _mm_add_epi32(res_2, res_6));
- res_even =
- _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
-
- // Filter odd-index pixels
- const __m128i res_1 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
- const __m128i res_3 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
- const __m128i res_5 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
- const __m128i res_7 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
- _mm_add_epi32(res_3, res_7));
- res_odd =
- _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
-
- // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
- __m128i res = _mm_packs_epi32(res_even, res_odd);
- _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
- }
- }
- }
-
- /* Vertical filter */
- {
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const = _mm_set1_epi32(
- ((1 << conv_params->round_1) >> 1) -
- (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- // Filter even-index pixels
- const int16_t *data = &im_block[i * im_stride + j];
- const __m128i src_0 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
- *(__m128i *)(data + 1 * im_stride));
- const __m128i src_2 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
- *(__m128i *)(data + 3 * im_stride));
- const __m128i src_4 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
- *(__m128i *)(data + 5 * im_stride));
- const __m128i src_6 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
- *(__m128i *)(data + 7 * im_stride));
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
- *(__m128i *)(data + 1 * im_stride));
- const __m128i src_3 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
- *(__m128i *)(data + 3 * im_stride));
- const __m128i src_5 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
- *(__m128i *)(data + 5 * im_stride));
- const __m128i src_7 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
- *(__m128i *)(data + 7 * im_stride));
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
- const __m128i res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- const __m128i res_hi_round =
- _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
- // Accumulate values into the destination buffer
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- if (do_average) {
- _mm_storeu_si128(
- p + 0,
- _mm_srai_epi32(
- _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round), 1));
- _mm_storeu_si128(
- p + 1,
- _mm_srai_epi32(
- _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round), 1));
- } else {
- _mm_storeu_si128(p + 0, res_lo_round);
- _mm_storeu_si128(p + 1, res_hi_round);
- }
- }
- }
- }
-}
-#endif // CONFIG_LOWPRECISION_BLEND
-
void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w,
int h, InterpFilterParams *filter_params_x,
diff --git a/av1/common/x86/highbd_jnt_convolve_avx2.c b/av1/common/x86/highbd_jnt_convolve_avx2.c
index ff1f2f8..581060d 100644
--- a/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -22,7 +22,6 @@
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
-#if CONFIG_LOWPRECISION_BLEND
void av1_highbd_jnt_convolve_2d_copy_avx2(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, InterpFilterParams *filter_params_x,
@@ -851,521 +850,3 @@
}
}
}
-#else
-void av1_highbd_jnt_convolve_2d_copy_avx2(
- const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- (void)filter_params_x;
- (void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
- (void)dst0;
- (void)dst_stride0;
- (void)bd;
-
- const int bits =
- FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
- const __m128i left_shift = _mm_cvtsi32_si128(bits);
- const int do_average = conv_params->do_average;
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi32(w0);
- const __m256i wt1 = _mm256_set1_epi32(w1);
- int i, j;
-
- assert(bits <= 4);
-
- if (!(w % 16)) {
- for (i = 0; i < h; i += 1) {
- for (j = 0; j < w; j += 16) {
- const __m256i src_16bit =
- _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j]));
-
- const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
- const __m256i res_lo =
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(res));
- const __m256i res_hi =
- _mm256_cvtepu16_epi32(_mm256_extracti128_si256(res, 1));
-
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store_aligned_256(&dst[i * dst_stride + j], &res_lo, &wt0,
- &wt1, do_average);
- mult_add_store_aligned_256(&dst[i * dst_stride + j + 8], &res_hi,
- &wt0, &wt1, do_average);
- } else {
- add_store_aligned_256(&dst[i * dst_stride + j], &res_lo, do_average);
- add_store_aligned_256(&dst[i * dst_stride + j + 8], &res_hi,
- do_average);
- }
- }
- }
- } else if (!(w % 4)) {
- for (i = 0; i < h; i += 2) {
- for (j = 0; j < w; j += 8) {
- const __m128i src_row_0 =
- _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
- const __m128i src_row_1 =
- _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride]));
- // since not all compilers yet support _mm256_set_m128i()
- const __m256i src_10 = _mm256_insertf128_si256(
- _mm256_castsi128_si256(src_row_0), src_row_1, 1);
-
- const __m256i res = _mm256_sll_epi16(src_10, left_shift);
-
- const __m256i res_lo =
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(res));
- const __m256i res_hi =
- _mm256_cvtepu16_epi32(_mm256_extracti128_si256(res, 1));
-
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store_aligned_256(&dst[i * dst_stride + j], &res_lo, &wt0,
- &wt1, do_average);
- mult_add_store_aligned_256(&dst[i * dst_stride + j + dst_stride],
- &res_hi, &wt0, &wt1, do_average);
- } else {
- add_store_aligned_256(&dst[i * dst_stride + j], &res_lo, do_average);
- add_store_aligned_256(&dst[i * dst_stride + j + dst_stride], &res_hi,
- do_average);
- }
- }
- }
- }
-}
-
-void av1_highbd_jnt_convolve_2d_avx2(
- const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst0, int dst_stride0,
- int w, int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
- DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = 8;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
- (void)dst0;
- (void)dst_stride0;
-
- // Check that, even with 12-bit input, the intermediate values will fit
- // into an unsigned 16-bit intermediate array.
- assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
-
- __m256i s[8], coeffs_y[4], coeffs_x[4];
- const int do_average = conv_params->do_average;
-
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi32(w0);
- const __m256i wt1 = _mm256_set1_epi32(w1);
- const __m128i wt0_128 = _mm256_castsi256_si128(wt0);
- const __m128i wt1_128 = _mm256_castsi256_si128(wt1);
-
- const __m256i round_const_x = _mm256_set1_epi32(
- ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
- const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
-
- const __m256i round_const_y = _mm256_set1_epi32(
- ((1 << conv_params->round_1) >> 1) -
- (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
- const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
-
- prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
-
- for (j = 0; j < w; j += 8) {
- /* Horizontal filter */
- {
- for (i = 0; i < im_h; i += 2) {
- const __m256i row0 =
- _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
- __m256i row1 = _mm256_set1_epi16(0);
- if (i + 1 < im_h)
- row1 =
- _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
-
- const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
- const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
-
- // even pixels
- s[0] = _mm256_alignr_epi8(r1, r0, 0);
- s[1] = _mm256_alignr_epi8(r1, r0, 4);
- s[2] = _mm256_alignr_epi8(r1, r0, 8);
- s[3] = _mm256_alignr_epi8(r1, r0, 12);
-
- __m256i res_even = convolve(s, coeffs_x);
- res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
- round_shift_x);
-
- // odd pixels
- s[0] = _mm256_alignr_epi8(r1, r0, 2);
- s[1] = _mm256_alignr_epi8(r1, r0, 6);
- s[2] = _mm256_alignr_epi8(r1, r0, 10);
- s[3] = _mm256_alignr_epi8(r1, r0, 14);
-
- __m256i res_odd = convolve(s, coeffs_x);
- res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
- round_shift_x);
-
- __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
- __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
- __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
-
- _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
- }
- }
-
- /* Vertical filter */
- {
- __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
- __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
- __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
- __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
- __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
- __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
-
- s[0] = _mm256_unpacklo_epi16(s0, s1);
- s[1] = _mm256_unpacklo_epi16(s2, s3);
- s[2] = _mm256_unpacklo_epi16(s4, s5);
-
- s[4] = _mm256_unpackhi_epi16(s0, s1);
- s[5] = _mm256_unpackhi_epi16(s2, s3);
- s[6] = _mm256_unpackhi_epi16(s4, s5);
-
- for (i = 0; i < h; i += 2) {
- const int16_t *data = &im_block[i * im_stride];
-
- const __m256i s6 =
- _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
- const __m256i s7 =
- _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
-
- s[3] = _mm256_unpacklo_epi16(s6, s7);
- s[7] = _mm256_unpackhi_epi16(s6, s7);
-
- const __m256i res_a = convolve(s, coeffs_y);
-
- const __m256i res_a_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_a, round_const_y), round_shift_y);
-
- if (w - j > 4) {
- const __m256i res_b = convolve(s + 4, coeffs_y);
- const __m256i res_b_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_b, round_const_y), round_shift_y);
- const __m256i res_ax =
- _mm256_permute2x128_si256(res_a_round, res_b_round, 0x20);
- const __m256i res_bx =
- _mm256_permute2x128_si256(res_a_round, res_b_round, 0x31);
-
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store_aligned_256(&dst[i * dst_stride + j], &res_ax, &wt0,
- &wt1, do_average);
- mult_add_store_aligned_256(&dst[i * dst_stride + j + dst_stride],
- &res_bx, &wt0, &wt1, do_average);
- } else {
- add_store_aligned_256(&dst[i * dst_stride + j], &res_ax,
- do_average);
- add_store_aligned_256(&dst[i * dst_stride + j + dst_stride],
- &res_bx, do_average);
- }
- } else {
- const __m128i res_ax = _mm256_castsi256_si128(res_a_round);
- const __m128i res_bx = _mm256_extracti128_si256(res_a_round, 1);
-
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store(&dst[i * dst_stride + j], &res_ax, &wt0_128,
- &wt1_128, do_average);
- mult_add_store(&dst[i * dst_stride + j + dst_stride], &res_bx,
- &wt0_128, &wt1_128, do_average);
- } else {
- add_store(&dst[i * dst_stride + j], &res_ax, do_average);
- add_store(&dst[i * dst_stride + j + dst_stride], &res_bx,
- do_average);
- }
- }
-
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
-
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
- }
- }
- }
-}
-
-void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride,
- uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4,
- const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint16_t *const src_ptr = src - fo_horiz;
- const int bits = FILTER_BITS - conv_params->round_1;
- (void)filter_params_y;
- (void)subpel_y_q4;
- (void)dst0;
- (void)dst_stride0;
- (void)bd;
-
- int i, j;
- __m256i s[4], coeffs_x[4];
-
- const int do_average = conv_params->do_average;
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi32(w0);
- const __m256i wt1 = _mm256_set1_epi32(w1);
- const __m128i wt0_128 = _mm256_castsi256_si128(wt0);
- const __m128i wt1_128 = _mm256_castsi256_si128(wt1);
-
- const __m256i round_const_x =
- _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
- const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
- const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
-
- assert(bits >= 0);
- prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
-
- for (j = 0; j < w; j += 8) {
- /* Horizontal filter */
- for (i = 0; i < h; i += 2) {
- const __m256i row0 =
- _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
- __m256i row1 =
- _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
-
- const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
- const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
-
- // even pixels
- s[0] = _mm256_alignr_epi8(r1, r0, 0);
- s[1] = _mm256_alignr_epi8(r1, r0, 4);
- s[2] = _mm256_alignr_epi8(r1, r0, 8);
- s[3] = _mm256_alignr_epi8(r1, r0, 12);
-
- __m256i res_even = convolve(s, coeffs_x);
- res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
- round_shift_x);
-
- // odd pixels
- s[0] = _mm256_alignr_epi8(r1, r0, 2);
- s[1] = _mm256_alignr_epi8(r1, r0, 6);
- s[2] = _mm256_alignr_epi8(r1, r0, 10);
- s[3] = _mm256_alignr_epi8(r1, r0, 14);
-
- __m256i res_odd = convolve(s, coeffs_x);
- res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
- round_shift_x);
-
- res_even = _mm256_sll_epi32(res_even, round_shift_bits);
- res_odd = _mm256_sll_epi32(res_odd, round_shift_bits);
-
- __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd);
-
- if (w - j > 4) {
- __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd);
-
- const __m256i res_ax = _mm256_permute2x128_si256(res1, res2, 0x20);
- const __m256i res_bx = _mm256_permute2x128_si256(res1, res2, 0x31);
-
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store_aligned_256(&dst[i * dst_stride + j], &res_ax, &wt0,
- &wt1, do_average);
- mult_add_store_aligned_256(&dst[i * dst_stride + j + dst_stride],
- &res_bx, &wt0, &wt1, do_average);
- } else {
- add_store_aligned_256(&dst[i * dst_stride + j], &res_ax, do_average);
- add_store_aligned_256(&dst[i * dst_stride + j + dst_stride], &res_bx,
- do_average);
- }
- } else {
- const __m128i res_ax = _mm256_castsi256_si128(res1);
- const __m128i res_bx = _mm256_extracti128_si256(res1, 1);
-
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store(&dst[i * dst_stride + j], &res_ax, &wt0_128, &wt1_128,
- do_average);
- mult_add_store(&dst[i * dst_stride + j + dst_stride], &res_bx,
- &wt0_128, &wt1_128, do_average);
- } else {
- add_store(&dst[i * dst_stride + j], &res_ax, do_average);
- add_store(&dst[i * dst_stride + j + dst_stride], &res_bx, do_average);
- }
- }
- }
- }
-}
-
-void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride,
- uint16_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4,
- const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const uint16_t *const src_ptr = src - fo_vert * src_stride;
- const int bits = FILTER_BITS - conv_params->round_0;
- (void)filter_params_x;
- (void)subpel_x_q4;
- (void)dst0;
- (void)dst_stride0;
- (void)bd;
-
- assert(bits >= 0);
- int i, j;
- __m256i s[8], coeffs_y[4];
- const int do_average = conv_params->do_average;
-
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi32(w0);
- const __m256i wt1 = _mm256_set1_epi32(w1);
- const __m128i wt0_128 = _mm256_castsi256_si128(wt0);
- const __m128i wt1_128 = _mm256_castsi256_si128(wt1);
- const __m256i round_const_y =
- _mm256_set1_epi32(((1 << conv_params->round_1) >> 1));
- const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
- const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
-
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
-
- for (j = 0; j < w; j += 8) {
- const uint16_t *data = &src_ptr[j];
- /* Vertical filter */
- {
- __m256i src6;
- __m256i s01 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- 0x20);
- __m256i s12 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- 0x20);
- __m256i s23 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- 0x20);
- __m256i s34 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- 0x20);
- __m256i s45 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- 0x20);
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
- __m256i s56 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- src6, 0x20);
-
- s[0] = _mm256_unpacklo_epi16(s01, s12);
- s[1] = _mm256_unpacklo_epi16(s23, s34);
- s[2] = _mm256_unpacklo_epi16(s45, s56);
-
- s[4] = _mm256_unpackhi_epi16(s01, s12);
- s[5] = _mm256_unpackhi_epi16(s23, s34);
- s[6] = _mm256_unpackhi_epi16(s45, s56);
-
- for (i = 0; i < h; i += 2) {
- data = &src_ptr[i * src_stride + j];
-
- const __m256i s67 = _mm256_permute2x128_si256(
- src6,
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- 0x20);
-
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
-
- const __m256i s78 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- src6, 0x20);
-
- s[3] = _mm256_unpacklo_epi16(s67, s78);
- s[7] = _mm256_unpackhi_epi16(s67, s78);
-
- const __m256i res_a = convolve(s, coeffs_y);
-
- __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits);
- res_a_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_a_round, round_const_y), round_shift_y);
-
- if (w - j > 4) {
- const __m256i res_b = convolve(s + 4, coeffs_y);
- __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits);
- res_b_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_b_round, round_const_y), round_shift_y);
-
- const __m256i res_ax =
- _mm256_permute2x128_si256(res_a_round, res_b_round, 0x20);
- const __m256i res_bx =
- _mm256_permute2x128_si256(res_a_round, res_b_round, 0x31);
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store_aligned_256(&dst[i * dst_stride + j], &res_ax, &wt0,
- &wt1, do_average);
- mult_add_store_aligned_256(&dst[i * dst_stride + j + dst_stride],
- &res_bx, &wt0, &wt1, do_average);
- } else {
- add_store_aligned_256(&dst[i * dst_stride + j], &res_ax,
- do_average);
- add_store_aligned_256(&dst[i * dst_stride + j + dst_stride],
- &res_bx, do_average);
- }
- } else {
- const __m128i res_ax = _mm256_castsi256_si128(res_a_round);
- const __m128i res_bx = _mm256_extracti128_si256(res_a_round, 1);
-
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store(&dst[i * dst_stride + j], &res_ax, &wt0_128,
- &wt1_128, do_average);
- mult_add_store(&dst[i * dst_stride + j + dst_stride], &res_bx,
- &wt0_128, &wt1_128, do_average);
- } else {
- add_store(&dst[i * dst_stride + j], &res_ax, do_average);
- add_store(&dst[i * dst_stride + j + dst_stride], &res_bx,
- do_average);
- }
- }
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
-
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
- }
- }
- }
-}
-#endif
diff --git a/av1/common/x86/highbd_jnt_convolve_sse4.c b/av1/common/x86/highbd_jnt_convolve_sse4.c
index 680bebd..ba092a2 100644
--- a/av1/common/x86/highbd_jnt_convolve_sse4.c
+++ b/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -16,7 +16,6 @@
#include "aom_dsp/x86/convolve_sse2.h"
#include "aom_dsp/x86/convolve_sse4_1.h"
-#if CONFIG_LOWPRECISION_BLEND
void av1_highbd_jnt_convolve_y_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, InterpFilterParams *filter_params_x,
@@ -381,4 +380,3 @@
}
}
}
-#endif
diff --git a/av1/common/x86/highbd_warp_plane_sse4.c b/av1/common/x86/highbd_warp_plane_sse4.c
index 5df6f7d..9599433 100644
--- a/av1/common/x86/highbd_warp_plane_sse4.c
+++ b/av1/common/x86/highbd_warp_plane_sse4.c
@@ -21,7 +21,6 @@
int subsampling_x, int subsampling_y, int bd,
ConvolveParams *conv_params, int16_t alpha,
int16_t beta, int16_t gamma, int16_t delta) {
-#if CONFIG_LOWPRECISION_BLEND
int comp_avg = conv_params->do_average;
__m128i tmp[15];
int i, j, k;
@@ -412,359 +411,4 @@
}
}
}
-#else // CONFIG_LOWPRECISION_BLEND
- int comp_avg = conv_params->do_average;
- __m128i tmp[15];
- int i, j, k;
- const int reduce_bits_horiz =
- conv_params->round_0 +
- AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
- const int reduce_bits_vert = conv_params->is_compound
- ? conv_params->round_1
- : 2 * FILTER_BITS - reduce_bits_horiz;
- const int offset_bits_horiz = bd + FILTER_BITS - 1;
- assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
- assert(!(bd == 12 && reduce_bits_horiz < 5));
-
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m128i wt0 = _mm_set1_epi32(w0);
- const __m128i wt1 = _mm_set1_epi32(w1);
-
- /* Note: For this code to work, the left/right frame borders need to be
- extended by at least 13 pixels each. By the time we get here, other
- code will have set up this border, but we allow an explicit check
- for debugging purposes.
- */
- /*for (i = 0; i < height; ++i) {
- for (j = 0; j < 13; ++j) {
- assert(ref[i * stride - 13 + j] == ref[i * stride]);
- assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
- }
- }*/
-
- for (i = 0; i < p_height; i += 8) {
- for (j = 0; j < p_width; j += 8) {
- const int32_t src_x = (p_col + j + 4) << subsampling_x;
- const int32_t src_y = (p_row + i + 4) << subsampling_y;
- const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
- const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
- const int32_t x4 = dst_x >> subsampling_x;
- const int32_t y4 = dst_y >> subsampling_y;
-
- int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
- int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
- int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
- int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
- // Add in all the constant terms, including rounding and offset
- sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
- (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
- sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
- (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-
- sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
- sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-
- // Horizontal filter
- // If the block is aligned such that, after clamping, every sample
- // would be taken from the leftmost/rightmost column, then we can
- // skip the expensive horizontal filter.
- if (ix4 <= -7) {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
- ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
- }
- } else if (ix4 >= width + 6) {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- tmp[k + 7] =
- _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
- ref[iy * stride + (width - 1)] *
- (1 << (FILTER_BITS - reduce_bits_horiz)));
- }
- } else {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- int sx = sx4 + beta * (k + 4);
-
- // Load source pixels
- const __m128i src =
- _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
- const __m128i src2 =
- _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
-
- // Filter even-index pixels
- const __m128i tmp_0 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_2 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_4 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_6 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
-
- // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
- const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
- // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
- const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
- // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
- const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
- // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
- const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
- // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
- const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
- // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
- const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
- // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
- const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
- // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
- const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
- const __m128i round_const = _mm_set1_epi32(
- (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
-
- // Calculate filtered results
- const __m128i res_0 = _mm_madd_epi16(src, coeff_0);
- const __m128i res_2 =
- _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2);
- const __m128i res_4 =
- _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4);
- const __m128i res_6 =
- _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
- _mm_add_epi32(res_2, res_6));
- res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
- _mm_cvtsi32_si128(reduce_bits_horiz));
-
- // Filter odd-index pixels
- const __m128i tmp_1 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_3 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_5 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_7 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
-
- const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
- const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
- const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
- const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
- const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
- const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
- const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
- const __m128i res_1 =
- _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1);
- const __m128i res_3 =
- _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3);
- const __m128i res_5 =
- _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5);
- const __m128i res_7 =
- _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
- _mm_add_epi32(res_3, res_7));
- res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
- _mm_cvtsi32_si128(reduce_bits_horiz));
-
- // Combine results into one register.
- // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
- // as this order helps with the vertical filter.
- tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
- }
- }
-
- // Vertical filter
- for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
- int sy = sy4 + delta * (k + 4);
-
- // Load from tmp and rearrange pairs of consecutive rows into the
- // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
- const __m128i *src = tmp + (k + 4);
- const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
- const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
- const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
- const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
-
- // Filter even-index pixels
- const __m128i tmp_0 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_2 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_4 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_6 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
- const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
- const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
- const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
- const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
- const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
- const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
- const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
- const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
- const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
- const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
- const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
-
- const __m128i tmp_1 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_3 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_5 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_7 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
- const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
- const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
- const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
- const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
- const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
- const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
- const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
- if (conv_params->is_compound) {
- __m128i *const p =
- (__m128i *)&conv_params
- ->dst[(i + k + 4) * conv_params->dst_stride + j];
- const __m128i round_const = _mm_set1_epi32(
- -(1 << (bd + 2 * FILTER_BITS - reduce_bits_horiz - 1)) +
- ((1 << (reduce_bits_vert)) >> 1));
- res_lo = _mm_add_epi32(res_lo, round_const);
- res_lo = _mm_sra_epi32(res_lo, _mm_cvtsi32_si128(reduce_bits_vert));
- if (conv_params->use_jnt_comp_avg) {
- if (comp_avg) {
- const __m128i sum =
- _mm_add_epi32(_mm_mullo_epi32(_mm_loadu_si128(p), wt0),
- _mm_mullo_epi32(res_lo, wt1));
- res_lo = _mm_srai_epi32(sum, DIST_PRECISION_BITS);
- }
- } else {
- if (comp_avg)
- res_lo =
- _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p), res_lo), 1);
- }
-
- _mm_storeu_si128(p, res_lo);
-
- if (p_width > 4) {
- res_hi = _mm_add_epi32(res_hi, round_const);
- res_hi = _mm_sra_epi32(res_hi, _mm_cvtsi32_si128(reduce_bits_vert));
-
- if (conv_params->use_jnt_comp_avg) {
- if (comp_avg) {
- const __m128i sum =
- _mm_add_epi32(_mm_mullo_epi32(_mm_loadu_si128(p + 1), wt0),
- _mm_mullo_epi32(res_hi, wt1));
- res_hi = _mm_srai_epi32(sum, DIST_PRECISION_BITS);
- }
- } else {
- if (comp_avg)
- res_hi = _mm_srai_epi32(
- _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi), 1);
- }
-
- _mm_storeu_si128(p + 1, res_hi);
- }
- } else {
- // Round and pack into 8 bits
- const __m128i round_const =
- _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
- ((1 << reduce_bits_vert) >> 1));
-
- const __m128i res_lo_round = _mm_srai_epi32(
- _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
- const __m128i res_hi_round = _mm_srai_epi32(
- _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
-
- __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
- // Clamp res_16bit to the range [0, 2^bd - 1]
- const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
- const __m128i zero = _mm_setzero_si128();
- res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);
-
- // Store, blending with 'pred' if needed
- __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
-
- // Note: If we're outputting a 4x4 block, we need to be very careful
- // to only output 4 pixels at this point, to avoid encode/decode
- // mismatches when encoding with multiple threads.
- if (p_width == 4) {
- if (comp_avg)
- res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64(p));
- _mm_storel_epi64(p, res_16bit);
- } else {
- if (comp_avg)
- res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p));
- _mm_storeu_si128(p, res_16bit);
- }
- }
- }
- }
- }
-#endif // CONFIG_LOWPRECISION_BLEND
}
diff --git a/av1/common/x86/jnt_convolve_avx2.c b/av1/common/x86/jnt_convolve_avx2.c
index aaf8c7e..610b057 100644
--- a/av1/common/x86/jnt_convolve_avx2.c
+++ b/av1/common/x86/jnt_convolve_avx2.c
@@ -20,7 +20,6 @@
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
-#if CONFIG_LOWPRECISION_BLEND
void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
@@ -702,487 +701,3 @@
}
}
}
-#else
-void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride,
- const uint8_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- int i, j;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_horiz;
- const int bits = FILTER_BITS - conv_params->round_1;
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi32(w0);
- const __m256i wt1 = _mm256_set1_epi32(w1);
- const int do_average = conv_params->do_average;
- __m256i filt[4], coeffs[4];
-
- assert(bits >= 0);
- assert(conv_params->round_0 > 0);
-
- filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
-
- prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
-
- const __m256i round_const =
- _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
-
- (void)filter_params_y;
- (void)subpel_y_q4;
- (void)dst0;
- (void)dst_stride0;
-
- for (i = 0; i < h; i += 2) {
- for (j = 0; j < w; j += 8) {
- const __m256i data = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&src_ptr[i * src_stride + j + src_stride]))),
- 0x20);
-
- __m256i res = convolve_lowbd_x(data, coeffs, filt);
-
- res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
-
- const __m256i res_lo_round =
- _mm256_cvtepi16_epi32(_mm256_castsi256_si128(res));
- const __m256i res_hi_round =
- _mm256_cvtepi16_epi32(_mm256_extracti128_si256(res, 1));
-
- const __m256i res_lo_shift = _mm256_slli_epi32(res_lo_round, bits);
- const __m256i res_hi_shift = _mm256_slli_epi32(res_hi_round, bits);
-
- // Accumulate values into the destination buffer
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store_aligned_256(&dst[i * dst_stride + j], &res_lo_shift,
- &wt0, &wt1, do_average);
- mult_add_store_aligned_256(&dst[i * dst_stride + j + dst_stride],
- &res_hi_shift, &wt0, &wt1, do_average);
- } else {
- add_store_aligned_256(&dst[i * dst_stride + j], &res_lo_shift,
- do_average);
- add_store_aligned_256(&dst[i * dst_stride + j + dst_stride],
- &res_hi_shift, do_average);
- }
- }
- }
-}
-
-void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride,
- const uint8_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride;
- // +1 to compensate for dividing the filter coeffs by 2
- const int left_shift = FILTER_BITS - conv_params->round_0 + 1;
- const __m256i round_const =
- _mm256_set1_epi32((1 << conv_params->round_1) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi32(w0);
- const __m256i wt1 = _mm256_set1_epi32(w1);
- const int do_average = conv_params->do_average;
- __m256i coeffs[4], s[8];
-
- assert((FILTER_BITS - conv_params->round_0) >= 0);
-
- prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
-
- (void)conv_params;
- (void)filter_params_x;
- (void)subpel_x_q4;
- (void)dst0;
- (void)dst_stride0;
-
- for (j = 0; j < w; j += 16) {
- const uint8_t *data = &src_ptr[j];
- __m256i src6;
-
- // Load lines a and b. Line a to lower 128, line b to upper 128
- const __m256i src_01a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- 0x20);
-
- const __m256i src_12a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- 0x20);
-
- const __m256i src_23a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- 0x20);
-
- const __m256i src_34a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- 0x20);
-
- const __m256i src_45a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- 0x20);
-
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
- const __m256i src_56a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- src6, 0x20);
-
- s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
- s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
- s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-
- s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
- s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
- s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
-
- for (i = 0; i < h; i += 2) {
- data = &src_ptr[i * src_stride + j];
- const __m256i src_67a = _mm256_permute2x128_si256(
- src6,
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- 0x20);
-
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
- const __m256i src_78a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- src6, 0x20);
-
- s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
- s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
-
- const __m256i res_lo = convolve_lowbd(s, coeffs);
-
- const __m256i res_lo_0_32b =
- _mm256_cvtepi16_epi32(_mm256_castsi256_si128(res_lo));
- const __m256i res_lo_0_shift =
- _mm256_slli_epi32(res_lo_0_32b, left_shift);
- const __m256i res_lo_0_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
-
- const __m256i res_lo_1_32b =
- _mm256_cvtepi16_epi32(_mm256_extracti128_si256(res_lo, 1));
- const __m256i res_lo_1_shift =
- _mm256_slli_epi32(res_lo_1_32b, left_shift);
- const __m256i res_lo_1_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
-
- // Accumulate values into the destination buffer
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store_aligned_256(&dst[i * dst_stride + j], &res_lo_0_round,
- &wt0, &wt1, do_average);
- mult_add_store_aligned_256(&dst[i * dst_stride + j + dst_stride],
- &res_lo_1_round, &wt0, &wt1, do_average);
- } else {
- add_store_aligned_256(&dst[i * dst_stride + j], &res_lo_0_round,
- do_average);
- add_store_aligned_256(&dst[i * dst_stride + j + dst_stride],
- &res_lo_1_round, do_average);
- }
-
- if (w - j > 8) {
- const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
-
- const __m256i res_hi_0_32b =
- _mm256_cvtepi16_epi32(_mm256_castsi256_si128(res_hi));
- const __m256i res_hi_0_shift =
- _mm256_slli_epi32(res_hi_0_32b, left_shift);
- const __m256i res_hi_0_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
-
- const __m256i res_hi_1_32b =
- _mm256_cvtepi16_epi32(_mm256_extracti128_si256(res_hi, 1));
- const __m256i res_hi_1_shift =
- _mm256_slli_epi32(res_hi_1_32b, left_shift);
- const __m256i res_hi_1_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
-
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store_aligned_256(&dst[i * dst_stride + j + 8],
- &res_hi_0_round, &wt0, &wt1, do_average);
- mult_add_store_aligned_256(&dst[i * dst_stride + j + 8 + dst_stride],
- &res_hi_1_round, &wt0, &wt1, do_average);
- } else {
- add_store_aligned_256(&dst[i * dst_stride + j + 8], &res_hi_0_round,
- do_average);
- add_store_aligned_256(&dst[i * dst_stride + j + 8 + dst_stride],
- &res_hi_1_round, do_average);
- }
- }
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
-
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
- }
- }
-}
-
-void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- const int bd = 8;
- (void)dst0;
- (void)dst_stride0;
-
- DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = 8;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi32(w0);
- const __m256i wt1 = _mm256_set1_epi32(w1);
- const int do_average = conv_params->do_average;
- const __m128i wt0_128 = _mm256_castsi256_si128(wt0);
- const __m128i wt1_128 = _mm256_castsi256_si128(wt1);
- __m256i filt[4], s[8], coeffs_x[4], coeffs_y[4];
-
- assert(conv_params->round_0 > 0);
-
- filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
-
- prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
-
- const __m256i round_const_h = _mm256_set1_epi16(
- ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
- const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
-
- const __m256i round_const_v = _mm256_set1_epi32(
- ((1 << conv_params->round_1) >> 1) -
- (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
- const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
-
- for (j = 0; j < w; j += 8) {
- /* Horizontal filter */
- {
- for (i = 0; i < im_h; i += 2) {
- __m256i data = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
- if (i + 1 < im_h)
- data = _mm256_inserti128_si256(
- data,
- _mm_loadu_si128(
- (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
- 1);
- __m256i res = convolve_lowbd_x(data, coeffs_x, filt);
-
- res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
- round_shift_h);
-
- _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
- }
- }
-
- /* Vertical filter */
- {
- __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
- __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
- __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
- __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
- __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
- __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
-
- s[0] = _mm256_unpacklo_epi16(s0, s1);
- s[1] = _mm256_unpacklo_epi16(s2, s3);
- s[2] = _mm256_unpacklo_epi16(s4, s5);
-
- s[4] = _mm256_unpackhi_epi16(s0, s1);
- s[5] = _mm256_unpackhi_epi16(s2, s3);
- s[6] = _mm256_unpackhi_epi16(s4, s5);
-
- for (i = 0; i < h; i += 2) {
- const int16_t *data = &im_block[i * im_stride];
-
- const __m256i s6 =
- _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
- const __m256i s7 =
- _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
-
- s[3] = _mm256_unpacklo_epi16(s6, s7);
- s[7] = _mm256_unpackhi_epi16(s6, s7);
-
- const __m256i res_a = convolve(s, coeffs_y);
- const __m256i res_a_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_a, round_const_v), round_shift_v);
-
- if (w - j > 4) {
- const __m256i res_b = convolve(s + 4, coeffs_y);
- const __m256i res_b_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_b, round_const_v), round_shift_v);
- const __m256i res_ax =
- _mm256_permute2x128_si256(res_a_round, res_b_round, 0x20);
- const __m256i res_bx =
- _mm256_permute2x128_si256(res_a_round, res_b_round, 0x31);
-
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store_aligned_256(&dst[i * dst_stride + j], &res_ax, &wt0,
- &wt1, do_average);
- mult_add_store_aligned_256(&dst[i * dst_stride + j + dst_stride],
- &res_bx, &wt0, &wt1, do_average);
- } else {
- add_store_aligned_256(&dst[i * dst_stride + j], &res_ax,
- do_average);
- add_store_aligned_256(&dst[i * dst_stride + j + dst_stride],
- &res_bx, do_average);
- }
- } else {
- const __m128i res_ax = _mm256_castsi256_si128(res_a_round);
- const __m128i res_bx = _mm256_extracti128_si256(res_a_round, 1);
-
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store(&dst[i * dst_stride + j], &res_ax, &wt0_128,
- &wt1_128, do_average);
- mult_add_store(&dst[i * dst_stride + j + dst_stride], &res_bx,
- &wt0_128, &wt1_128, do_average);
- } else {
- add_store(&dst[i * dst_stride + j], &res_ax, do_average);
- add_store(&dst[i * dst_stride + j + dst_stride], &res_bx,
- do_average);
- }
- }
-
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
-
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
- }
- }
- }
-}
-
-void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
- uint8_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- (void)filter_params_x;
- (void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
- (void)dst0;
- (void)dst_stride0;
-
- const int bits =
- FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
- const __m128i left_shift = _mm_cvtsi32_si128(bits);
- const int do_average = conv_params->do_average;
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi32(w0);
- const __m256i wt1 = _mm256_set1_epi32(w1);
- const __m256i zero = _mm256_setzero_si256();
- int i, j;
-
- if (!(w % 16)) {
- for (i = 0; i < h; i += 1) {
- for (j = 0; j < w; j += 16) {
- const __m256i src_16bit = _mm256_cvtepu8_epi16(
- _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])));
-
- const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
- const __m256i res_lo =
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(res));
- const __m256i res_hi =
- _mm256_cvtepu16_epi32(_mm256_extracti128_si256(res, 1));
-
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store_aligned_256(&dst[i * dst_stride + j], &res_lo, &wt0,
- &wt1, do_average);
- mult_add_store_aligned_256(&dst[i * dst_stride + j + 8], &res_hi,
- &wt0, &wt1, do_average);
- } else {
- add_store_aligned_256(&dst[i * dst_stride + j], &res_lo, do_average);
- add_store_aligned_256(&dst[i * dst_stride + j + 8], &res_hi,
- do_average);
- }
- }
- }
- } else if (!(w % 4)) {
- for (i = 0; i < h; i += 2) {
- for (j = 0; j < w; j += 8) {
- const __m128i src_row_0 =
- _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
- const __m128i src_row_1 =
- _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
- // since not all compilers yet support _mm256_set_m128i()
- const __m256i src_10 = _mm256_insertf128_si256(
- _mm256_castsi128_si256(src_row_0), src_row_1, 1);
-
- const __m256i src_16bit = _mm256_unpacklo_epi8(src_10, zero);
-
- const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
-
- const __m256i res_lo =
- _mm256_cvtepu16_epi32(_mm256_castsi256_si128(res));
- const __m256i res_hi =
- _mm256_cvtepu16_epi32(_mm256_extracti128_si256(res, 1));
-
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store_aligned_256(&dst[i * dst_stride + j], &res_lo, &wt0,
- &wt1, do_average);
- mult_add_store_aligned_256(&dst[i * dst_stride + j + dst_stride],
- &res_hi, &wt0, &wt1, do_average);
- } else {
- add_store_aligned_256(&dst[i * dst_stride + j], &res_lo, do_average);
- add_store_aligned_256(&dst[i * dst_stride + j + dst_stride], &res_hi,
- do_average);
- }
- }
- }
- }
-}
-#endif
diff --git a/av1/common/x86/jnt_convolve_sse2.c b/av1/common/x86/jnt_convolve_sse2.c
index 166ede1..5c9b2ae 100644
--- a/av1/common/x86/jnt_convolve_sse2.c
+++ b/av1/common/x86/jnt_convolve_sse2.c
@@ -15,7 +15,6 @@
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
-#if CONFIG_LOWPRECISION_BLEND
void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
@@ -383,4 +382,3 @@
} while (j < w);
}
}
-#endif
diff --git a/av1/common/x86/jnt_convolve_sse4.c b/av1/common/x86/jnt_convolve_sse4.c
deleted file mode 100644
index fde9677..0000000
--- a/av1/common/x86/jnt_convolve_sse4.c
+++ /dev/null
@@ -1,576 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-#include <smmintrin.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/convolve_common_intrin.h"
-#include "aom_dsp/x86/convolve_sse4_1.h"
-#include "av1/common/convolve.h"
-
-#if !CONFIG_LOWPRECISION_BLEND
-static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
- const int subpel_q4,
- __m128i *const coeffs /* [4] */) {
- const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params, subpel_q4 & SUBPEL_MASK);
- const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 0 1 0 1 0 1 0 1
- coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
- coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
- coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7
-}
-
-static INLINE __m128i convolve(const __m128i *const s,
- const __m128i *const coeffs) {
- const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]);
- const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]);
- const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]);
- const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]);
- const __m128i d = _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3));
- return d;
-}
-
-static INLINE __m128i convolve_lo_x(const __m128i *const s,
- const __m128i *const coeffs) {
- __m128i ss[4];
- ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
- ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
- ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
- ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
- return convolve(ss, coeffs);
-}
-
-static INLINE __m128i convolve_lo_y(const __m128i *const s,
- const __m128i *const coeffs) {
- __m128i ss[4];
- ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
- ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
- ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
- ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
- return convolve(ss, coeffs);
-}
-
-static INLINE __m128i convolve_hi_y(const __m128i *const s,
- const __m128i *const coeffs) {
- __m128i ss[4];
- ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
- ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
- ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
- ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
- return convolve(ss, coeffs);
-}
-void av1_jnt_convolve_y_sse4_1(const uint8_t *src, int src_stride,
- const uint8_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const uint8_t *src_ptr = src - fo_vert * src_stride;
- const int bits = FILTER_BITS - conv_params->round_0;
- const __m128i left_shift = _mm_cvtsi32_si128(bits);
- const __m128i wt0 = _mm_set1_epi32(conv_params->fwd_offset);
- const __m128i wt1 = _mm_set1_epi32(conv_params->bck_offset);
- const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
- __m128i coeffs[4];
-
- (void)filter_params_x;
- (void)subpel_x_q4;
- (void)dst0;
- (void)dst_stride0;
-
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
-
- if (w == 4) {
- __m128i s[8], src6, res, res_shift;
- src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
- s[0] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
- s[1] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
- s[2] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
- s[3] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
- s[4] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
- s[5] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
-
- do {
- s[6] = _mm_unpacklo_epi8(
- src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
- src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
- s[7] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
-
- res = convolve_lo_y(s + 0, coeffs);
- res_shift = _mm_sll_epi32(res, left_shift);
- res_shift =
- _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
- if (conv_params->use_jnt_comp_avg)
- mult_add_store(dst, &res_shift, &wt0, &wt1, conv_params->do_average);
- else
- add_store(dst, &res_shift, conv_params->do_average);
- src_ptr += src_stride;
- dst += dst_stride;
-
- res = convolve_lo_y(s + 1, coeffs);
- res_shift = _mm_sll_epi32(res, left_shift);
- res_shift =
- _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
- if (conv_params->use_jnt_comp_avg)
- mult_add_store(dst, &res_shift, &wt0, &wt1, conv_params->do_average);
- else
- add_store(dst, &res_shift, conv_params->do_average);
- src_ptr += src_stride;
- dst += dst_stride;
-
- s[0] = s[2];
- s[1] = s[3];
- s[2] = s[4];
- s[3] = s[5];
- s[4] = s[6];
- s[5] = s[7];
- h -= 2;
- } while (h);
- } else {
- assert(!(w % 8));
- int j = 0;
- do {
- __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift;
- const uint8_t *data = &src_ptr[j];
-
- src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
- s[0] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
- _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
- s[1] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
- _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
- s[2] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
- _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
- s[3] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
- _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
- s[4] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
- _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
- s[5] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
-
- int i = 0;
- do {
- data = &src_ptr[i * src_stride + j];
- s[6] = _mm_unpacklo_epi8(
- src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
- src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
- s[7] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
-
- res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels
- res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels
- res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
- res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
- res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
- round_shift);
- res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
- round_shift);
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store(dst + i * dst_stride + j + 0, &res_lo_shift, &wt0,
- &wt1, conv_params->do_average);
- mult_add_store(dst + i * dst_stride + j + 4, &res_hi_shift, &wt0,
- &wt1, conv_params->do_average);
- } else {
- add_store(dst + i * dst_stride + j + 0, &res_lo_shift,
- conv_params->do_average);
- add_store(dst + i * dst_stride + j + 4, &res_hi_shift,
- conv_params->do_average);
- }
- i++;
-
- res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels
- res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels
- res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
- res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
- res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
- round_shift);
- res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
- round_shift);
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store(dst + i * dst_stride + j + 0, &res_lo_shift, &wt0,
- &wt1, conv_params->do_average);
- mult_add_store(dst + i * dst_stride + j + 4, &res_hi_shift, &wt0,
- &wt1, conv_params->do_average);
- } else {
- add_store(dst + i * dst_stride + j + 0, &res_lo_shift,
- conv_params->do_average);
- add_store(dst + i * dst_stride + j + 4, &res_hi_shift,
- conv_params->do_average);
- }
- i++;
-
- s[0] = s[2];
- s[1] = s[3];
- s[2] = s[4];
- s[3] = s[5];
- s[4] = s[6];
- s[5] = s[7];
- } while (i < h);
- j += 8;
- } while (j < w);
- }
-}
-
-void av1_jnt_convolve_x_sse4_1(const uint8_t *src, int src_stride,
- const uint8_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *src_ptr = src - fo_horiz;
- const int bits = FILTER_BITS - conv_params->round_1;
- const __m128i left_shift = _mm_cvtsi32_si128(bits);
- const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m128i wt0 = _mm_set1_epi32(w0);
- const __m128i wt1 = _mm_set1_epi32(w1);
- __m128i coeffs[4];
-
- (void)filter_params_y;
- (void)subpel_y_q4;
- (void)dst0;
- (void)dst_stride0;
-
- prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);
-
- if (w == 4) {
- do {
- const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
- __m128i s[4];
-
- s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
- s[1] =
- _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
- s[2] =
- _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
- s[3] =
- _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
- const __m128i res_lo = convolve_lo_x(s, coeffs);
- const __m128i res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
-
- // Accumulate values into the destination buffer
- if (conv_params->use_jnt_comp_avg)
- mult_add_store(dst, &res_lo_shift, &wt0, &wt1, conv_params->do_average);
- else
- add_store(dst, &res_lo_shift, conv_params->do_average);
- src_ptr += src_stride;
- dst += dst_stride;
- } while (--h);
- } else {
- assert(!(w % 8));
- int i = 0;
- do {
- int j = 0;
- do {
- const __m128i data =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
- __m128i s[4];
-
- // Filter even-index pixels
- s[0] = data;
- s[1] = _mm_srli_si128(data, 2);
- s[2] = _mm_srli_si128(data, 4);
- s[3] = _mm_srli_si128(data, 6);
- const __m128i res_even = convolve_lo_x(s, coeffs);
-
- // Filter odd-index pixels
- s[0] = _mm_srli_si128(data, 1);
- s[1] = _mm_srli_si128(data, 3);
- s[2] = _mm_srli_si128(data, 5);
- s[3] = _mm_srli_si128(data, 7);
- const __m128i res_odd = convolve_lo_x(s, coeffs);
-
- // Rearrange pixels back into the order 0 ... 7
- const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
- const __m128i res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- const __m128i res_hi_round =
- _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
- const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
- const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift);
-
- // Accumulate values into the destination buffer
- if (conv_params->use_jnt_comp_avg) {
- mult_add_store(dst + i * dst_stride + j + 0, &res_lo_shift, &wt0,
- &wt1, conv_params->do_average);
- mult_add_store(dst + i * dst_stride + j + 4, &res_hi_shift, &wt0,
- &wt1, conv_params->do_average);
- } else {
- add_store(dst + i * dst_stride + j + 0, &res_lo_shift,
- conv_params->do_average);
- add_store(dst + i * dst_stride + j + 4, &res_hi_shift,
- conv_params->do_average);
- }
- j += 8;
- } while (j < w);
- } while (++i < h);
- }
-}
-
-void av1_jnt_convolve_2d_sse4_1(const uint8_t *src, int src_stride,
- uint8_t *dst0, int dst_stride0, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
- const int bd = 8;
- (void)dst0;
- (void)dst_stride0;
-
- DECLARE_ALIGNED(16, int16_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = MAX_SB_SIZE;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const int do_average = conv_params->do_average;
- const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
- const __m128i zero = _mm_setzero_si128();
-
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m128i wt0 = _mm_set_epi32(w0, w0, w0, w0);
- const __m128i wt1 = _mm_set_epi32(w1, w1, w1, w1);
-
- /* Horizontal filter */
- {
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const = _mm_set1_epi32(
- ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
- for (i = 0; i < im_h; ++i) {
- for (j = 0; j < w; j += 8) {
- const __m128i data =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-
- const __m128i src_lo = _mm_unpacklo_epi8(data, zero);
- const __m128i src_hi = _mm_unpackhi_epi8(data, zero);
-
- // Filter even-index pixels
- const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01);
- const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
- _mm_add_epi32(res_2, res_6));
- res_even =
- _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
-
- // Filter odd-index pixels
- const __m128i src_1 = _mm_alignr_epi8(src_hi, src_lo, 2);
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
- _mm_add_epi32(res_3, res_7));
- res_odd =
- _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
-
- // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
- __m128i res = _mm_packs_epi32(res_even, res_odd);
- _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
- }
- }
- }
-
- /* Vertical filter */
- {
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const = _mm_set1_epi32(
- ((1 << conv_params->round_1) >> 1) -
- (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- // Filter even-index pixels
- const int16_t *data = &im_block[i * im_stride + j];
- const __m128i src_0 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
- *(__m128i *)(data + 1 * im_stride));
- const __m128i src_2 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
- *(__m128i *)(data + 3 * im_stride));
- const __m128i src_4 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
- *(__m128i *)(data + 5 * im_stride));
- const __m128i src_6 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
- *(__m128i *)(data + 7 * im_stride));
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
- *(__m128i *)(data + 1 * im_stride));
- const __m128i src_3 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
- *(__m128i *)(data + 3 * im_stride));
- const __m128i src_5 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
- *(__m128i *)(data + 5 * im_stride));
- const __m128i src_7 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
- *(__m128i *)(data + 7 * im_stride));
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
- const __m128i res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- const __m128i res_hi_round =
- _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
- if (conv_params->use_jnt_comp_avg) {
- // FIXME(chengchen): validate this implementation
- // original c function at: av1/common/convolve.c: av1_convolve_2d_c
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- if (do_average) {
- _mm_storeu_si128(
- p + 0,
- _mm_srai_epi32(
- _mm_add_epi32(_mm_mullo_epi32(_mm_loadu_si128(p + 0), wt0),
- _mm_mullo_epi32(res_lo_round, wt1)),
- DIST_PRECISION_BITS));
- _mm_storeu_si128(
- p + 1,
- _mm_srai_epi32(
- _mm_add_epi32(_mm_mullo_epi32(_mm_loadu_si128(p + 1), wt0),
- _mm_mullo_epi32(res_hi_round, wt1)),
- DIST_PRECISION_BITS));
- } else {
- _mm_storeu_si128(p + 0, res_lo_round);
- _mm_storeu_si128(p + 1, res_hi_round);
- }
- } else {
- // Accumulate values into the destination buffer
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- if (do_average) {
- _mm_storeu_si128(
- p + 0,
- _mm_srai_epi32(
- _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round), 1));
- _mm_storeu_si128(
- p + 1,
- _mm_srai_epi32(
- _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round), 1));
- } else {
- _mm_storeu_si128(p + 0, res_lo_round);
- _mm_storeu_si128(p + 1, res_hi_round);
- }
- }
- }
- }
- }
-}
-#endif
diff --git a/av1/common/x86/jnt_convolve_ssse3.c b/av1/common/x86/jnt_convolve_ssse3.c
index 7b2b4fb..ae84d91 100644
--- a/av1/common/x86/jnt_convolve_ssse3.c
+++ b/av1/common/x86/jnt_convolve_ssse3.c
@@ -15,7 +15,6 @@
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
-#if CONFIG_LOWPRECISION_BLEND
void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
uint8_t *dst0, int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
@@ -230,4 +229,3 @@
}
}
}
-#endif
diff --git a/av1/common/x86/warp_plane_sse4.c b/av1/common/x86/warp_plane_sse4.c
index 7c279a9..f586b55 100644
--- a/av1/common/x86/warp_plane_sse4.c
+++ b/av1/common/x86/warp_plane_sse4.c
@@ -208,7 +208,6 @@
int subsampling_x, int subsampling_y,
ConvolveParams *conv_params, int16_t alpha,
int16_t beta, int16_t gamma, int16_t delta) {
-#if CONFIG_LOWPRECISION_BLEND
int comp_avg = conv_params->do_average;
__m128i tmp[15];
int i, j, k;
@@ -589,341 +588,4 @@
}
}
}
-#else // CONFIG_LOWPRECISION_BLEND
- int comp_avg = conv_params->do_average;
- __m128i tmp[15];
- int i, j, k;
- const int bd = 8;
- const int reduce_bits_horiz = conv_params->round_0;
- const int reduce_bits_vert = conv_params->is_compound
- ? conv_params->round_1
- : 2 * FILTER_BITS - reduce_bits_horiz;
- const int offset_bits_horiz = bd + FILTER_BITS - 1;
- assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m128i wt0 = _mm_set1_epi32(w0);
- const __m128i wt1 = _mm_set1_epi32(w1);
- assert(FILTER_BITS == FILTER_BITS);
-
- /* Note: For this code to work, the left/right frame borders need to be
- extended by at least 13 pixels each. By the time we get here, other
- code will have set up this border, but we allow an explicit check
- for debugging purposes.
- */
- /*for (i = 0; i < height; ++i) {
- for (j = 0; j < 13; ++j) {
- assert(ref[i * stride - 13 + j] == ref[i * stride]);
- assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
- }
- }*/
-
- for (i = 0; i < p_height; i += 8) {
- for (j = 0; j < p_width; j += 8) {
- const int32_t src_x = (p_col + j + 4) << subsampling_x;
- const int32_t src_y = (p_row + i + 4) << subsampling_y;
- const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
- const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
- const int32_t x4 = dst_x >> subsampling_x;
- const int32_t y4 = dst_y >> subsampling_y;
-
- int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
- int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
- int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
- int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
- // Add in all the constant terms, including rounding and offset
- sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
- (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
- sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
- (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-
- sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
- sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-
- // Horizontal filter
- // If the block is aligned such that, after clamping, every sample
- // would be taken from the leftmost/rightmost column, then we can
- // skip the expensive horizontal filter.
- if (ix4 <= -7) {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
- ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
- }
- } else if (ix4 >= width + 6) {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- tmp[k + 7] =
- _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
- ref[iy * stride + (width - 1)] *
- (1 << (FILTER_BITS - reduce_bits_horiz)));
- }
- } else {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- int sx = sx4 + beta * (k + 4);
-
- // Load source pixels
- const __m128i src =
- _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
- const __m128i src_even =
- _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
- const __m128i src_odd =
- _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
-
- // Filter even-index pixels
- const __m128i tmp_0 = _mm_loadl_epi64((
- __m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
- const __m128i tmp_1 = _mm_loadl_epi64((
- __m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
- const __m128i tmp_2 = _mm_loadl_epi64((
- __m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
- const __m128i tmp_3 = _mm_loadl_epi64((
- __m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
- const __m128i tmp_4 = _mm_loadl_epi64((
- __m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
- const __m128i tmp_5 = _mm_loadl_epi64((
- __m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
- const __m128i tmp_6 = _mm_loadl_epi64((
- __m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
- const __m128i tmp_7 = _mm_loadl_epi64((
- __m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
-
- // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
- const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
- // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
- const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
- // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
- const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
- // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
- const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
-
- // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
- const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
- // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
- const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
- // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
- const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
- // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
- const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
-
- // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
- const __m128i coeff_02 = _mm_unpacklo_epi64(tmp_12, tmp_14);
- // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
- const __m128i coeff_46 = _mm_unpackhi_epi64(tmp_12, tmp_14);
- // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
- const __m128i coeff_13 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
- const __m128i coeff_57 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
- // The pixel order we need for 'src' is:
- // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
- const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
- const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff_02);
- // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
- const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
- _mm_srli_si128(src_odd, 4));
- const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff_46);
- // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
- const __m128i src_13 =
- _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
- const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff_13);
- // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
- const __m128i src_57 = _mm_unpacklo_epi64(
- _mm_srli_si128(src_odd, 4), _mm_srli_si128(src_even, 6));
- const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
-
- const __m128i round_const = _mm_set1_epi16(
- (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
-
- // Note: The values res_02 + res_46 and res_13 + res_57 both
- // fit into int16s at this point, but their sum may be too wide to fit
- // into an int16. However, once we also add round_const, the sum of
- // all of these fits into a uint16.
- //
- // The wrapping behaviour of _mm_add_* is used here to make sure we
- // get the correct result despite converting between different
- // (implicit) types.
- const __m128i res_even = _mm_add_epi16(res_02, res_46);
- const __m128i res_odd = _mm_add_epi16(res_13, res_57);
- const __m128i res =
- _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
- tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
- }
- }
-
- // Vertical filter
- for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
- int sy = sy4 + delta * (k + 4);
-
- // Load from tmp and rearrange pairs of consecutive rows into the
- // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
- const __m128i *src = tmp + (k + 4);
- const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
- const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
- const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
- const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
-
- // Filter even-index pixels
- const __m128i tmp_0 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_2 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_4 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_6 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
- const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
- const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
- const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
- const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
- const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
- const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
- const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
- const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
- const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
- const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
- const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
-
- const __m128i tmp_1 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_3 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_5 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_7 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
- const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
- const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
- const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
- const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
- const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
- const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
- const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
- if (conv_params->is_compound) {
- __m128i *const p =
- (__m128i *)&conv_params
- ->dst[(i + k + 4) * conv_params->dst_stride + j];
- const __m128i round_const = _mm_set1_epi32(
- -(1 << (bd + 2 * FILTER_BITS - reduce_bits_horiz - 1)) +
- ((1 << (reduce_bits_vert)) >> 1));
- res_lo = _mm_add_epi32(res_lo, round_const);
- res_lo = _mm_sra_epi32(res_lo, _mm_cvtsi32_si128(reduce_bits_vert));
- if (conv_params->use_jnt_comp_avg) {
- if (comp_avg) {
- res_lo = _mm_add_epi32(_mm_mullo_epi32(_mm_loadu_si128(p), wt0),
- _mm_mullo_epi32(res_lo, wt1));
- res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
- }
- } else {
- if (comp_avg)
- res_lo =
- _mm_srai_epi32(_mm_add_epi32(_mm_loadu_si128(p), res_lo), 1);
- }
-
- _mm_storeu_si128(p, res_lo);
- if (p_width > 4) {
- res_hi = _mm_add_epi32(res_hi, round_const);
- res_hi = _mm_sra_epi32(res_hi, _mm_cvtsi32_si128(reduce_bits_vert));
- if (conv_params->use_jnt_comp_avg) {
- if (comp_avg) {
- res_hi =
- _mm_add_epi32(_mm_mullo_epi32(_mm_loadu_si128(p + 1), wt0),
- _mm_mullo_epi32(res_hi, wt1));
- res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
- }
- } else {
- if (comp_avg)
- res_hi = _mm_srai_epi32(
- _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi), 1);
- }
-
- _mm_storeu_si128(p + 1, res_hi);
- }
- } else {
- // Round and pack into 8 bits
- const __m128i round_const =
- _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
- ((1 << reduce_bits_vert) >> 1));
-
- const __m128i res_lo_round = _mm_srai_epi32(
- _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
- const __m128i res_hi_round = _mm_srai_epi32(
- _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
-
- const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
- __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
-
- // Store, blending with 'pred' if needed
- __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
-
- // Note: If we're outputting a 4x4 block, we need to be very careful
- // to only output 4 pixels at this point, to avoid encode/decode
- // mismatches when encoding with multiple threads.
- if (p_width == 4) {
- if (comp_avg) {
- const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p);
- res_8bit = _mm_avg_epu8(res_8bit, orig);
- }
- *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
- } else {
- if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
- _mm_storel_epi64(p, res_8bit);
- }
- }
- }
- }
- }
-#endif // CONFIG_LOWPRECISION_BLEND
}
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 40d891b..6fd9e2c 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -95,7 +95,6 @@
set(CONFIG_FILM_GRAIN_SHOWEX 1 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_FP_MB_STATS 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_INTER_STATS_ONLY 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_LOWPRECISION_BLEND 1 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_OPERATING_POINTS 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_RD_DEBUG 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_TRAILING_BITS 0 CACHE NUMBER "AV1 experiment flag.")
diff --git a/test/av1_convolve_2d_test.cc b/test/av1_convolve_2d_test.cc
index 2e38b07..1d62dea 100644
--- a/test/av1_convolve_2d_test.cc
+++ b/test/av1_convolve_2d_test.cc
@@ -19,26 +19,7 @@
using libaom_test::AV1HighbdConvolve2D::AV1HighbdJntConvolve2DTest;
using std::tr1::make_tuple;
using std::tr1::tuple;
-#if !CONFIG_LOWPRECISION_BLEND
-using libaom_test::AV1Convolve2D::AV1Convolve2DTest;
-using libaom_test::AV1HighbdConvolve2D::AV1HighbdConvolve2DTest;
-#endif // CONFIG_LOWPRECISION_BLEND
namespace {
-#if !CONFIG_LOWPRECISION_BLEND
-TEST_P(AV1Convolve2DTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
-
-TEST_P(AV1Convolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
-
-INSTANTIATE_TEST_CASE_P(
- C_COPY, AV1Convolve2DTest,
- libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_copy_c, 0, 0));
-INSTANTIATE_TEST_CASE_P(
- C_X, AV1Convolve2DTest,
- libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_c, 1, 0));
-INSTANTIATE_TEST_CASE_P(
- C_Y, AV1Convolve2DTest,
- libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_c, 0, 1));
-#endif // !CONFIG_LOWPRECISION_BLEND
TEST_P(AV1Convolve2DSrTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
@@ -53,7 +34,6 @@
INSTANTIATE_TEST_CASE_P(
C_Y, AV1Convolve2DSrTest,
libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_c, 0, 1));
-#if CONFIG_LOWPRECISION_BLEND
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2_COPY, AV1Convolve2DSrTest,
libaom_test::AV1Convolve2D::BuildParams(
@@ -84,7 +64,6 @@
libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_avx2, 1, 1));
#endif // HAVE_AVX2
#endif // HAVE_SSE2
-#endif // CONFIG_LOWPRECISION_BLEND
TEST_P(AV1JntConvolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
@@ -100,7 +79,6 @@
C_Y, AV1JntConvolve2DTest,
libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_c, 0, 1));
-#if CONFIG_LOWPRECISION_BLEND
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2_COPY, AV1JntConvolve2DTest,
libaom_test::AV1Convolve2D::BuildParams(
@@ -136,26 +114,6 @@
#endif // HAVE_AVX2
#endif // HAVE_SSE4_1
#endif // HAVE_SSE2
-#endif // CONFIG_LOWPRECISION_BLEND
-
-#if !CONFIG_LOWPRECISION_BLEND
-#if HAVE_SSSE3
-TEST_P(AV1HighbdConvolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); }
-
-INSTANTIATE_TEST_CASE_P(C_X, AV1HighbdConvolve2DTest,
- libaom_test::AV1HighbdConvolve2D::BuildParams(
- av1_highbd_convolve_x_c, 1, 0));
-
-INSTANTIATE_TEST_CASE_P(C_Y, AV1HighbdConvolve2DTest,
- libaom_test::AV1HighbdConvolve2D::BuildParams(
- av1_highbd_convolve_y_c, 0, 1));
-
-INSTANTIATE_TEST_CASE_P(C_COPY, AV1HighbdConvolve2DTest,
- libaom_test::AV1HighbdConvolve2D::BuildParams(
- av1_highbd_convolve_2d_copy_c, 0, 0));
-
-#endif // HAVE_SSSE3
-#endif // !CONFIG_LOWPRECISION_BLEND
TEST_P(AV1HighbdConvolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); }
TEST_P(AV1HighbdConvolve2DSrTest, DISABLED_Speed) {
@@ -173,7 +131,6 @@
INSTANTIATE_TEST_CASE_P(C_COPY, AV1HighbdConvolve2DSrTest,
libaom_test::AV1HighbdConvolve2D::BuildParams(
av1_highbd_convolve_2d_copy_sr_c, 0, 0));
-#if CONFIG_LOWPRECISION_BLEND
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2_COPY, AV1HighbdConvolve2DSrTest,
libaom_test::AV1HighbdConvolve2D::BuildParams(
@@ -204,7 +161,6 @@
#endif // HAVE_AVX2
#endif // HAVE_SSSE3
#endif // HAVE_SSE2
-#endif // CONFIG_LOWPRECISION_BLEND
TEST_P(AV1HighbdJntConvolve2DTest, CheckOutput) {
RunCheckOutput(GET_PARAM(1));
}
@@ -224,7 +180,6 @@
INSTANTIATE_TEST_CASE_P(C_COPY, AV1HighbdJntConvolve2DTest,
libaom_test::AV1HighbdConvolve2D::BuildParams(
av1_highbd_jnt_convolve_2d_copy_c, 0, 0));
-#if CONFIG_LOWPRECISION_BLEND
#if HAVE_SSE4_1
INSTANTIATE_TEST_CASE_P(SSE4_1_COPY, AV1HighbdJntConvolve2DTest,
libaom_test::AV1HighbdConvolve2D::BuildParams(
@@ -253,5 +208,4 @@
av1_highbd_jnt_convolve_y_avx2, 0, 1));
#endif // HAVE_AVX2
#endif // HAVE_SSE4_1
-#endif // CONFIG_LOWPRECISION_BLEND
} // namespace
diff --git a/test/av1_convolve_2d_test_util.cc b/test/av1_convolve_2d_test_util.cc
index 62f4038..893fb10 100644
--- a/test/av1_convolve_2d_test_util.cc
+++ b/test/av1_convolve_2d_test_util.cc
@@ -31,110 +31,6 @@
::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
}
-#if !CONFIG_LOWPRECISION_BLEND
-AV1Convolve2DTest::~AV1Convolve2DTest() {}
-void AV1Convolve2DTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
-
-void AV1Convolve2DTest::TearDown() { libaom_test::ClearSystemState(); }
-void AV1Convolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
- const int w = kMaxSize, h = kMaxSize;
- const int has_subx = GET_PARAM(1);
- const int has_suby = GET_PARAM(2);
- const int block_idx = GET_PARAM(3);
- int hfilter, vfilter, subx, suby;
- uint8_t input[kMaxSize * kMaxSize];
- DECLARE_ALIGNED(32, CONV_BUF_TYPE, output[MAX_SB_SQUARE]);
- DECLARE_ALIGNED(32, CONV_BUF_TYPE, output2[MAX_SB_SQUARE]);
-
- for (int i = 0; i < h; ++i)
- for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
- for (int i = 0; i < MAX_SB_SQUARE; ++i)
- output[i] = output2[i] = rnd_.Rand31();
-
- const int out_w = block_size_wide[block_idx];
- const int out_h = block_size_high[block_idx];
- for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
- for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) {
- InterpFilterParams filter_params_x =
- av1_get_interp_filter_params((InterpFilter)hfilter);
- InterpFilterParams filter_params_y =
- av1_get_interp_filter_params((InterpFilter)vfilter);
- for (int do_average = 0; do_average <= 1; ++do_average) {
- ConvolveParams conv_params1 = get_conv_params_no_round(
- 0, do_average, 0, output, MAX_SB_SIZE, 1, 8);
- ConvolveParams conv_params2 = get_conv_params_no_round(
- 0, do_average, 0, output2, MAX_SB_SIZE, 1, 8);
-
- const int subx_range = has_subx ? 16 : 1;
- const int suby_range = has_suby ? 16 : 1;
- for (subx = 0; subx < subx_range; ++subx) {
- for (suby = 0; suby < suby_range; ++suby) {
- // Choose random locations within the source block
- const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
- const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
- av1_convolve_2d_c(input + offset_r * w + offset_c, w, NULL, 0,
- out_w, out_h, &filter_params_x, &filter_params_y,
- subx, suby, &conv_params1);
- test_impl(input + offset_r * w + offset_c, w, NULL, 0, out_w, out_h,
- &filter_params_x, &filter_params_y, subx, suby,
- &conv_params2);
-
- for (int i = 0; i < out_h; ++i) {
- for (int j = 0; j < out_w; ++j) {
- int idx = i * MAX_SB_SIZE + j;
- ASSERT_EQ(output[idx], output2[idx])
- << out_w << "x" << out_h << " Pixel mismatch at index "
- << idx << " = (" << i << ", " << j
- << "), sub pixel offset = (" << suby << ", " << subx << ")";
- }
- }
- }
- }
- }
- }
- }
-}
-
-void AV1Convolve2DTest::RunSpeedTest(convolve_2d_func test_impl) {
- const int w = kMaxSize, h = kMaxSize;
- const int has_subx = GET_PARAM(1);
- const int has_suby = GET_PARAM(2);
- const int block_idx = GET_PARAM(3);
-
- uint8_t input[kMaxSize * kMaxSize];
- DECLARE_ALIGNED(32, CONV_BUF_TYPE, output[MAX_SB_SQUARE]);
-
- for (int i = 0; i < h; ++i)
- for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
-
- int hfilter = EIGHTTAP_REGULAR, vfilter = EIGHTTAP_REGULAR;
- int subx = 0, suby = 0;
-
- InterpFilterParams filter_params_x =
- av1_get_interp_filter_params((InterpFilter)hfilter);
- InterpFilterParams filter_params_y =
- av1_get_interp_filter_params((InterpFilter)vfilter);
- const int do_average = 0;
- ConvolveParams conv_params2 =
- get_conv_params_no_round(0, do_average, 0, output, MAX_SB_SIZE, 1, 8);
-
- const int out_w = block_size_wide[block_idx];
- const int out_h = block_size_high[block_idx];
- const int num_loops = 1000000000 / (out_w + out_h);
- aom_usec_timer timer;
- aom_usec_timer_start(&timer);
-
- for (int i = 0; i < num_loops; ++i)
- test_impl(input, w, NULL, 0, out_w, out_h, &filter_params_x,
- &filter_params_y, subx, suby, &conv_params2);
-
- aom_usec_timer_mark(&timer);
- const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
- printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w,
- out_h, 1000.0 * elapsed_time / num_loops);
-}
-#endif // CONFIG_LOWPRECISION_BLEND
-
AV1Convolve2DSrTest::~AV1Convolve2DSrTest() {}
void AV1Convolve2DSrTest::SetUp() {
rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -406,78 +302,6 @@
::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
}
-#if !CONFIG_LOWPRECISION_BLEND
-AV1HighbdConvolve2DTest::~AV1HighbdConvolve2DTest() {}
-void AV1HighbdConvolve2DTest::SetUp() {
- rnd_.Reset(ACMRandom::DeterministicSeed());
-}
-
-void AV1HighbdConvolve2DTest::TearDown() { libaom_test::ClearSystemState(); }
-
-void AV1HighbdConvolve2DTest::RunCheckOutput(
- highbd_convolve_2d_func test_impl) {
- const int w = kMaxSize, h = kMaxSize;
- const int bd = GET_PARAM(0);
- const int has_subx = GET_PARAM(2);
- const int has_suby = GET_PARAM(3);
- const int block_idx = GET_PARAM(4);
- int hfilter, vfilter, subx, suby;
- uint16_t input[kMaxSize * kMaxSize];
- DECLARE_ALIGNED(32, CONV_BUF_TYPE, output[MAX_SB_SQUARE]);
- DECLARE_ALIGNED(32, CONV_BUF_TYPE, output2[MAX_SB_SQUARE]);
-
- for (int i = 0; i < h; ++i)
- for (int j = 0; j < w; ++j)
- input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
- for (int i = 0; i < MAX_SB_SQUARE; ++i)
- output[i] = output2[i] = rnd_.Rand31();
-
- const int out_w = block_size_wide[block_idx];
- const int out_h = block_size_high[block_idx];
- for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
- for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) {
- InterpFilterParams filter_params_x =
- av1_get_interp_filter_params((InterpFilter)hfilter);
- InterpFilterParams filter_params_y =
- av1_get_interp_filter_params((InterpFilter)vfilter);
- for (int do_average = 0; do_average <= 1; ++do_average) {
- ConvolveParams conv_params1 = get_conv_params_no_round(
- 0, do_average, 0, output, MAX_SB_SIZE, 1, bd);
- ConvolveParams conv_params2 = get_conv_params_no_round(
- 0, do_average, 0, output2, MAX_SB_SIZE, 1, bd);
-
- const int subx_range = has_subx ? 16 : 1;
- const int suby_range = has_suby ? 16 : 1;
- for (subx = 0; subx < subx_range; ++subx) {
- for (suby = 0; suby < suby_range; ++suby) {
- // Choose random locations within the source block
- const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
- const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
- av1_highbd_convolve_2d_c(input + offset_r * w + offset_c, w, NULL,
- 0, out_w, out_h, &filter_params_x,
- &filter_params_y, subx, suby,
- &conv_params1, bd);
- test_impl(input + offset_r * w + offset_c, w, NULL, 0, out_w, out_h,
- &filter_params_x, &filter_params_y, subx, suby,
- &conv_params2, bd);
-
- for (int i = 0; i < out_h; ++i) {
- for (int j = 0; j < out_w; ++j) {
- int idx = i * MAX_SB_SIZE + j;
- ASSERT_EQ(output[idx], output2[idx])
- << out_w << "x" << out_h << " Pixel mismatch at index "
- << idx << " = (" << i << ", " << j
- << "), sub pixel offset = (" << suby << ", " << subx << ")";
- }
- }
- }
- }
- }
- }
- }
-}
-#endif // !CONFIG_LOWPRECISION_BLEND
-
AV1HighbdConvolve2DSrTest::~AV1HighbdConvolve2DSrTest() {}
void AV1HighbdConvolve2DSrTest::SetUp() {
rnd_.Reset(ACMRandom::DeterministicSeed());
diff --git a/test/av1_convolve_2d_test_util.h b/test/av1_convolve_2d_test_util.h
index 328d18c..9389d8a 100644
--- a/test/av1_convolve_2d_test_util.h
+++ b/test/av1_convolve_2d_test_util.h
@@ -36,22 +36,6 @@
::testing::internal::ParamGenerator<Convolve2DParam> BuildParams(
convolve_2d_func filter, int subx_exist, int suby_exist);
-#if !CONFIG_LOWPRECISION_BLEND
-class AV1Convolve2DTest : public ::testing::TestWithParam<Convolve2DParam> {
- public:
- virtual ~AV1Convolve2DTest();
- virtual void SetUp();
-
- virtual void TearDown();
-
- protected:
- void RunCheckOutput(convolve_2d_func test_impl);
- void RunSpeedTest(convolve_2d_func test_impl);
-
- libaom_test::ACMRandom rnd_;
-};
-#endif // !CONFIG_LOWPRECISION_BLEND
-
class AV1Convolve2DSrTest : public ::testing::TestWithParam<Convolve2DParam> {
public:
virtual ~AV1Convolve2DSrTest();
@@ -93,22 +77,6 @@
::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams(
highbd_convolve_2d_func filter, int subx_exist, int suby_exist);
-#if !CONFIG_LOWPRECISION_BLEND
-class AV1HighbdConvolve2DTest
- : public ::testing::TestWithParam<HighbdConvolve2DParam> {
- public:
- virtual ~AV1HighbdConvolve2DTest();
- virtual void SetUp();
-
- virtual void TearDown();
-
- protected:
- void RunCheckOutput(highbd_convolve_2d_func test_impl);
-
- libaom_test::ACMRandom rnd_;
-};
-#endif // !CONFIG_LOWPRECISION_BLEND
-
class AV1HighbdConvolve2DSrTest
: public ::testing::TestWithParam<HighbdConvolve2DParam> {
public:
diff --git a/test/av1_convolve_scale_test.cc b/test/av1_convolve_scale_test.cc
index acb50df..fd5a3f5 100644
--- a/test/av1_convolve_scale_test.cc
+++ b/test/av1_convolve_scale_test.cc
@@ -158,7 +158,6 @@
std::vector<CONV_BUF_TYPE> dst_16_data_;
};
-#if CONFIG_LOWPRECISION_BLEND
template <typename Pixel>
void FillEdge(ACMRandom *rnd, int num_pixels, int bd, bool trash, Pixel *data) {
if (!trash) {
@@ -526,5 +525,4 @@
::testing::ValuesIn(kBlockDim),
::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
::testing::Bool(), ::testing::ValuesIn(kBDs)));
-#endif
} // namespace
diff --git a/test/test.cmake b/test/test.cmake
index f5adbf0..23f69f8 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -218,14 +218,12 @@
endif ()
if (HAVE_SSE4_1)
- if (CONFIG_LOWPRECISION_BLEND)
set(AOM_UNIT_TEST_ENCODER_SOURCES
${AOM_UNIT_TEST_ENCODER_SOURCES}
"${AOM_ROOT}/test/av1_convolve_scale_test.cc"
"${AOM_ROOT}/test/warp_filter_test_util.cc"
"${AOM_ROOT}/test/warp_filter_test_util.h"
"${AOM_ROOT}/test/warp_filter_test.cc")
- endif ()
endif ()
if (HAVE_SSE4_1)
diff --git a/test/warp_filter_test.cc b/test/warp_filter_test.cc
index 920422f..99e3d31 100644
--- a/test/warp_filter_test.cc
+++ b/test/warp_filter_test.cc
@@ -10,7 +10,6 @@
*/
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
#include "test/warp_filter_test_util.h"
-#if CONFIG_LOWPRECISION_BLEND
using libaom_test::ACMRandom;
using libaom_test::AV1HighbdWarpFilter::AV1HighbdWarpFilterTest;
using libaom_test::AV1WarpFilter::AV1WarpFilterTest;
@@ -35,4 +34,3 @@
#endif // HAVE_SSE4_1
} // namespace
-#endif // CONFIG_LOWPRECISION_BLEND
diff --git a/test/warp_filter_test_util.cc b/test/warp_filter_test_util.cc
index 27299d9..574cdf1 100644
--- a/test/warp_filter_test_util.cc
+++ b/test/warp_filter_test_util.cc
@@ -78,7 +78,6 @@
}
namespace AV1WarpFilter {
-#if CONFIG_LOWPRECISION_BLEND
::testing::internal::ParamGenerator<WarpTestParam> BuildParams(
warp_affine_func filter) {
const WarpTestParam params[] = {
@@ -244,11 +243,9 @@
delete[] dsta;
delete[] dstb;
}
-#endif
} // namespace AV1WarpFilter
namespace AV1HighbdWarpFilter {
-#if CONFIG_LOWPRECISION_BLEND
::testing::internal::ParamGenerator<HighbdWarpTestParam> BuildParams(
highbd_warp_affine_func filter) {
const HighbdWarpTestParam params[] = {
@@ -432,6 +429,5 @@
delete[] dsta;
delete[] dstb;
}
-#endif
} // namespace AV1HighbdWarpFilter
} // namespace libaom_test
diff --git a/test/warp_filter_test_util.h b/test/warp_filter_test_util.h
index fd67b5f..e6b1944 100644
--- a/test/warp_filter_test_util.h
+++ b/test/warp_filter_test_util.h
@@ -23,7 +23,6 @@
#include "av1/common/mv.h"
#include "av1/common/common_data.h"
-#if CONFIG_LOWPRECISION_BLEND
namespace libaom_test {
void generate_warped_model(libaom_test::ACMRandom *rnd, int32_t *mat,
@@ -96,5 +95,4 @@
} // namespace libaom_test
-#endif
#endif // TEST_WARP_FILTER_TEST_UTIL_H_