Support 6-tap AVX2 horizontal interpolation filter
Add AVX2 implementations to handle the 6-tap interpolation filter
in the horizontal direction.
Change-Id: I8b669c80f02a4c3fa6aadbbce16e8cd5f44bfeb4
diff --git a/aom_dsp/x86/convolve_avx2.h b/aom_dsp/x86/convolve_avx2.h
index 1d5740f..db0f22a 100644
--- a/aom_dsp/x86/convolve_avx2.h
+++ b/aom_dsp/x86/convolve_avx2.h
@@ -413,6 +413,18 @@
return convolve_lowbd(s, coeffs);
}
+static INLINE __m256i convolve_lowbd_x_6tap(const __m256i data,
+ const __m256i *const coeffs,
+ const __m256i *const filt) {  // only filt[0], filt[1], filt[2] are read here
+ __m256i s[4];  // only s[0], s[1] and s[2] are written below
+ // Apply the first three byte-shuffle masks in filt[] to the same input vector.
+ s[0] = _mm256_shuffle_epi8(data, filt[0]);
+ s[1] = _mm256_shuffle_epi8(data, filt[1]);
+ s[2] = _mm256_shuffle_epi8(data, filt[2]);
+ // Pass the shuffled vectors and coeffs to convolve_lowbd_6tap(); return its result.
+ return convolve_lowbd_6tap(s, coeffs);
+}
+
static INLINE __m256i convolve_lowbd_x_4tap(const __m256i data,
const __m256i *const coeffs,
const __m256i *const filt) {
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index 38680d8..1a9bf5e 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -373,7 +373,7 @@
const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
const __m128i round_shift = _mm_cvtsi32_si128(bits);
- int i, is_horiz_4tap = 0;
+ int i, horiz_tap = SUBPEL_TAPS;
assert(bits >= 0);
assert((FILTER_BITS - conv_params->round_1) >= 0 ||
@@ -384,14 +384,21 @@
filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
+ horiz_tap = 4;
+ } else if (!(filter[0] | filter[7])) {
+ horiz_tap = 6;
+ }
- // Condition for checking valid horz_filt taps
- if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
- is_horiz_4tap = 1;
+ if (horiz_tap == 6)
+ prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
+ else
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
// horz_filt as 4 tap
- if (is_horiz_4tap) {
+ if (horiz_tap == 4) {
const int fo_horiz = 1;
const uint8_t *const src_ptr = src - fo_horiz;
if (w <= 8) {
@@ -461,6 +468,78 @@
}
}
}
+ } else if (horiz_tap == 6) {
+ const int fo_horiz = horiz_tap / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ if (w <= 8) {
+ for (i = 0; i < h; i += 2) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+ 0x20);
+
+ __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+ } else if (w > 2) {
+ xx_storel_32(&dst[i * dst_stride], res_0);
+ xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
+ // 18 19 20 21 22 23
+ const __m256i data = _mm256_inserti128_si256(
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+ 1);
+
+ __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ // Store values into the destination buffer
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+ __m128i res = _mm256_castsi256_si128(res_8b);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ }
+ }
+ }
} else {
const int fo_horiz = filter_params_x->taps / 2 - 1;
const uint8_t *const src_ptr = src - fo_horiz;