Handle 6-tap interpolation filter
Make the vertical interpolation filter avx2 implementation handle
the case where the filter is 6-tap.
Change-Id: Id5575bffa344c3e9ad7ebd9e59f790f5fb9faca8
diff --git a/aom_dsp/x86/convolve_avx2.h b/aom_dsp/x86/convolve_avx2.h
index d516de5..1d5740f 100644
--- a/aom_dsp/x86/convolve_avx2.h
+++ b/aom_dsp/x86/convolve_avx2.h
@@ -295,6 +295,32 @@
coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
}
+static INLINE void prepare_coeffs_6t_lowbd(
+ const InterpFilterParams *const filter_params, const int subpel_q4,
+ __m256i *const coeffs /* [4] */) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+ const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
+
+ // right shift all filter co-efficients by 1 to reduce the bits required.
+ // This extra right shift will be taken care of at the end while rounding
+ // the result.
+ // Since all filter co-efficients are even, this change will not affect the
+ // end result
+ assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+ _mm_set1_epi16((int16_t)0xffff)));
+
+ const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
+
+ // coeffs 1 2 1 2 1 2 1 2
+ coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
+ // coeffs 3 4 3 4 3 4 3 4
+ coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
+ // coeffs 5 6 5 6 5 6 5 6
+ coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au));
+}
+
static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
const int subpel_q4,
__m256i *const coeffs /* [4] */) {
@@ -328,6 +354,19 @@
return res;
}
+static INLINE __m256i convolve_lowbd_6tap(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+ const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+ const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+
+ // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ const __m256i res =
+ _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23);
+
+ return res;
+}
+
static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s,
const __m256i *const coeffs) {
const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index 51d99b1..38680d8 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -21,7 +21,7 @@
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_y,
const int subpel_y_qn) {
- int i, j, is_vert_4tap = 0;
+ int i, j, vert_tap = SUBPEL_TAPS;
// right shift is F-1 because we are already dividing
// filter co-efficients by 2
const int right_shift_bits = (FILTER_BITS - 1);
@@ -32,14 +32,22 @@
__m256i coeffs[4], s[8];
__m128i d[6];
- prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
-
// Condition for checking valid vert_filt taps
- if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
- is_vert_4tap = 1;
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
+ vert_tap = 4;
+ } else if (!(filter[0] | filter[7])) {
+ vert_tap = 6;
+ }
+
+ if (vert_tap == 6)
+ prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
+ else
+ prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
// vert_filt as 4 tap
- if (is_vert_4tap) {
+ if (vert_tap == 4) {
const int fo_vert = 1;
const uint8_t *const src_ptr = src - fo_vert * src_stride;
for (j = 0; j < w; j += 16) {
@@ -133,6 +141,108 @@
s[4] = s[5];
}
}
+ } else if (vert_tap == 6) {
+ const int fo_vert = vert_tap / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+
+ d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
+ const __m256i src_34a =
+ _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+
+ s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ const __m256i src_56a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ src6, 0x20);
+
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+ s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {
+ const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
} else {
const int fo_vert = filter_params_y->taps / 2 - 1;
const uint8_t *const src_ptr = src - fo_vert * src_stride;