AVX2 implementation of 4-tap filter for widths 8, 4
Added AVX2 implementation of aom_filter_block1d8_h4_avx2,
aom_filter_block1d8_v4_avx2, aom_filter_block1d4_h4_avx2
and aom_filter_block1d4_v4_avx2. Approximately 16%
improvement is seen w.r.t 8-tap filter at unit test level.
When tested for 20 frames of BasketballDrill_832x480_50
content with speed=1 preset, 0.3% encode time
reduction is seen.
Change-Id: Ia988f263da7d63a8d294c6c1e2380cdb4b9be669
diff --git a/aom_dsp/x86/aom_asm_stubs.c b/aom_dsp/x86/aom_asm_stubs.c
index 5ed7d63..5f5bf5f 100644
--- a/aom_dsp/x86/aom_asm_stubs.c
+++ b/aom_dsp/x86/aom_asm_stubs.c
@@ -24,6 +24,10 @@
#define aom_filter_block1d16_h4_sse2 aom_filter_block1d16_h8_sse2
#define aom_filter_block1d16_v4_sse2 aom_filter_block1d16_v8_sse2
+#define aom_filter_block1d8_h4_sse2 aom_filter_block1d8_h8_sse2
+#define aom_filter_block1d8_v4_sse2 aom_filter_block1d8_v8_sse2
+#define aom_filter_block1d4_h4_sse2 aom_filter_block1d4_h8_sse2
+#define aom_filter_block1d4_v4_sse2 aom_filter_block1d4_v8_sse2
filter8_1dfunction aom_filter_block1d16_v2_sse2;
filter8_1dfunction aom_filter_block1d16_h2_sse2;
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index 94d491d..94b5da1 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -74,6 +74,87 @@
_mm256_extractf128_si256(*a, 1));
}
+static void aom_filter_block1d4_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ firstFilters =
+ _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
+ filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));
+
+ // multiple the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+ srcRegFilt32b1_1 =
+ _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if the number of strides is odd.
+ // process only 4 bytes
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+
+ srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 4 bytes
+ *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+ }
+}
+
static void aom_filter_block1d4_h8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -179,6 +260,100 @@
}
}
+static void aom_filter_block1d8_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt2Reg, filt3Reg;
+ __m256i secondFilters, thirdFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if the number of strides is odd.
+ // process only 8 bytes
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
+ }
+}
+
static void aom_filter_block1d8_h8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -622,6 +797,92 @@
}
}
+static void aom_filter_block1d8_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg45_56_lo;
+ __m256i resReg23_34_lo, resReg45_56_lo;
+ __m256i resReglo, resReg;
+ __m256i secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the
+ // same data in both lanes of 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ // multiple the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // have consecutive loads on the same 256 register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+ resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+
+ // add and saturate the results together
+ resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4x = srcReg6x;
+ }
+}
+
static void aom_filter_block1d8_v8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
@@ -1067,6 +1328,88 @@
}
}
+static void aom_filter_block1d4_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg45_56_lo;
+ __m256i srcReg2345_3456_lo;
+ __m256i resReglo, resReg;
+ __m256i firstFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the
+ // same data in both lanes of 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ firstFilters =
+ _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
+
+ // multiple the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // have consecutive loads on the same 256 register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+
+ srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
+
+ resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4x = srcReg6x;
+ }
+}
+
#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index 29dbcce..325a21b 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -289,6 +289,10 @@
#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3
#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3
+#define aom_filter_block1d8_h4_ssse3 aom_filter_block1d8_h8_ssse3
+#define aom_filter_block1d8_v4_ssse3 aom_filter_block1d8_v8_ssse3
+#define aom_filter_block1d4_h4_ssse3 aom_filter_block1d4_h8_ssse3
+#define aom_filter_block1d4_v4_ssse3 aom_filter_block1d4_v8_ssse3
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h
index 7d8eb36..3e19682 100644
--- a/aom_dsp/x86/convolve.h
+++ b/aom_dsp/x86/convolve.h
@@ -43,14 +43,14 @@
w -= 16; \
} \
while (w >= 8) { \
- aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter); \
src += 8; \
dst += 8; \
w -= 8; \
} \
while (w >= 4) { \
- aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter); \
src += 4; \
dst += 4; \
diff --git a/aom_dsp/x86/convolve_avx2.h b/aom_dsp/x86/convolve_avx2.h
index d4f3865..30253f6 100644
--- a/aom_dsp/x86/convolve_avx2.h
+++ b/aom_dsp/x86/convolve_avx2.h
@@ -29,6 +29,11 @@
7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
};
+DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+};
+
static INLINE void prepare_coeffs_lowbd(
const InterpFilterParams *const filter_params, const int subpel_q4,
__m256i *const coeffs /* [4] */) {