AVX2 implementation of 4-tap filter
Added AVX2 implementation of aom_filter_block1d16_h4_avx2
and aom_filter_block1d16_v4_avx2 for block width >= 16.
Approximately 30% improvement is seen w.r.t 8-tap filter
at unit test level. When tested for 10 frames of parkrun
720p50 content with speed=1 preset, 0.5% encode time
reduction is seen.
Change-Id: I3442fb0e3f20d23ee78e4ca44a4f39d50fe85164
diff --git a/aom_dsp/x86/aom_asm_stubs.c b/aom_dsp/x86/aom_asm_stubs.c
index 401fbdc..5ed7d63 100644
--- a/aom_dsp/x86/aom_asm_stubs.c
+++ b/aom_dsp/x86/aom_asm_stubs.c
@@ -22,6 +22,9 @@
filter8_1dfunction aom_filter_block1d4_v8_sse2;
filter8_1dfunction aom_filter_block1d4_h8_sse2;
+#define aom_filter_block1d16_h4_sse2 aom_filter_block1d16_h8_sse2
+#define aom_filter_block1d16_v4_sse2 aom_filter_block1d16_v8_sse2
+
filter8_1dfunction aom_filter_block1d16_v2_sse2;
filter8_1dfunction aom_filter_block1d16_h2_sse2;
filter8_1dfunction aom_filter_block1d8_v2_sse2;
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index f3fe503..94d491d 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -311,6 +311,121 @@
}
}
+static void aom_filter_block1d16_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt2Reg, filt3Reg;
+ __m256i secondFilters, thirdFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, srcReg32b2, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // reading 2 strides of the next 16 bytes
+ // (part of it was being read by earlier read)
+ srcReg32b2 =
+ xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if the number of strides is odd.
+ // process only 16 bytes
+ if (i > 0) {
+ __m256i srcReg1, srcReg12;
+ __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;
+
+ srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
+ srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);
+
+ // filter the source buffer
+ srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
+ srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
+ srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
+ srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
+ srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr,
+ _mm256_castsi256_si128(srcRegFilt1_1));
+ }
+}
+
static void aom_filter_block1d16_h8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -659,6 +774,104 @@
}
}
+static void aom_filter_block1d16_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
+ __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
+ __m256i resReglo, resReghi, resReg;
+ __m256i secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the
+ // same data in both lanes of 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ // multiple the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // have consecutive loads on the same 256 register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+ srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+ srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+ resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+
+ // add and saturate the results together
+ resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
+ resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
+
+ // add and saturate the results together
+ resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+ resReghi = _mm256_srai_epi16(resReghi, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ resReg = _mm256_packus_epi16(resReglo, resReghi);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg23_34_hi = srcReg45_56_hi;
+ srcReg4x = srcReg6x;
+ }
+}
+
static void aom_filter_block1d16_v8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index 6bcb4a5..29dbcce 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -287,6 +287,9 @@
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d4_h8_ssse3;
+#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3
+#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3
+
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
filter8_1dfunction aom_filter_block1d8_v2_ssse3;
diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h
index c6a008a..7d8eb36 100644
--- a/aom_dsp/x86/convolve.h
+++ b/aom_dsp/x86/convolve.h
@@ -36,7 +36,7 @@
if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
(filter[2] | filter[5])) { \
while (w >= 16) { \
- aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter); \
src += 16; \
dst += 16; \
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index 051dff6..3c37e77 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -575,7 +575,7 @@
(subpel_search == 1)
? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
: av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
- int filter_taps = SUBPEL_TAPS;
+ int filter_taps = (subpel_search == 1) ? 4 : SUBPEL_TAPS;
if (!subpel_x_q3 && !subpel_y_q3) {
if (width >= 16) {
@@ -638,11 +638,20 @@
const int16_t *const kernel_y =
av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
- uint8_t *temp_start_horiz = temp;
+ uint8_t *temp_start_horiz =
+ (subpel_search == 1) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp;
uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
int intermediate_height =
(((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+ // TODO(Deepa): Remove the memset below when we have
+ // 4 tap simd for sse2 and ssse3.
+ if (subpel_search == 1) {
+ memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width);
+ memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width);
+ memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0, width);
+ memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0, width);
+ }
aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
kernel_x, 16, NULL, -1, width, intermediate_height);
aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,