SSE2 implementation of 4-tap highbd filter
Added SSE2 implementations of the following functions:
- aom_highbd_filter_block1d16_h4_sse2
- aom_highbd_filter_block1d16_v4_sse2
- aom_highbd_filter_block1d8_h4_sse2
- aom_highbd_filter_block1d8_v4_sse2
- aom_highbd_filter_block1d4_h4_sse2
- aom_highbd_filter_block1d4_v4_sse2
An improvement of approximately 30% is seen
relative to the 8-tap filter at the unit-test level.
Change-Id: I1fd9de7972eadc3bbd9fc746eb4dea29f554a8b1
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 3685d76..d4fd97d 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -63,6 +63,7 @@
"${AOM_ROOT}/aom_dsp/x86/convolve.h"
"${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
"${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
diff --git a/aom_dsp/x86/aom_asm_stubs.c b/aom_dsp/x86/aom_asm_stubs.c
index 2453764..22cfe8b 100644
--- a/aom_dsp/x86/aom_asm_stubs.c
+++ b/aom_dsp/x86/aom_asm_stubs.c
@@ -57,6 +57,13 @@
highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h4_sse2;
+// The 4-tap kernels above are defined in aom_dsp/x86/highbd_convolve_sse2.c.
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h
index bad9c62..ff3224e 100644
--- a/aom_dsp/x86/convolve.h
+++ b/aom_dsp/x86/convolve.h
@@ -125,21 +125,21 @@
if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
(filter[2] | filter[5])) { \
while (w >= 16) { \
- aom_highbd_filter_block1d16_##dir##8_##avg##opt( \
+ aom_highbd_filter_block1d16_##dir##4_##avg##opt( \
src_start, src_stride, dst, dst_stride, h, filter, bd); \
src += 16; \
dst += 16; \
w -= 16; \
} \
while (w >= 8) { \
- aom_highbd_filter_block1d8_##dir##8_##avg##opt( \
+ aom_highbd_filter_block1d8_##dir##4_##avg##opt( \
src_start, src_stride, dst, dst_stride, h, filter, bd); \
src += 8; \
dst += 8; \
w -= 8; \
} \
while (w >= 4) { \
- aom_highbd_filter_block1d4_##dir##8_##avg##opt( \
+ aom_highbd_filter_block1d4_##dir##4_##avg##opt( \
src_start, src_stride, dst, dst_stride, h, filter, bd); \
src += 4; \
dst += 4; \
diff --git a/aom_dsp/x86/highbd_convolve_avx2.c b/aom_dsp/x86/highbd_convolve_avx2.c
index 099fcf7..e3b8c5a 100644
--- a/aom_dsp/x86/highbd_convolve_avx2.c
+++ b/aom_dsp/x86/highbd_convolve_avx2.c
@@ -992,6 +992,13 @@
#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2
#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2
+#define aom_highbd_filter_block1d16_h4_avx2 aom_highbd_filter_block1d16_h8_avx2
+#define aom_highbd_filter_block1d8_h4_avx2 aom_highbd_filter_block1d8_h8_avx2
+#define aom_highbd_filter_block1d4_h4_avx2 aom_highbd_filter_block1d4_h8_avx2
+#define aom_highbd_filter_block1d16_v4_avx2 aom_highbd_filter_block1d16_v8_avx2
+#define aom_highbd_filter_block1d8_v4_avx2 aom_highbd_filter_block1d8_v8_avx2
+#define aom_highbd_filter_block1d4_v4_avx2 aom_highbd_filter_block1d4_v8_avx2
+// The h4/v4 _avx2 names the macros below expand to alias the 8-tap _avx2 names.
HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
diff --git a/aom_dsp/x86/highbd_convolve_sse2.c b/aom_dsp/x86/highbd_convolve_sse2.c
new file mode 100644
index 0000000..a2bb283
--- /dev/null
+++ b/aom_dsp/x86/highbd_convolve_sse2.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/x86/convolve.h"
+
+// -----------------------------------------------------------------------------
+// 4-tap vertical filter, 4 pixels wide, 16-bit pixels clamped to bd bits.
+void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg34_lo;
+ __m128i srcReg45_lo, srcReg56_lo;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_45_lo, resReg34_56_lo;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg64, secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ // Output row r = filter[2..5] applied to rows r+2 .. r+5 counted from src_ptr.
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);  // upper clamp bound
+ addFilterReg64 = _mm_set1_epi32(64);  // rounding term for the >> 7 below
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // 8 coefficients
+ // only coefficients 2..5 are used; 0, 1, 6 and 7 are ignored
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
+
+ // each loop iteration produces two rows, so both pointers step by two rows
+ src_stride = src_pitch << 1;
+ dst_stride = dst_pitch << 1;
+ // preload rows 2-4; each loop iteration then reads rows 5 and 6
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);
+
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);
+ // two output rows per iteration; if height is odd the last row is not written
+ for (i = height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+ srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);
+
+ // madd on interleaved rows: lane x = rowA[x] * coeffA + rowB[x] * coeffB
+
+ resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters);
+ resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters);
+ resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters);
+ resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters);
+
+ resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo);
+ resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo);
+
+ // round: add 64, then arithmetic-shift each 32-bit sum right by 7
+ resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64);
+ resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64);
+ resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7);
+ resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7);
+
+ // narrow the 32-bit sums to 16 bits with signed saturation; packing
+ // against zero leaves the four results of a row in the low 64 bits
+ // and zeros in the high 64 bits
+ resReg23_45 = _mm_packs_epi32(resReg23_45_lo, _mm_setzero_si128());
+ resReg34_56 = _mm_packs_epi32(resReg34_56_lo, _mm_setzero_si128());
+ // clamp to [0, (1 << bd) - 1]
+ resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128());
+ resReg23_45 = _mm_min_epi16(resReg23_45, max);
+ resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128());
+ resReg34_56 = _mm_min_epi16(resReg34_56, max);
+
+ src_ptr += src_stride;
+
+ _mm_storel_epi64((__m128i *)dst_ptr, (resReg23_45));
+ _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), (resReg34_56));
+
+ dst_ptr += dst_stride;
+
+ // rows 4, 5 and 6 of this iteration are rows 2, 3 and 4 of the next
+ srcReg23_lo = srcReg45_lo;
+ srcReg34_lo = srcReg56_lo;
+ srcReg4 = srcReg6;
+ }
+}
+// 4-tap horizontal filter, 4 pixels wide, 16-bit pixels clamped to bd bits.
+void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m128i filtersReg;
+ __m128i addFilterReg64;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1;
+ __m128i srcReg32b1;
+ unsigned int i;
+ src_ptr -= 3;  // the load below adds 2 back, so it starts at src[-1]
+ addFilterReg64 = _mm_set1_epi32(64);  // rounding term for the >> 7 below
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // 8 coefficients
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);  // upper clamp bound
+ // only coefficients 2..5 are used; 0, 1, 6 and 7 are ignored
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+ // per row: out[x] = sum(filter[k] * src[x + k - 3]), k = 2..5 (caller's src)
+ for (i = height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2));
+ // a0..a7 = the eight pixels just loaded (caller's src[-1..6])
+ __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2);  // starts at a1
+ __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4);  // starts at a2
+ __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6);  // starts at a3
+ __m128i ss_23 = _mm_unpacklo_epi32(srcReg32b1, ss_3_1);
+ __m128i ss_45 = _mm_unpacklo_epi32(ss_4_1, ss_5_1);
+ // ss_23 = a0 a1|a1 a2|a2 a3|a3 a4, ss_45 = a2 a3|a3 a4|a4 a5|a5 a6
+ ss_23 = _mm_madd_epi16(ss_23, secondFilters);
+ ss_45 = _mm_madd_epi16(ss_45, thirdFilters);
+ srcRegFilt32b1_1 = _mm_add_epi32(ss_23, ss_45);
+
+ // round: add 64, then arithmetic-shift each 32-bit sum right by 7
+ srcRegFilt32b1_1 = _mm_add_epi32(srcRegFilt32b1_1, addFilterReg64);
+ srcRegFilt32b1_1 = _mm_srai_epi32(srcRegFilt32b1_1, 7);
+ // narrow to 16 bits (signed saturation), then clamp to [0, (1 << bd) - 1]
+ srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128());
+ srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+ srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max);
+
+ src_ptr += src_pitch;
+ // only the low 64 bits (four pixels) are written
+ _mm_storel_epi64((__m128i *)dst_ptr, srcRegFilt32b1_1);
+
+ dst_ptr += dst_pitch;
+ }
+}
+// 4-tap vertical filter, 8 pixels wide, 16-bit pixels clamped to bd bits.
+void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
+ __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
+ __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg64, secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ // Output row r = filter[2..5] applied to rows r+2 .. r+5 counted from src_ptr.
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);  // upper clamp bound
+ addFilterReg64 = _mm_set1_epi32(64);  // rounding term for the >> 7 below
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // 8 coefficients
+ // only coefficients 2..5 are used; 0, 1, 6 and 7 are ignored
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
+
+ // each loop iteration produces two rows, so both pointers step by two rows
+ src_stride = src_pitch << 1;
+ dst_stride = dst_pitch << 1;
+ // preload rows 2-4; each loop iteration then reads rows 5 and 6
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);
+ srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3);
+
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);
+ srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4);
+ // two output rows per iteration; if height is odd the last row is not written
+ for (i = height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+
+ srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);
+ srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+
+ srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);
+ srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6);
+
+ // columns 0-3: lane x = rowA[x] * coeffA + rowB[x] * coeffB
+
+ resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters);
+ resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters);
+ resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters);
+ resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters);
+
+ resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo);
+ resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo);
+
+ // columns 4-7: same multiply-add on the high halves of the rows
+
+ resReg23_hi = _mm_madd_epi16(srcReg23_hi, secondFilters);
+ resReg34_hi = _mm_madd_epi16(srcReg34_hi, secondFilters);
+ resReg45_hi = _mm_madd_epi16(srcReg45_hi, thirdFilters);
+ resReg56_hi = _mm_madd_epi16(srcReg56_hi, thirdFilters);
+
+ resReg23_45_hi = _mm_add_epi32(resReg23_hi, resReg45_hi);
+ resReg34_56_hi = _mm_add_epi32(resReg34_hi, resReg56_hi);
+
+ // round: add 64, then arithmetic-shift each 32-bit sum right by 7
+ resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64);
+ resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64);
+ resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64);
+ resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64);
+ resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7);
+ resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7);
+ resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7);
+ resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7);
+
+ // narrow the 32-bit sums to 16 bits with signed saturation; the _lo
+ // register supplies columns 0-3 and the _hi register columns 4-7, so
+ // each packed register holds one complete 8-pixel output row
+ resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi);
+ resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi);
+ // clamp to [0, (1 << bd) - 1]
+ resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128());
+ resReg23_45 = _mm_min_epi16(resReg23_45, max);
+ resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128());
+ resReg34_56 = _mm_min_epi16(resReg34_56, max);
+
+ src_ptr += src_stride;
+ // unaligned stores: nothing in the signature guarantees 16-byte alignment
+ _mm_storeu_si128((__m128i *)dst_ptr, (resReg23_45));
+ _mm_storeu_si128((__m128i *)(dst_ptr + dst_pitch), (resReg34_56));
+
+ dst_ptr += dst_stride;
+
+ // rows 4, 5 and 6 of this iteration are rows 2, 3 and 4 of the next
+ srcReg23_lo = srcReg45_lo;
+ srcReg23_hi = srcReg45_hi;
+ srcReg34_lo = srcReg56_lo;
+ srcReg34_hi = srcReg56_hi;
+ srcReg4 = srcReg6;
+ }
+}
+// 4-tap horizontal filter, 8 pixels wide, 16-bit pixels clamped to bd bits.
+void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m128i filtersReg;
+ __m128i addFilterReg64;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
+ __m128i srcReg32b1, srcReg32b2;
+ unsigned int i;
+ src_ptr -= 3;  // the loads below add 2 and 6, i.e. src[-1] and src[3]
+ addFilterReg64 = _mm_set1_epi32(64);  // rounding term for the >> 7 below
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // 8 coefficients
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);  // upper clamp bound
+ // only coefficients 2..5 are used; 0, 1, 6 and 7 are ignored
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+ // per row: out[x] = sum(filter[k] * src[x + k - 3]), k = 2..5 (caller's src)
+ for (i = height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2));
+ srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 6));
+ // even outputs (0, 2, 4, 6): coefficient pair 2/3 plus pair 4/5
+ __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4);
+ __m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4);
+ __m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2);
+
+ __m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters);
+ __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
+ srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
+ // odd outputs (1, 3, 5, 7): same sums on inputs shifted by one pixel
+ __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2);
+ __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6);
+ __m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2);
+ __m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6);
+ __m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2);
+ __m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2);
+
+ d1 = _mm_madd_epi16(ss_3, secondFilters);
+ d2 = _mm_madd_epi16(ss_5, thirdFilters);
+ srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
+ // interleave even and odd sums back into pixel order 0..3 and 4..7
+ __m128i res_lo_1 = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+ __m128i res_hi_1 = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+
+ // round: add 64, then arithmetic-shift each 32-bit sum right by 7
+ res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64);
+ res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64);
+ res_lo_1 = _mm_srai_epi32(res_lo_1, 7);
+ res_hi_1 = _mm_srai_epi32(res_hi_1, 7);
+ // narrow to 16 bits with signed saturation: one full 8-pixel row
+ srcRegFilt32b1_1 = _mm_packs_epi32(res_lo_1, res_hi_1);
+ // clamp to [0, (1 << bd) - 1]
+ srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+ srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max);
+
+ src_ptr += src_pitch;
+ // unaligned store: nothing in the signature guarantees 16-byte alignment
+ _mm_storeu_si128((__m128i *)dst_ptr, srcRegFilt32b1_1);
+
+ dst_ptr += dst_pitch;
+ }
+}
+// 16-wide vertical 4-tap filter: runs the 8-wide kernel on each 8-column half.
+void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ for (int col = 0; col < 16; col += 8) {  // columns 0-7, then columns 8-15
+ aom_highbd_filter_block1d8_v4_sse2(src_ptr + col, src_pitch, dst_ptr + col,
+ dst_pitch, height, filter, bd);
+ }
+}
+// 16-wide horizontal 4-tap filter: runs the 8-wide kernel on each 8-column half.
+void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ for (int col = 0; col < 16; col += 8) {  // columns 0-7, then columns 8-15
+ aom_highbd_filter_block1d8_h4_sse2(src_ptr + col, src_pitch, dst_ptr + col,
+ dst_pitch, height, filter, bd);
+ }
+}
diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c
index 098cf96..b17573e 100644
--- a/aom_dsp/x86/highbd_variance_sse2.c
+++ b/aom_dsp/x86/highbd_variance_sse2.c
@@ -678,7 +678,7 @@
}
const InterpFilterParams *filter = av1_get_filter(subpel_search);
- int filter_taps = SUBPEL_TAPS;
+ int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
if (!subpel_x_q3 && !subpel_y_q3) {
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
@@ -727,11 +727,23 @@
const int16_t *const kernel_y =
av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1);
- uint16_t *temp_start_horiz = temp;
+ uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
+ ? temp + (filter_taps >> 1) * MAX_SB_SIZE
+ : temp;
uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
const int intermediate_height =
(((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+ // TODO(Sachin): Remove the memset below when we have
+ // 4 tap simd for avx2.
+ if (subpel_search <= USE_4_TAPS) {
+ memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width * sizeof(uint16_t));
+ memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width * sizeof(uint16_t));
+ memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0,
+ width * sizeof(uint16_t));
+ memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0,
+ width * sizeof(uint16_t));
+ }
aom_highbd_convolve8_horiz(
ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz),
MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd);
diff --git a/test/comp_avg_pred_test.h b/test/comp_avg_pred_test.h
index 11fa743..54bf281 100644
--- a/test/comp_avg_pred_test.h
+++ b/test/comp_avg_pred_test.h
@@ -520,7 +520,7 @@
const int num_loops = 1000000000 / (in_w + in_h);
aom_usec_timer timer;
aom_usec_timer_start(&timer);
- int subpel_search = 2; // set to 1 to test 4-tap filter.
+ int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter.
for (int i = 0; i < num_loops; ++i)
aom_highbd_jnt_comp_avg_upsampled_pred_c(
NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output),