Add SSE2 variant for MULTITAP_SHARP2 y-convolve
This CL adds SSE2 intrinsics for the MULTITAP_SHARP2 (12-tap) case of
av1_convolve_y_sr_c, and updates AV1ConvolveYTest to cover the added
intrinsic and to measure module-level gains.

The module-level speed-up is ~11.5x w.r.t. the C code.
Change-Id: I09e274344f62fd1eafbc620944580af6b3bcd58f
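
(For context, a rough scalar sketch of what is being vectorized. The loop
below is simplified from av1_convolve_y_sr_c; clip_pixel() and
ROUND_POWER_OF_TWO() are the usual libaom helpers, and the function name
here is illustrative only.)

    static void convolve_y_sr_scalar(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w,
                                     int h,
                                     const InterpFilterParams *filter_params_y,
                                     int subpel_y_qn) {
      // Back up to the row of the first filter tap (5 rows for 12 taps).
      const int fo_vert = filter_params_y->taps / 2 - 1;
      const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
          filter_params_y, subpel_y_qn & SUBPEL_MASK);
      src -= fo_vert * src_stride;
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
          int32_t res = 0;
          for (int k = 0; k < filter_params_y->taps; ++k)
            res += y_filter[k] * src[(y + k) * src_stride + x];
          // Round by FILTER_BITS and clamp to [0, 255].
          dst[y * dst_stride + x] =
              clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
        }
      }
    }

The SSE2 kernel below computes this sum for eight output pixels at a time,
splitting each pair of interleaved rows into low/high halves before the
16-bit multiply-adds.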
diff --git a/aom_dsp/x86/convolve_common_intrin.h b/aom_dsp/x86/convolve_common_intrin.h
index 114828d..9e3b73e 100644
--- a/aom_dsp/x86/convolve_common_intrin.h
+++ b/aom_dsp/x86/convolve_common_intrin.h
@@ -74,6 +74,37 @@
return convolve_12tap(ss, coeffs);
}
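+
+// Zero-extends the low eight bytes of the six even-indexed row-pair
+// registers from 8 to 16 bits and applies the 12-tap filter whose tap
+// pairs are held in coeffs[0..5].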
+static INLINE __m128i convolve_lo_y_12tap(const __m128i *s,
+ const __m128i *coeffs) {
+ __m128i ss[6];
+ const __m128i zero = _mm_setzero_si128();
+ ss[0] = _mm_unpacklo_epi8(s[0], zero);
+ ss[1] = _mm_unpacklo_epi8(s[2], zero);
+ ss[2] = _mm_unpacklo_epi8(s[4], zero);
+ ss[3] = _mm_unpacklo_epi8(s[6], zero);
+ ss[4] = _mm_unpacklo_epi8(s[8], zero);
+ ss[5] = _mm_unpacklo_epi8(s[10], zero);
+ return convolve_12tap(ss, coeffs);
+}
+
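+// Same as convolve_lo_y_12tap(), but filters the high eight bytes of the
+// interleaved row-pair registers.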
+static INLINE __m128i convolve_hi_y_12tap(const __m128i *s,
+ const __m128i *coeffs) {
+ __m128i ss[6];
+ const __m128i zero = _mm_setzero_si128();
+ ss[0] = _mm_unpackhi_epi8(s[0], zero);
+ ss[1] = _mm_unpackhi_epi8(s[2], zero);
+ ss[2] = _mm_unpackhi_epi8(s[4], zero);
+ ss[3] = _mm_unpackhi_epi8(s[6], zero);
+ ss[4] = _mm_unpackhi_epi8(s[8], zero);
+ ss[5] = _mm_unpackhi_epi8(s[10], zero);
+ return convolve_12tap(ss, coeffs);
+}
+
+void av1_convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ int subpel_y_qn);
+
void av1_convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 7954ad1..391c063 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -544,13 +544,8 @@
av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
subpel_x_qn, conv_params);
} else if (!need_x && need_y) {
- if (filter_params_x->taps > 8 || filter_params_y->taps > 8) {
- av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
- filter_params_y, subpel_y_qn);
- } else {
- av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
- subpel_y_qn);
- }
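+    // The SIMD versions of av1_convolve_y_sr() now handle filters with
+    // more than 8 taps internally, so the C-only special case is gone.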
+ av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
+ subpel_y_qn);
} else {
assert(need_x && need_y);
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index ed5fafc..3eeae67 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -22,341 +22,351 @@
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_y,
const int subpel_y_qn) {
- int i, j, vert_tap = SUBPEL_TAPS;
- // right shift is F-1 because we are already dividing
- // filter co-efficients by 2
- const int right_shift_bits = (FILTER_BITS - 1);
- const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
- const __m256i right_shift_const =
- _mm256_set1_epi16((1 << right_shift_bits) >> 1);
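+  // MULTITAP_SHARP2 (12-tap) is routed to the SSE2 kernel, which handles
+  // widths that are multiples of 8; narrower blocks fall back to C.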
+ if (filter_params_y->taps > 8) {
+    if (w < 8) {
+ av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_qn);
+ } else {
+ av1_convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_qn);
+ }
+ } else {
+ int i, j, vert_tap = SUBPEL_TAPS;
+ // right shift is F-1 because we are already dividing
+    // filter coefficients by 2
+ const int right_shift_bits = (FILTER_BITS - 1);
+ const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
+ const __m256i right_shift_const =
+ _mm256_set1_epi16((1 << right_shift_bits) >> 1);
- __m256i coeffs[4], s[8];
- __m128i d[6];
+ __m256i coeffs[4], s[8];
+ __m128i d[6];
- // Condition for checking valid vert_filt taps
- const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_qn & SUBPEL_MASK);
- if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
- vert_tap = 4;
- } else if (!(filter[0] | filter[7])) {
- vert_tap = 6;
- }
+ // Condition for checking valid vert_filt taps
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
+ vert_tap = 4;
+ } else if (!(filter[0] | filter[7])) {
+ vert_tap = 6;
+ }
- if (vert_tap == 6)
- prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
- else
- prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
+ if (vert_tap == 6)
+ prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
+ else
+ prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
- // vert_filt as 4 tap
- if (vert_tap == 4) {
- const int fo_vert = 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride;
- for (j = 0; j < w; j += 16) {
- const uint8_t *data = &src_ptr[j];
- d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
- d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
- d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
- d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
- d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ // vert_filt as 4 tap
+ if (vert_tap == 4) {
+ const int fo_vert = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
- // Load lines a and b. Line a to lower 128, line b to upper 128
- const __m256i src_01a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
- const __m256i src_12a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
- const __m256i src_23a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
- const __m256i src_34a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
- s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
- s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
- s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
- s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
+ s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
- for (i = 0; i < h; i += 2) {
- data = &src_ptr[i * src_stride + j];
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+ d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+ const __m256i src_56a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);
+
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+ s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {
+ const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
+ } else if (vert_tap == 6) {
+ const int fo_vert = vert_tap / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+
+ d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
+ const __m256i src_34a =
+ _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+
+ s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ const __m256i src_56a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ src6, 0x20);
+
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+ s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {
+ const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
+ } else {
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+
+ d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+
const __m256i src_45a = _mm256_permute2x128_si256(
_mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
- d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
- const __m256i src_56a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);
-
- s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
- s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
-
- const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
- /* rounding code */
- // shift by F - 1
- const __m256i res_16b_lo = _mm256_sra_epi16(
- _mm256_add_epi16(res_lo, right_shift_const), right_shift);
- // 8 bit conversion and saturation to uint8
- __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
-
- if (w - j > 8) {
- const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
-
- /* rounding code */
- // shift by F - 1
- const __m256i res_16b_hi = _mm256_sra_epi16(
- _mm256_add_epi16(res_hi, right_shift_const), right_shift);
- // 8 bit conversion and saturation to uint8
- __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
-
- __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
-
- const __m128i res_0 = _mm256_castsi256_si128(res_a);
- const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
-
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
- res_1);
- } else {
- const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
- const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
- if (w - j > 4) {
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
- res_1);
- } else if (w - j > 2) {
- xx_storel_32(&dst[i * dst_stride + j], res_0);
- xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
- } else {
- __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
- __m128i *const p_1 =
- (__m128i *)&dst[i * dst_stride + j + dst_stride];
- *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
- *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
- }
- }
- s[0] = s[1];
- s[1] = s[2];
-
- s[3] = s[4];
- s[4] = s[5];
- }
- }
- } else if (vert_tap == 6) {
- const int fo_vert = vert_tap / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride;
-
- for (j = 0; j < w; j += 16) {
- const uint8_t *data = &src_ptr[j];
- __m256i src6;
-
- d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
- d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
- d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
- d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
- // Load lines a and b. Line a to lower 128, line b to upper 128
- const __m256i src_01a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
-
- const __m256i src_12a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
-
- const __m256i src_23a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
-
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
- const __m256i src_34a =
- _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);
-
- s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
- s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
-
- s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
- s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
-
- for (i = 0; i < h; i += 2) {
- data = &src_ptr[i * src_stride + j];
- const __m256i src_45a = _mm256_permute2x128_si256(
- src6,
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- 0x20);
-
src6 = _mm256_castsi128_si256(
_mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
- const __m256i src_56a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- src6, 0x20);
+ const __m256i src_56a =
+ _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
- s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
- const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);
+ s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
+ s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
- /* rounding code */
- // shift by F - 1
- const __m256i res_16b_lo = _mm256_sra_epi16(
- _mm256_add_epi16(res_lo, right_shift_const), right_shift);
- // 8 bit conversion and saturation to uint8
- __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ const __m256i src_67a = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
- if (w - j > 8) {
- const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+ const __m256i res_lo = convolve_lowbd(s, coeffs);
/* rounding code */
// shift by F - 1
- const __m256i res_16b_hi = _mm256_sra_epi16(
- _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
// 8 bit conversion and saturation to uint8
- __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
- __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+ if (w - j > 8) {
+ const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
- const __m128i res_0 = _mm256_castsi256_si128(res_a);
- const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
- res_1);
- } else {
- const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
- const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
- if (w - j > 4) {
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
res_1);
- } else if (w - j > 2) {
- xx_storel_32(&dst[i * dst_stride + j], res_0);
- xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
} else {
- __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
- __m128i *const p_1 =
- (__m128i *)&dst[i * dst_stride + j + dst_stride];
- *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
- *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
}
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
}
- s[0] = s[1];
- s[1] = s[2];
- s[3] = s[4];
- s[4] = s[5];
- }
- }
- } else {
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride;
-
- for (j = 0; j < w; j += 16) {
- const uint8_t *data = &src_ptr[j];
- __m256i src6;
-
- d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
- d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
- d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
- d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
- d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
- d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
- // Load lines a and b. Line a to lower 128, line b to upper 128
- const __m256i src_01a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
-
- const __m256i src_12a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
-
- const __m256i src_23a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
-
- const __m256i src_34a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
-
- const __m256i src_45a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
-
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
- const __m256i src_56a =
- _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);
-
- s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
- s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
- s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-
- s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
- s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
- s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
-
- for (i = 0; i < h; i += 2) {
- data = &src_ptr[i * src_stride + j];
- const __m256i src_67a = _mm256_permute2x128_si256(
- src6,
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- 0x20);
-
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
- const __m256i src_78a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- src6, 0x20);
-
- s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
- s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
-
- const __m256i res_lo = convolve_lowbd(s, coeffs);
-
- /* rounding code */
- // shift by F - 1
- const __m256i res_16b_lo = _mm256_sra_epi16(
- _mm256_add_epi16(res_lo, right_shift_const), right_shift);
- // 8 bit conversion and saturation to uint8
- __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
-
- if (w - j > 8) {
- const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
-
- /* rounding code */
- // shift by F - 1
- const __m256i res_16b_hi = _mm256_sra_epi16(
- _mm256_add_epi16(res_hi, right_shift_const), right_shift);
- // 8 bit conversion and saturation to uint8
- __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
-
- __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
-
- const __m128i res_0 = _mm256_castsi256_si128(res_a);
- const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
-
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
- res_1);
- } else {
- const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
- const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
- if (w - j > 4) {
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
- res_1);
- } else if (w - j > 2) {
- xx_storel_32(&dst[i * dst_stride + j], res_0);
- xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
- } else {
- __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
- __m128i *const p_1 =
- (__m128i *)&dst[i * dst_stride + j + dst_stride];
- *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
- *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
- }
- }
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
-
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
}
}
}
diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c
index 5bf456e..cd5521e 100644
--- a/av1/common/x86/convolve_sse2.c
+++ b/av1/common/x86/convolve_sse2.c
@@ -75,72 +75,91 @@
return convolve(ss, coeffs);
}
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_y,
- const int subpel_y_qn) {
+void av1_convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ int subpel_y_qn) {
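+  // Filters the block in 8-pixel-wide columns, two output rows per
+  // iteration; callers route blocks narrower than 8 to the C path.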
const int fo_vert = filter_params_y->taps / 2 - 1;
const uint8_t *src_ptr = src - fo_vert * src_stride;
const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
- __m128i coeffs[4];
+ __m128i coeffs[6];
- prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
+ prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs);
- if (w <= 4) {
- __m128i s[8], src6, res, res_round, res16;
- uint32_t res_int;
- src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
- s[0] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
- s[1] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
- s[2] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
- s[3] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
- s[4] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
- s[5] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+ int j = 0;
+ do {
+ __m128i s[12], src10, res_lo, res_hi;
+ __m128i res_lo_round, res_hi_round, res16, res;
+ const uint8_t *data = &src_ptr[j];
+ src10 = _mm_loadl_epi64((__m128i *)(data + 10 * src_stride));
+ s[0] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+ s[1] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+ s[2] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+ s[3] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+ s[4] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+ s[5] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 5 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)));
+ s[6] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 6 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+ s[7] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 7 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)));
+ s[8] =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 8 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)));
+ s[9] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)), src10);
+
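+    // Slide a 12-row window down the column: each iteration loads two new
+    // rows, produces two output rows, and rotates the s[] row pairs.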
+ int i = 0;
do {
- s[6] = _mm_unpacklo_epi8(
- src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
- src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
- s[7] = _mm_unpacklo_epi8(
- _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
+ data = &src_ptr[i * src_stride + j];
+ s[10] = _mm_unpacklo_epi8(
+ src10, _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)));
+ src10 = _mm_loadl_epi64((__m128i *)(data + 12 * src_stride));
+ s[11] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)), src10);
- res = convolve_lo_y(s + 0, coeffs);
- res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
- res16 = _mm_packs_epi32(res_round, res_round);
- res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
+ res_lo = convolve_lo_y_12tap(s, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y_12tap(s, coeffs); // Filter high index pixels
- if (w == 2)
- *(uint16_t *)dst = (uint16_t)res_int;
- else
- *(uint32_t *)dst = res_int;
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
- src_ptr += src_stride;
- dst += dst_stride;
+ res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res = _mm_packus_epi16(res16, res16);
- res = convolve_lo_y(s + 1, coeffs);
- res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
- res16 = _mm_packs_epi32(res_round, res_round);
- res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ i++;
- if (w == 2)
- *(uint16_t *)dst = (uint16_t)res_int;
- else
- *(uint32_t *)dst = res_int;
+ res_lo = convolve_lo_y_12tap(s + 1, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y_12tap(s + 1, coeffs); // Filter high index pixels
- src_ptr += src_stride;
- dst += dst_stride;
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res = _mm_packus_epi16(res16, res16);
+
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ i++;
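+      // Shift the row-pair window down by two rows.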
s[0] = s[2];
s[1] = s[3];
@@ -148,71 +167,90 @@
s[3] = s[5];
s[4] = s[6];
s[5] = s[7];
- h -= 2;
- } while (h);
+ s[6] = s[8];
+ s[7] = s[9];
+ s[8] = s[10];
+ s[9] = s[11];
+ } while (i < h);
+ j += 8;
+ } while (j < w);
+}
+
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn) {
+ if (filter_params_y->taps > 8) {
+ if (w < 8) {
+ av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_qn);
+ } else {
+ av1_convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_qn);
+ }
} else {
- assert(!(w % 8));
- int j = 0;
- do {
- __m128i s[8], src6, res_lo, res_hi;
- __m128i res_lo_round, res_hi_round, res16, res;
- const uint8_t *data = &src_ptr[j];
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_vert * src_stride;
+ const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
+ __m128i coeffs[4];
- src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
+
+ if (w <= 4) {
+ __m128i s[8], src6, res, res_round, res16;
+ uint32_t res_int;
+ src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
s[0] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
- _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
s[1] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
- _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
s[2] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
- _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
s[3] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
- _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
s[4] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
- _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
s[5] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
- int i = 0;
do {
- data = &src_ptr[i * src_stride + j];
s[6] = _mm_unpacklo_epi8(
- src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
- src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+ src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
+ src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
s[7] = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
- res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels
- res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels
+ res = convolve_lo_y(s + 0, coeffs);
+ res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+ res16 = _mm_packs_epi32(res_round, res_round);
+ res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
- res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- res_hi_round =
- _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+ if (w == 2)
+ *(uint16_t *)dst = (uint16_t)res_int;
+ else
+ *(uint32_t *)dst = res_int;
- res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
- res = _mm_packus_epi16(res16, res16);
+ src_ptr += src_stride;
+ dst += dst_stride;
- _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
- i++;
+ res = convolve_lo_y(s + 1, coeffs);
+ res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+ res16 = _mm_packs_epi32(res_round, res_round);
+ res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
- res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels
- res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels
+ if (w == 2)
+ *(uint16_t *)dst = (uint16_t)res_int;
+ else
+ *(uint32_t *)dst = res_int;
- res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- res_hi_round =
- _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
- res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
- res = _mm_packus_epi16(res16, res16);
-
- _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
- i++;
+ src_ptr += src_stride;
+ dst += dst_stride;
s[0] = s[2];
s[1] = s[3];
@@ -220,9 +258,82 @@
s[3] = s[5];
s[4] = s[6];
s[5] = s[7];
- } while (i < h);
- j += 8;
- } while (j < w);
+ h -= 2;
+ } while (h);
+ } else {
+ assert(!(w % 8));
+ int j = 0;
+ do {
+ __m128i s[8], src6, res_lo, res_hi;
+ __m128i res_lo_round, res_hi_round, res16, res;
+ const uint8_t *data = &src_ptr[j];
+
+ src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+
+ int i = 0;
+ do {
+ data = &src_ptr[i * src_stride + j];
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+ src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+
+ res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels
+
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res = _mm_packus_epi16(res16, res16);
+
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ i++;
+
+ res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels
+
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res = _mm_packus_epi16(res16, res16);
+
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ i++;
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ } while (i < h);
+ j += 8;
+ } while (j < w);
+ }
}
}
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index 06f05e9..3ae4504 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -498,7 +498,7 @@
public:
void RunTest() {
for (int sub_y = 0; sub_y < 16; ++sub_y) {
- for (int filter = EIGHTTAP_REGULAR; filter < INTERP_FILTERS_ALL;
+ for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL;
++filter) {
InterpFilter f = static_cast<InterpFilter>(filter);
TestConvolve(sub_y, f);
@@ -506,6 +506,15 @@
}
}
+ public:
+ void SpeedTest() {
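+    // Loop bound is <= INTERP_FILTERS_ALL so that the encoder-only 12-tap
+    // MULTITAP_SHARP2 filter is timed as well.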
+ for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL;
+ ++filter) {
+ InterpFilter f = static_cast<InterpFilter>(filter);
+ TestConvolveSpeed(f, 10000);
+ }
+ }
+
private:
void TestConvolve(const int sub_y, const InterpFilter filter) {
const int width = GetParam().Block().Width();
@@ -515,17 +524,51 @@
av1_get_interp_filter_params_with_block_size(filter, height);
const uint8_t *input = FirstRandomInput8(GetParam());
DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
- av1_convolve_y_sr(input, width, reference, kOutputStride, width, height,
- filter_params_y, sub_y);
+ av1_convolve_y_sr_c(input, width, reference, kOutputStride, width, height,
+ filter_params_y, sub_y);
DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
filter_params_y, sub_y);
AssertOutputBufferEq(reference, test, width, height);
}
+
+ private:
+ void TestConvolveSpeed(const InterpFilter filter, const int num_iters) {
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size(filter, height);
+ const uint8_t *input = FirstRandomInput8(GetParam());
+ DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < num_iters; ++i) {
+ av1_convolve_y_sr_c(input, width, reference, kOutputStride, width, height,
+ filter_params_y, 0);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < num_iters; ++i) {
+ GetParam().TestFunction()(input, width, test, kOutputStride, width,
+ height, filter_params_y, 0);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    printf("%d %3dx%-3d:%7.2f/%7.2fus (%3.2f)\n", filter, width, height,
+           time1, time2, time1 / time2);
+ }
};
TEST_P(AV1ConvolveYTest, RunTest) { RunTest(); }
+TEST_P(AV1ConvolveYTest, DISABLED_SpeedTest) { SpeedTest(); }
+
INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYTest,
BuildLowbdParams(av1_convolve_y_sr_c));