SSE optimization of motion compensation functions
SSSE3 optimizations of av1_highbd_convolve_x_sr_c and
av1_highbd_convolve_y_sr_c have been added.
av1_highbd_convolve_x_sr_ssse3 - ~ 8.3 times faster than its
C implementation.
av1_highbd_convolve_y_sr_ssse3 - ~ 11.3 times faster than its
C implementation.
code cleanup of RunSpeedTest for AV1HighbdConvolve2DSrTest.
Change-Id: Ic6386f91c649d5d5513909f2dc2352eec338fc08
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index b813736..1949b25 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -180,6 +180,10 @@
"${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c")
+ set(AOM_DSP_COMMON_INTRIN_SSSE3
+ ${AOM_DSP_COMMON_INTRIN_SSSE3}
+ "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c")
+
set(AOM_DSP_COMMON_INTRIN_AVX2
${AOM_DSP_COMMON_INTRIN_AVX2}
"${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
diff --git a/aom_dsp/x86/highbd_convolve_ssse3.c b/aom_dsp/x86/highbd_convolve_ssse3.c
new file mode 100644
index 0000000..41f4d5f
--- /dev/null
+++ b/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4,
+ const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+ (void)conv_params;
+
+ assert(conv_params->round_0 <= FILTER_BITS);
+ assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+ ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+ __m128i s[16], coeffs_y[4];
+
+ const int bits = FILTER_BITS;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm_unpackhi_epi16(s0, s1);
+ s[5] = _mm_unpackhi_epi16(s2, s3);
+ s[6] = _mm_unpackhi_epi16(s4, s5);
+
+ s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+ s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+ s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+ s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+ s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+ s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[7] = _mm_unpackhi_epi16(s6, s7);
+
+ s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+ s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+ const __m128i res_a0 = convolve(s, coeffs_y);
+ __m128i res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
+
+ const __m128i res_a1 = convolve(s + 8, coeffs_y);
+ __m128i res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m128i res_b0 = convolve(s + 4, coeffs_y);
+ __m128i res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
+
+ const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+ __m128i res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
+
+ __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+ res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+ res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+ __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+ res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+ res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16bit1);
+ } else if (w == 4) {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_a_round1);
+ } else {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ *((uint32_t *)(&dst[i * dst_stride + j])) =
+ _mm_cvtsi128_si32(res_a_round0);
+
+ *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
+ _mm_cvtsi128_si32(res_a_round1);
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ s[0 + 8] = s[1 + 8];
+ s[1 + 8] = s[2 + 8];
+ s[2 + 8] = s[3 + 8];
+
+ s[4 + 8] = s[5 + 8];
+ s[5 + 8] = s[6 + 8];
+ s[6 + 8] = s[7 + 8];
+
+ s6 = s8;
+ }
+ }
+ }
+}
+
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4,
+ const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ int i, j;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;
+ (void)subpel_y_q4;
+ (void)filter_params_y;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ __m128i s[4], coeffs_x[4];
+
+ const __m128i round_const_x =
+ _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const int bits = FILTER_BITS - conv_params->round_0;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < h; i += 1) {
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+ // even pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+ __m128i res_even = convolve(s, coeffs_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+ __m128i res_odd = convolve(s, coeffs_x);
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
+
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
+ round_shift_bits);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
+ round_shift_bits);
+
+ __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+ __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+ __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+ res = _mm_min_epi16(res, clip_pixel);
+ res = _mm_max_epi16(res, zero);
+
+ if (w - j > 4) {
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ } else if (w == 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
+ } else {
+ *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
+ }
+ }
+ }
+ }
+}
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 37c3a28..3a2f879 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -524,12 +524,12 @@
add_proto qw/void av1_highbd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-specialize qw/av1_highbd_convolve_x_sr c avx2/;
+specialize qw/av1_highbd_convolve_x_sr c ssse3 avx2/;
add_proto qw/void av1_highbd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-specialize qw/av1_highbd_convolve_y_sr c avx2/;
+specialize qw/av1_highbd_convolve_y_sr c ssse3 avx2/;
add_proto qw/void av1_highbd_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd";
specialize qw/av1_highbd_convolve_rounding avx2/;
diff --git a/test/av1_convolve_2d_test.cc b/test/av1_convolve_2d_test.cc
index 927b8ff..d18840f 100644
--- a/test/av1_convolve_2d_test.cc
+++ b/test/av1_convolve_2d_test.cc
@@ -220,6 +220,12 @@
INSTANTIATE_TEST_CASE_P(SSSE3, AV1HighbdConvolve2DSrTest,
libaom_test::AV1HighbdConvolve2D::BuildParams(
av1_highbd_convolve_2d_sr_ssse3, 1, 1, 0));
+INSTANTIATE_TEST_CASE_P(SSSE3_X, AV1HighbdConvolve2DSrTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_convolve_x_sr_ssse3, 1, 0, 0));
+INSTANTIATE_TEST_CASE_P(SSSE3_Y, AV1HighbdConvolve2DSrTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_convolve_y_sr_ssse3, 0, 1, 0));
#endif
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdConvolve2DSrTest,
diff --git a/test/av1_convolve_2d_test_util.cc b/test/av1_convolve_2d_test_util.cc
index 1c22525..d87b041 100644
--- a/test/av1_convolve_2d_test_util.cc
+++ b/test/av1_convolve_2d_test_util.cc
@@ -489,8 +489,8 @@
highbd_convolve_2d_func test_impl) {
const int w = kMaxSize, h = kMaxSize;
const int bd = GET_PARAM(0);
- // const int has_subx = GET_PARAM(2);
- // const int has_suby = GET_PARAM(3);
+ const int has_subx = GET_PARAM(2);
+ const int has_suby = GET_PARAM(3);
const int is_compound = GET_PARAM(4);
int hfilter, vfilter, subx, suby;
uint16_t input[kMaxSize * kMaxSize];
@@ -500,42 +500,48 @@
for (int i = 0; i < h; ++i)
for (int j = 0; j < w; ++j)
input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
- for (int i = 0; i < MAX_SB_SQUARE; ++i) output[i] = rnd_.Rand31();
hfilter = EIGHTTAP_REGULAR;
vfilter = EIGHTTAP_REGULAR;
int do_average = 0;
+ const int offset_r = 3;
+ const int offset_c = 3;
+ subx = 0;
+ suby = 0;
+
+ InterpFilterParams filter_params_x =
+ av1_get_interp_filter_params((InterpFilter)hfilter);
+ InterpFilterParams filter_params_y =
+ av1_get_interp_filter_params((InterpFilter)vfilter);
+
+ ConvolveParams conv_params =
+ get_conv_params_no_round(0, do_average, 0, NULL, 0, 0, bd);
+
for (int block_idx = BLOCK_4X4; block_idx < BLOCK_SIZES_ALL; ++block_idx) {
// Make sure that sizes 2xN and Nx2 are also tested for chroma.
+ const int num_sizes =
+ (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4)
+ ? 2
+ : 1;
- const int out_w = block_size_wide[block_idx];
- const int out_h = block_size_high[block_idx];
+ for (int shift = 0; shift < num_sizes; ++shift) { // luma and chroma
+ const int out_w = block_size_wide[block_idx] >> shift;
+ const int out_h = block_size_high[block_idx] >> shift;
+ const int num_loops = 1000000000 / (out_w + out_h);
- InterpFilterParams filter_params_x =
- av1_get_interp_filter_params((InterpFilter)hfilter);
- InterpFilterParams filter_params_y =
- av1_get_interp_filter_params((InterpFilter)vfilter);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(input + offset_r * w + offset_c, w, output, MAX_SB_SIZE,
+ out_w, out_h, &filter_params_x, &filter_params_y, subx, suby,
+ &conv_params, bd);
- ConvolveParams conv_params =
- get_conv_params_no_round(0, do_average, 0, NULL, 0, 0, bd);
-
- const int offset_r = 3;
- const int offset_c = 3;
- subx = 0;
- suby = 0;
- const int num_loops = 1000000000 / (out_w + out_h);
- aom_usec_timer timer;
- aom_usec_timer_start(&timer);
- for (int i = 0; i < num_loops; ++i)
- test_impl(input + offset_r * w + offset_c, w, output, MAX_SB_SIZE, out_w,
- out_h, &filter_params_x, &filter_params_y, subx, suby,
- &conv_params, bd);
-
- aom_usec_timer_mark(&timer);
- const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
- printf("convolve %3dx%-3d: %7.2f ns\n", out_w, out_h,
- 1000.0 * elapsed_time / num_loops);
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("%d,%d convolve %3dx%-3d: %7.2f ns\n", has_subx, has_suby, out_w,
+ out_h, 1000.0 * elapsed_time / num_loops);
+ }
}
}