Add SSE2 for resize_vert_dir()
This CL adds an SSE2 implementation of the resize_vert_dir()
function, along with a unit test for it.
Resolution Average Scaling w.r.t C
3840x2160 4.47x
2560x1440 5.16x
1920x1080 5.27x
1280x720 5.83x
640x480 6.16x
640x360 6.55x
256x256 7.69x
This is a bit-exact change.
Change-Id: I23ade35421ff0aff63d2f0be2fafbad5b6f699c3
diff --git a/av1/av1.cmake b/av1/av1.cmake
index c057856..dcc19b7 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -266,6 +266,7 @@
"${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c"
"${AOM_ROOT}/av1/common/x86/convolve_sse2.c"
"${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/resize_sse2.c"
"${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c")
list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 7d917eb..3973d91 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -555,7 +555,7 @@
}
add_proto qw/bool resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col";
-specialize qw/resize_vert_dir avx2/;
+specialize qw/resize_vert_dir sse2 avx2/;
add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/;
diff --git a/av1/common/x86/resize_sse2.c b/av1/common/x86/resize_sse2.c
new file mode 100644
index 0000000..9714ecf
--- /dev/null
+++ b/av1/common/x86/resize_sse2.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/resize.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+#define PROCESS_RESIZE_Y_WD8 \
+ /* Filters 8 columns of rows l0..l7 into one output row. ah0 ah1 ... ah7 */ \
+ const __m128i AH = _mm_add_epi16(l0, l7); \
+ /* bg0 bg1 ... bg7 */ \
+ const __m128i BG = _mm_add_epi16(l1, l6); \
+ /* cf0 cf1 ... cf7 */ \
+ const __m128i CF = _mm_add_epi16(l2, l5); \
+ /* de0 de1 ... de7 */ \
+ const __m128i DE = _mm_add_epi16(l3, l4); \
+ \
+ /* ah0 bg0 ... ah3 bg3 */ \
+ const __m128i AHBG_low = _mm_unpacklo_epi16(AH, BG); \
+ /* cf0 de0 ... cf3 de3 */ \
+ const __m128i CFDE_low = _mm_unpacklo_epi16(CF, DE); \
+ \
+ /* ah4 bg4 ... ah7 bg7 */ \
+ const __m128i AHBG_hi = _mm_unpackhi_epi16(AH, BG); \
+ /* cf4 de4 ... cf7 de7 */ \
+ const __m128i CFDE_hi = _mm_unpackhi_epi16(CF, DE); \
+ \
+ /* 32-bit filtered sums for columns 0-3: r00 r01 r02 r03 */ \
+ const __m128i r00 = _mm_madd_epi16(AHBG_low, coeffs_y[0]); \
+ const __m128i r01 = _mm_madd_epi16(CFDE_low, coeffs_y[1]); \
+ __m128i r0 = _mm_add_epi32(r00, r01); \
+ /* 32-bit filtered sums for columns 4-7: r04 r05 r06 r07 */ \
+ const __m128i r10 = _mm_madd_epi16(AHBG_hi, coeffs_y[0]); \
+ const __m128i r11 = _mm_madd_epi16(CFDE_hi, coeffs_y[1]); \
+ __m128i r1 = _mm_add_epi32(r10, r11); \
+ /* Add the rounding constant, then arithmetic-shift right. */ \
+ r0 = _mm_add_epi32(r0, round_const_bits); \
+ r1 = _mm_add_epi32(r1, round_const_bits); \
+ r0 = _mm_sra_epi32(r0, round_shift_bits); \
+ r1 = _mm_sra_epi32(r1, round_shift_bits); \
+ \
+ /* r00 ... r07 (8 values of each 16bit) */ \
+ const __m128i res_16b = _mm_packs_epi32(r0, r1); \
+ /* r00 ... r07 | r00 ... r07 (16 values of each 8bit) */ \
+ const __m128i res_8b0 = _mm_packus_epi16(res_16b, res_16b); \
+ /* Clamp to [zero, clip_pixel]; packus above already saturates to 8 bit. */ \
+ __m128i res = _mm_min_epu8(res_8b0, clip_pixel); \
+ res = _mm_max_epu8(res, zero); \
+ _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + j], res); \
+ /* Slide the row window down by two input rows; l6/l7 are set by the caller. */ \
+ l0 = l2; \
+ l1 = l3; \
+ l2 = l4; \
+ l3 = l5; \
+ l4 = l6; \
+ l5 = l7; \
+ data += 2 * stride;
+
+static INLINE void prepare_filter_coeffs(const int16_t *filter,
+                                         __m128i *const coeffs /* [2] */) {
+  // Load the four 16-bit taps into the low 64 bits: f0 f1 f2 f3 0 0 0 0
+  const __m128i taps = _mm_loadl_epi64((const __m128i *)filter);
+
+  // Swap the two taps inside each 32-bit pair: f1 f0 f3 f2 0 0 0 0
+  const __m128i swapped = _mm_shufflelo_epi16(taps, _MM_SHUFFLE(2, 3, 0, 1));
+
+  // Broadcast 32-bit lane 0: f1 f0 f1 f0 ... (paired with the CF/DE sums)
+  coeffs[1] = _mm_shuffle_epi32(swapped, _MM_SHUFFLE(0, 0, 0, 0));
+  // Broadcast 32-bit lane 1: f3 f2 f3 f2 ... (paired with the AH/BG sums)
+  coeffs[0] = _mm_shuffle_epi32(swapped, _MM_SHUFFLE(1, 1, 1, 1));
+}
+
+bool resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride,
+                          int height, int height2, int stride, int start_col) {
+  // Downscales columns [start_col, stride) of 'intbuf' vertically by 2 into
+  // 'output' (height / 2 rows); 'stride' is the width and pitch of 'intbuf'.
+  // For the GM tool, the input layer height or width is assured to be an even
+  // number, so 'down2_symodd()' is never invoked and has no SIMD version.
+  // Even heights below 8 (i.e. 2, 4 or 6) require separate handling due to
+  // padding requirements. Handing odd and small heights to the C function
+  // keeps conditional statements for these cases out of the SIMD code below.
+  // Returns the C function's result whenever it is called, otherwise true.
+  if (height & 1 || height < 8) {
+    return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
+                             stride, start_col);
+  }
+
+  __m128i coeffs_y[2];
+  const int bits = FILTER_BITS;
+  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const uint8_t max_pixel = 255;
+  const __m128i clip_pixel = _mm_set1_epi8(max_pixel);
+  const __m128i zero = _mm_setzero_si128();
+  prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);
+  // Leftover columns are counted from 'start_col', where the SIMD loop begins.
+  const int remain_col = (stride - start_col) % 8;
+
+  for (int j = start_col; j < stride - remain_col; j += 8) {
+    uint8_t *data = &intbuf[j];
+    // d0 ... d7
+    const __m128i l8_3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride));
+    // Pad the 3 rows above the top by replicating the first available row.
+    // a0 ... a7
+    const __m128i l8_0 = l8_3;
+    // b0 ... b7
+    const __m128i l8_1 = l8_3;
+    // c0 ... c7
+    const __m128i l8_2 = l8_3;
+    // e0 ... e7
+    const __m128i l8_4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride));
+    // f0 ... f7
+    const __m128i l8_5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride));
+
+    // Convert to 16bit as addition of 2 source pixel crosses 8 bit.
+    __m128i l0 = _mm_unpacklo_epi8(l8_0, zero);  // A(128bit) = a0 - a7(16 bit)
+    __m128i l1 = _mm_unpacklo_epi8(l8_1, zero);  // B(128bit) = b0 - b7(16 bit)
+    __m128i l2 = _mm_unpacklo_epi8(l8_2, zero);  // C(128bit) = c0 - c7(16 bit)
+    __m128i l3 = _mm_unpacklo_epi8(l8_3, zero);  // D(128bit) = d0 - d7(16 bit)
+    __m128i l4 = _mm_unpacklo_epi8(l8_4, zero);  // E(128bit) = e0 - e7(16 bit)
+    __m128i l5 = _mm_unpacklo_epi8(l8_5, zero);  // F(128bit) = f0 - f7(16 bit)
+
+    // Increment the pointer such that the loading starts from row G.
+    data = data + 3 * stride;
+    // The core vertical SIMD processes 2 input rows simultaneously to generate
+    // output corresponding to 1 row. To streamline the core loop and eliminate
+    // the need for conditional checks, the remaining 4 rows are processed
+    // separately.
+    for (int i = 0; i < height - 4; i += 2) {
+      // g0 ... g7
+      __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
+      // h0 ... h7
+      __m128i l8_7 = _mm_loadl_epi64((__m128i *)(data + stride));
+      __m128i l6 = _mm_unpacklo_epi8(l8_6, zero);  // G(128bit):g0-g7(16b)
+      __m128i l7 = _mm_unpacklo_epi8(l8_7, zero);  // H(128bit):h0-h7(16b)
+
+      PROCESS_RESIZE_Y_WD8
+    }
+
+    // Process the last 4 input rows, replicating the bottom row as padding.
+    __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
+    __m128i l6 = _mm_unpacklo_epi8(l8_6, zero);
+    for (int i = height - 4; i < height; i += 2) {
+      __m128i l7 = l6;
+      PROCESS_RESIZE_Y_WD8
+    }
+  }
+
+  if (remain_col)
+    return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
+                             stride, stride - remain_col);
+
+  return true;
+}
diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc
index 8891304..cab6fe3 100644
--- a/test/frame_resize_test.cc
+++ b/test/frame_resize_test.cc
@@ -154,4 +154,11 @@
::testing::ValuesIn(kFrameDim)));
#endif
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AV1ResizeYTest,
+ ::testing::Combine(::testing::Values(resize_vert_dir_sse2),
+ ::testing::ValuesIn(kFrameDim)));
+#endif
+
} // namespace