Add SSE2 for resize_vert_dir()

This CL adds an SSE2 implementation of the resize_vert_dir()
function. A unit test for it is also added.

Resolution       Average Speedup w.r.t. C
 3840x2160              4.47x
 2560x1440              5.16x
 1920x1080              5.27x
 1280x720               5.83x
 640x480                6.16x
 640x360                6.55x
 256x256                7.69x

This is a bit-exact change.

Change-Id: I23ade35421ff0aff63d2f0be2fafbad5b6f699c3
diff --git a/av1/av1.cmake b/av1/av1.cmake
index c057856..dcc19b7 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -266,6 +266,7 @@
             "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c"
             "${AOM_ROOT}/av1/common/x86/convolve_sse2.c"
             "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
+            "${AOM_ROOT}/av1/common/x86/resize_sse2.c"
             "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c")
 
 list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 7d917eb..3973d91 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -555,7 +555,7 @@
 }
 
 add_proto qw/bool resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col";
-specialize qw/resize_vert_dir avx2/;
+specialize qw/resize_vert_dir sse2 avx2/;
 
 add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
 specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/;
diff --git a/av1/common/x86/resize_sse2.c b/av1/common/x86/resize_sse2.c
new file mode 100644
index 0000000..9714ecf
--- /dev/null
+++ b/av1/common/x86/resize_sse2.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/resize.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+#define PROCESS_RESIZE_Y_WD8                                           \
+  /* ah0 ah1 ... ah7 */                                                \
+  const __m128i AH = _mm_add_epi16(l0, l7);                            \
+  /* bg0 bg1 ... bg7 */                                                \
+  const __m128i BG = _mm_add_epi16(l1, l6);                            \
+  /* cf0 cf1 ... cf7 */                                                \
+  const __m128i CF = _mm_add_epi16(l2, l5);                            \
+  /* de0 de1 ... de7 */                                                \
+  const __m128i DE = _mm_add_epi16(l3, l4);                            \
+                                                                       \
+  /* ah0 bg0 ... ah3 bg3 */                                            \
+  const __m128i AHBG_low = _mm_unpacklo_epi16(AH, BG);                 \
+  /* cf0 de0 ... cf3 de3 */                                            \
+  const __m128i CFDE_low = _mm_unpacklo_epi16(CF, DE);                 \
+                                                                       \
+  /* ah4 bg4... ah7 bg7 */                                             \
+  const __m128i AHBG_hi = _mm_unpackhi_epi16(AH, BG);                  \
+  /* cf4 de4... cf7 de7 */                                             \
+  const __m128i CFDE_hi = _mm_unpackhi_epi16(CF, DE);                  \
+                                                                       \
+  /* r00 r01 r02 r03 */                                                \
+  const __m128i r00 = _mm_madd_epi16(AHBG_low, coeffs_y[0]);           \
+  const __m128i r01 = _mm_madd_epi16(CFDE_low, coeffs_y[1]);           \
+  __m128i r0 = _mm_add_epi32(r00, r01);                                \
+  /* r04 r05 r06 r07 */                                                \
+  const __m128i r10 = _mm_madd_epi16(AHBG_hi, coeffs_y[0]);            \
+  const __m128i r11 = _mm_madd_epi16(CFDE_hi, coeffs_y[1]);            \
+  __m128i r1 = _mm_add_epi32(r10, r11);                                \
+                                                                       \
+  r0 = _mm_add_epi32(r0, round_const_bits);                            \
+  r1 = _mm_add_epi32(r1, round_const_bits);                            \
+  r0 = _mm_sra_epi32(r0, round_shift_bits);                            \
+  r1 = _mm_sra_epi32(r1, round_shift_bits);                            \
+                                                                       \
+  /* r00 ... r07 (8 values of each 16bit) */                           \
+  const __m128i res_16b = _mm_packs_epi32(r0, r1);                     \
+  /* r00 ... r07 | r00 ... r07 (16 values of each 8bit) */             \
+  const __m128i res_8b0 = _mm_packus_epi16(res_16b, res_16b);          \
+                                                                       \
+  __m128i res = _mm_min_epu8(res_8b0, clip_pixel);                     \
+  res = _mm_max_epu8(res, zero);                                       \
+  _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + j], res); \
+  /* Slide the window down by 2 input rows. */                         \
+  l0 = l2;                                                             \
+  l1 = l3;                                                             \
+  l2 = l4;                                                             \
+  l3 = l5;                                                             \
+  l4 = l6;                                                             \
+  l5 = l7;                                                             \
+  data += 2 * stride;
+
+static INLINE void prepare_filter_coeffs(const int16_t *filter,
+                                         __m128i *const coeffs /* [2] */) {
+  // Load the 4 taps into the low 64 bits: f0 f1 f2 f3 0 0 0 0
+  const __m128i sym_even_filter = _mm_loadl_epi64((const __m128i *)filter);
+
+  // Swap each 16-bit pair in the low half: f1 f0 f3 f2 0 0 0 0
+  const __m128i tmp1 = _mm_shufflelo_epi16(sym_even_filter, 0xb1);
+
+  // Broadcast 32-bit lane 1: f3 f2 f3 f2 ... (multiplies the AH/BG pairs)
+  coeffs[0] = _mm_shuffle_epi32(tmp1, 0x55);
+  // Broadcast 32-bit lane 0: f1 f0 f1 f0 ... (multiplies the CF/DE pairs)
+  coeffs[1] = _mm_shuffle_epi32(tmp1, 0x00);
+}
+
+bool resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride,
+                          int height, int height2, int stride, int start_col) {
+  // For the GM tool, the input layer height or width is assured to be an even
+  // number. Hence the function 'down2_symodd()' is not invoked and SIMD
+  // optimization of the same is not implemented.
+  // When the input height is less than 8 and even, the potential input
+  // heights are limited to 2, 4, or 6. These scenarios require separate
+  // handling due to padding requirements. Invoking the C function here will
+  // eliminate the need for conditional statements within the subsequent SIMD
+  // code to manage these cases.
+  if (height & 1 || height < 8) {
+    return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
+                             stride, start_col);
+  }
+
+  __m128i coeffs_y[2];
+  const int bits = FILTER_BITS;
+  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const uint8_t max_pixel = 255;
+  const __m128i clip_pixel = _mm_set1_epi8(max_pixel);
+  const __m128i zero = _mm_setzero_si128();
+  prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);
+  // Tail width is measured from start_col so the 8-wide loop never overruns.
+  const int remain_col = (stride - start_col) % 8;
+
+  for (int j = start_col; j < stride - remain_col; j += 8) {
+    uint8_t *data = &intbuf[j];
+    // d0 ... d7
+    const __m128i l8_3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride));
+    // Padding top 3 rows with the last available row at the top.
+    // a0 ... a7
+    const __m128i l8_0 = l8_3;
+    // b0 ... b7
+    const __m128i l8_1 = l8_3;
+    // c0 ... c7
+    const __m128i l8_2 = l8_3;
+    // e0 ... e7
+    const __m128i l8_4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride));
+    // f0 ... f7
+    const __m128i l8_5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride));
+
+    // Convert to 16bit as addition of 2 source pixel crosses 8 bit.
+    __m128i l0 = _mm_unpacklo_epi8(l8_0, zero);  // A(128bit) = a0 - a7(16 bit)
+    __m128i l1 = _mm_unpacklo_epi8(l8_1, zero);  // B(128bit) = b0 - b7(16 bit)
+    __m128i l2 = _mm_unpacklo_epi8(l8_2, zero);  // C(128bit) = c0 - c7(16 bit)
+    __m128i l3 = _mm_unpacklo_epi8(l8_3, zero);  // D(128bit) = d0 - d7(16 bit)
+    __m128i l4 = _mm_unpacklo_epi8(l8_4, zero);  // E(128bit) = e0 - e7(16 bit)
+    __m128i l5 = _mm_unpacklo_epi8(l8_5, zero);  // F(128bit) = f0 - f7(16 bit)
+
+    // Increment the pointer such that the loading starts from row G.
+    data = data + 3 * stride;
+    // The core vertical SIMD processes 2 input rows simultaneously to generate
+    // output corresponding to 1 row. To streamline the core loop and eliminate
+    // the need for conditional checks, the remaining 4 rows are processed
+    // separately.
+    for (int i = 0; i < height - 4; i += 2) {
+      // g0 ... g7
+      __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
+      // h0 ... h7
+      __m128i l8_7 = _mm_loadl_epi64((__m128i *)(data + stride));
+      __m128i l6 = _mm_unpacklo_epi8(l8_6, zero);  // G(128bit):g0-g7(16b)
+      __m128i l7 = _mm_unpacklo_epi8(l8_7, zero);  // H(128bit):h0-h7(16b)
+
+      PROCESS_RESIZE_Y_WD8
+    }
+
+    __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
+    __m128i l6 = _mm_unpacklo_epi8(l8_6, zero);
+    // Process the last 4 input rows here.
+    for (int i = height - 4; i < height; i += 2) {
+      __m128i l7 = l6;
+      PROCESS_RESIZE_Y_WD8
+    }
+  }
+
+  if (remain_col)
+    return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
+                             stride, stride - remain_col);
+
+  return true;
+}
diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc
index 8891304..cab6fe3 100644
--- a/test/frame_resize_test.cc
+++ b/test/frame_resize_test.cc
@@ -154,4 +154,11 @@
                        ::testing::ValuesIn(kFrameDim)));
 #endif
 
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AV1ResizeYTest,
+    ::testing::Combine(::testing::Values(resize_vert_dir_sse2),
+                       ::testing::ValuesIn(kFrameDim)));
+#endif
+
 }  // namespace