/*
* Copyright (c) 2024, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include <immintrin.h>
#include <string.h>
#include "config/av1_rtcd.h"
#include "av1/common/resize.h"
#include "aom_dsp/x86/synonyms.h"
#define CAST_HI(x) _mm256_castsi128_si256(x)
#define CAST_LOW(x) _mm256_castsi256_si128(x)
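// Note: _mm256_castsi128_si256() leaves the upper 128 bits of the result
// undefined; every use below immediately overwrites them via
// _mm256_permute2x128_si256(), so this is safe.

// PROCESS_RESIZE_Y_WD16 filters the sliding window of input rows held in s[]
// across 16 columns and produces two 16-pixel output rows in res_8bit0.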
#define PROCESS_RESIZE_Y_WD16 \
const int idx1 = AOMMIN(height - 1, i + 5); \
const int idx2 = AOMMIN(height - 1, i + 6); \
l6 = l10; \
l7 = l11; \
l8 = _mm_loadu_si128((__m128i *)(data + idx1 * stride)); \
l9 = _mm_loadu_si128((__m128i *)(data + idx2 * stride)); \
\
/* g0... g15 | i0... i15 */ \
const __m256i s68 = \
_mm256_permute2x128_si256(CAST_HI(l6), CAST_HI(l8), 0x20); \
/* h0... h15 | j0... j15 */ \
const __m256i s79 = \
_mm256_permute2x128_si256(CAST_HI(l7), CAST_HI(l9), 0x20); \
\
  /* g0h0... g7h7 | i0j0... i7j7 */                                          \
s[3] = _mm256_unpacklo_epi8(s68, s79); \
  /* g8h8... g15h15 | i8j8... i15j15 */                                      \
s[8] = _mm256_unpackhi_epi8(s68, s79); \
\
__m256i res_out[2] = { 0 }; \
resize_y_convolve(s, coeffs_y, res_out); \
\
/* r00... r07 */ \
__m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
/* r20... r27 */ \
__m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \
\
res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \
res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \
\
__m256i res_out_b[2] = { 0 }; \
resize_y_convolve(s + 5, coeffs_y, res_out_b); \
\
/* r08... r015 */ \
__m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \
/* r28... r215 */ \
__m256i res_b_round_2 = _mm256_add_epi32(res_out_b[1], round_const_bits); \
res_b_round_1 = _mm256_sra_epi32(res_b_round_1, round_shift_bits); \
res_b_round_2 = _mm256_sra_epi32(res_b_round_2, round_shift_bits); \
\
/* r00... r03 r20... r23 | r04... r07 r24... r27 */ \
__m256i res_8bit0 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \
  /* r08... r011 r28... r211 | r012... r015 r212... r215 */                  \
__m256i res_8bit1 = _mm256_packus_epi32(res_b_round_1, res_b_round_2); \
/* r00... r07 | r20... r27 */ \
res_8bit0 = _mm256_permute4x64_epi64(res_8bit0, 0xd8); \
/* r08... r015 | r28... r215 */ \
res_8bit1 = _mm256_permute4x64_epi64(res_8bit1, 0xd8); \
/* r00... r015 | r20... r215 */ \
res_8bit1 = _mm256_packus_epi16(res_8bit0, res_8bit1); \
res_8bit0 = _mm256_min_epu8(res_8bit1, clip_pixel); \
res_8bit0 = _mm256_max_epu8(res_8bit0, zero);
#define PROCESS_RESIZE_Y_WD8 \
const int idx1 = AOMMIN(height - 1, i + 5); \
const int idx2 = AOMMIN(height - 1, i + 6); \
l6 = l10; \
l7 = l11; \
l8 = _mm_loadl_epi64((__m128i *)(data + idx1 * stride)); \
l9 = _mm_loadl_epi64((__m128i *)(data + idx2 * stride)); \
\
/* g0h0... g7h7 */ \
s67 = _mm_unpacklo_epi8(l6, l7); \
/* i0j0...i7j7 */ \
__m128i s89 = _mm_unpacklo_epi8(l8, l9); \
\
  /* g0h0...g7h7 | i0j0...i7j7 */                                            \
s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20); \
\
__m256i res_out[2] = { 0 }; \
resize_y_convolve(s, coeffs_y, res_out); \
\
/* r00... r07 */ \
__m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
/* r20...r27 */ \
__m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \
res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \
res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \
\
/* r00...r03 r20...r23 | r04...r07 r24...r27 */ \
res_a_round_1 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \
/* r00...r07 | r20...r27 */ \
res_a_round_1 = _mm256_permute4x64_epi64(res_a_round_1, 0xd8); \
res_a_round_1 = _mm256_packus_epi16(res_a_round_1, res_a_round_1); \
res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel); \
res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero);
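// Convolves the interleaved row pairs in s[0]..s[3] with the byte-pair taps
// in coeffs[0]..coeffs[3]. _mm256_maddubs_epi16() multiplies the unsigned
// pixel pairs by the signed coefficient pairs and sums adjacent products
// into 16-bit lanes.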
static INLINE void resize_y_convolve(const __m256i *const s,
const __m256i *const coeffs,
__m256i *res_out) {
const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]);
const __m256i res_3 = _mm256_maddubs_epi16(s[3], coeffs[3]);
const __m256i dst_0 = _mm256_add_epi16(res_0, res_1);
const __m256i dst_1 = _mm256_add_epi16(res_2, res_3);
  // The sum of the convolve operation can exceed the signed 16-bit range, so
  // the remaining additions are performed in 32 bits.
const __m256i dst_00 = _mm256_cvtepi16_epi32(CAST_LOW(dst_0));
const __m256i dst_01 =
_mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_0, 1));
const __m256i dst_10 = _mm256_cvtepi16_epi32(CAST_LOW(dst_1));
const __m256i dst_11 =
_mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_1, 1));
res_out[0] = _mm256_add_epi32(dst_00, dst_10);
res_out[1] = _mm256_add_epi32(dst_01, dst_11);
}
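// Expands the 4-tap half filter into the byte-pair coefficients expected by
// _mm256_maddubs_epi16(). The full 8-tap symmetric even filter applies taps
// (f3, f2, f1, f0, f0, f1, f2, f3) to eight consecutive rows (see
// down2_symeven() in av1/common/resize.c), so the row pairs convolved in
// resize_y_convolve() need the tap pairs (f3, f2), (f1, f0), (f0, f1) and
// (f2, f3) in coeffs[0]..coeffs[3] respectively.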
static INLINE void prepare_filter_coeffs(const int16_t *filter,
__m256i *const coeffs /* [4] */) {
// f0 f1 f2 f3 x x x x
const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter);
// f0 f1 f2 f3 f0 f1 f2 f3
const __m128i tmp0 = _mm_shuffle_epi32(sym_even_filter, 0x44);
// f0 f1 f2 f3 f1 f0 f3 f2
const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, 0xb1);
const __m128i filter_8bit = _mm_packs_epi16(tmp1, tmp1);
// f0 f1 f0 f1 ..
coeffs[2] = _mm256_broadcastw_epi16(filter_8bit);
// f2 f3 f2 f3 ..
coeffs[3] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 2));
// f3 f2 f3 f2 ..
coeffs[0] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 6));
// f1 f0 f1 f0 ..
coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4));
}
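// Vertical 2:1 downscaling along the column direction: the main loop handles
// 16 columns at a time, a second pass handles a remaining block of 8 columns,
// and any leftover columns (fewer than 8) fall back to resize_vert_dir_c().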
bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
int height, int height2, int stride, int start_col) {
assert(start_col <= stride);
  // For the GM tool, the input layer height or width is guaranteed to be an
  // even number. Hence the function 'down2_symodd()' is not invoked and no
  // SIMD version of it is implemented.

  // When the input height is even but less than 8, the possible heights are
  // limited to 2, 4, or 6. These cases require separate handling due to the
  // padding requirements. Invoking the C function here eliminates the need
  // for conditional statements within the subsequent SIMD code to manage
  // these cases.
if (height & 1 || height < 8) {
return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
stride, start_col);
}
__m256i s[10], coeffs_y[4];
const int bits = FILTER_BITS;
const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
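  // Each output pixel is computed as (sum + (1 << (bits - 1))) >> bits,
  // i.e. a rounding division by (1 << FILTER_BITS).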
const uint8_t max_pixel = 255;
const __m256i clip_pixel = _mm256_set1_epi8(max_pixel);
const __m256i zero = _mm256_setzero_si256();
prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);
const int num_col16 = stride / 16;
int remain_col = stride % 16;
  // The core vertical SIMD loop processes 4 input rows at a time to generate
  // 2 output rows. To streamline the core loop and eliminate the need for
  // conditional checks, the remaining rows (4 or 6) are processed separately.
const int remain_row = (height % 4 == 0) ? 4 : 6;
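  // For example, height = 16 gives remain_row = 4 and height = 18 gives
  // remain_row = 6, so the main loop always stops at least 4 rows short of
  // the bottom.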
for (int j = start_col; j < stride - remain_col; j += 16) {
const uint8_t *data = &intbuf[j];
const __m128i l3 = _mm_loadu_si128((__m128i *)(data + 0 * stride));
    // Pad the top 3 rows by replicating the first available row.
const __m128i l0 = l3;
const __m128i l1 = l3;
const __m128i l2 = l3;
const __m128i l4 = _mm_loadu_si128((__m128i *)(data + 1 * stride));
__m128i l6, l7, l8, l9;
__m128i l5 = _mm_loadu_si128((__m128i *)(data + 2 * stride));
__m128i l10 = _mm_loadu_si128((__m128i *)(data + 3 * stride));
__m128i l11 = _mm_loadu_si128((__m128i *)(data + 4 * stride));
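    // In the comments below the loaded rows are labeled a..h: a = l0, b = l1,
    // c = l2, d = l3, e = l4, f = l5, g = l10 and h = l11. Because of the top
    // padding, a, b, c and d all hold input row 0 on entry.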
// a0...a15 | c0...c15
const __m256i s02 =
_mm256_permute2x128_si256(CAST_HI(l0), CAST_HI(l2), 0x20);
// b0...b15 | d0...d15
const __m256i s13 =
_mm256_permute2x128_si256(CAST_HI(l1), CAST_HI(l3), 0x20);
// c0...c15 | e0...e15
const __m256i s24 =
_mm256_permute2x128_si256(CAST_HI(l2), CAST_HI(l4), 0x20);
// d0...d15 | f0...f15
const __m256i s35 =
_mm256_permute2x128_si256(CAST_HI(l3), CAST_HI(l5), 0x20);
// e0...e15 | g0...g15
const __m256i s46 =
_mm256_permute2x128_si256(CAST_HI(l4), CAST_HI(l10), 0x20);
// f0...f15 | h0...h15
const __m256i s57 =
_mm256_permute2x128_si256(CAST_HI(l5), CAST_HI(l11), 0x20);
// a0b0...a7b7 | c0d0...c7d7
s[0] = _mm256_unpacklo_epi8(s02, s13);
// c0d0...c7d7 | e0f0...e7f7
s[1] = _mm256_unpacklo_epi8(s24, s35);
// e0f0...e7f7 | g0h0...g7h7
s[2] = _mm256_unpacklo_epi8(s46, s57);
// a8b8...a15b15 | c8d8...c15d15
s[5] = _mm256_unpackhi_epi8(s02, s13);
// c8d8...c15d15 | e8f8...e15f15
s[6] = _mm256_unpackhi_epi8(s24, s35);
// e8f8...e15f15 | g8h8...g15h15
s[7] = _mm256_unpackhi_epi8(s46, s57);
    // Number of rows processed by the main loop.
const int process_ht = height - remain_row;
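    // Each iteration consumes 4 new input rows and stores 2 output rows
    // (rows i / 2 and i / 2 + 1).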
for (int i = 0; i < process_ht; i += 4) {
PROCESS_RESIZE_Y_WD16
_mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
CAST_LOW(res_8bit0));
_mm_storeu_si128(
(__m128i *)&output[(i / 2) * out_stride + j + out_stride],
_mm256_extracti128_si256(res_8bit0, 1));
      // Load the data required to process the next 4 input rows.
const int idx7 = AOMMIN(height - 1, i + 7);
const int idx8 = AOMMIN(height - 1, i + 8);
l10 = _mm_loadu_si128((__m128i *)(data + idx7 * stride));
l11 = _mm_loadu_si128((__m128i *)(data + idx8 * stride));
const __m256i s810 =
_mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20);
const __m256i s911 =
_mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20);
// i0j0... i7j7 | k0l0... k7l7
s[4] = _mm256_unpacklo_epi8(s810, s911);
// i8j8... i15j15 | k8l8... k15l15
s[9] = _mm256_unpackhi_epi8(s810, s911);
s[0] = s[2];
s[1] = s[3];
s[2] = s[4];
s[5] = s[7];
s[6] = s[8];
s[7] = s[9];
}
    // Process the remaining 4 or 6 rows here.
int i = process_ht;
while (i < height - 1) {
PROCESS_RESIZE_Y_WD16
_mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
CAST_LOW(res_8bit0));
i += 2;
const int is_store_valid = (i < height - 1);
if (is_store_valid)
_mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
_mm256_extracti128_si256(res_8bit0, 1));
i += 2;
      // Check whether any rows remain to be processed. If so, load the data
      // needed for the next iteration.
if (i < height - 1) {
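        // Pad the remaining rows by replicating the bottom-most loaded row
        // (l9).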
l10 = l11 = l9;
const __m256i s810 =
_mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20);
const __m256i s911 =
_mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20);
// i0j0... i7j7 | k0l0... k7l7
s[4] = _mm256_unpacklo_epi8(s810, s911);
// i8j8... i15j15 | k8l8... k15l15
s[9] = _mm256_unpackhi_epi8(s810, s911);
s[0] = s[2];
s[1] = s[3];
s[2] = s[4];
s[5] = s[7];
s[6] = s[8];
s[7] = s[9];
}
}
}
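  // Process a remaining block of 8 columns, if any.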
if (remain_col > 7) {
const int processed_wd = num_col16 * 16;
remain_col = stride % 8;
const uint8_t *data = &intbuf[processed_wd];
const __m128i l3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride));
    // Pad the top 3 rows by replicating the first available row.
const __m128i l0 = l3;
const __m128i l1 = l3;
const __m128i l2 = l3;
const __m128i l4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride));
__m128i l6, l7, l8, l9;
__m128i l5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride));
__m128i l10 = _mm_loadl_epi64((__m128i *)(data + 3 * stride));
__m128i l11 = _mm_loadl_epi64((__m128i *)(data + 4 * stride));
// a0b0...a7b7
const __m128i s01 = _mm_unpacklo_epi8(l0, l1);
// c0d0...c7d7
const __m128i s23 = _mm_unpacklo_epi8(l2, l3);
// e0f0...e7f7
const __m128i s45 = _mm_unpacklo_epi8(l4, l5);
// g0h0...g7h7
__m128i s67 = _mm_unpacklo_epi8(l10, l11);
// a0b0...a7b7 | c0d0...c7d7
s[0] = _mm256_permute2x128_si256(CAST_HI(s01), CAST_HI(s23), 0x20);
// c0d0...c7d7 | e0f0...e7f7
s[1] = _mm256_permute2x128_si256(CAST_HI(s23), CAST_HI(s45), 0x20);
// e0f0...e7f7 | g0h0...g7h7
s[2] = _mm256_permute2x128_si256(CAST_HI(s45), CAST_HI(s67), 0x20);
    // Number of rows processed by the main loop.
const int process_ht = height - remain_row;
for (int i = 0; i < process_ht; i += 4) {
PROCESS_RESIZE_Y_WD8
_mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd],
CAST_LOW(res_a_round_1));
_mm_storel_epi64(
(__m128i *)&output[(i / 2) * out_stride + processed_wd + out_stride],
_mm256_extracti128_si256(res_a_round_1, 1));
const int idx7 = AOMMIN(height - 1, i + 7);
const int idx8 = AOMMIN(height - 1, i + 8);
l10 = _mm_loadl_epi64((__m128i *)(data + idx7 * stride));
l11 = _mm_loadl_epi64((__m128i *)(data + idx8 * stride));
// k0l0... k7l7
const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11);
// i0j0... i7j7 | k0l0... k7l7
s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20);
s[0] = s[2];
s[1] = s[3];
s[2] = s[4];
}
    // Process the remaining 4 or 6 rows here.
int i = process_ht;
while (i < height - 1) {
PROCESS_RESIZE_Y_WD8
_mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd],
CAST_LOW(res_a_round_1));
i += 2;
const int is_store_valid = (i < height - 1);
if (is_store_valid)
_mm_storel_epi64(
(__m128i *)&output[(i / 2) * out_stride + processed_wd],
_mm256_extracti128_si256(res_a_round_1, 1));
i += 2;
      // Check whether any rows remain to be processed. If so, load the data
      // needed for the next iteration.
if (i < height - 1) {
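        // Pad the remaining rows by replicating the bottom-most loaded row
        // (l9).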
l10 = l11 = l9;
// k0l0... k7l7
const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11);
// i0j0... i7j7 | k0l0... k7l7
s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20);
s[0] = s[2];
s[1] = s[3];
s[2] = s[4];
}
}
}
if (remain_col)
return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
stride, stride - remain_col);
return true;
}