av1/common/x86/resize_sse2.c - aom - Git at Google

 /*
  * Copyright (c) 2024, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
  * was not distributed with this source code in the LICENSE file, you can
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 #include <immintrin.h>

 #include "config/av1_rtcd.h"

 #include "av1/common/resize.h"

 #include "aom_dsp/x86/synonyms.h"

 #define PROCESS_RESIZE_Y_WD8                                           \
   /* ah0 ah1 ... ah7 */                                                \
   const __m128i AH = _mm_add_epi16(l0, l7);                            \
   /* bg0 bg1 ... bh7 */                                                \
   const __m128i BG = _mm_add_epi16(l1, l6);                            \
   /* cf0 cf1 ... cf7 */                                                \
   const __m128i CF = _mm_add_epi16(l2, l5);                            \
   /* de0 de1 ... de7 */                                                \
   const __m128i DE = _mm_add_epi16(l3, l4);                            \
                                                                        \
   /* ah0 bg0 ... ah3 bg3 */                                            \
   const __m128i AHBG_low = _mm_unpacklo_epi16(AH, BG);                 \
   /*cf0 de0 ... cf2 de2 */                                             \
   const __m128i CFDE_low = _mm_unpacklo_epi16(CF, DE);                 \
                                                                        \
   /* ah4 bg4... ah7 bg7 */                                             \
   const __m128i AHBG_hi = _mm_unpackhi_epi16(AH, BG);                  \
   /* cf4 de4... cf7 de7 */                                             \
   const __m128i CFDE_hi = _mm_unpackhi_epi16(CF, DE);                  \
                                                                        \
   /* r00 r01 r02 r03 */                                                \
   const __m128i r00 = _mm_madd_epi16(AHBG_low, coeffs_y[0]);           \
   const __m128i r01 = _mm_madd_epi16(CFDE_low, coeffs_y[1]);           \
   __m128i r0 = _mm_add_epi32(r00, r01);                                \
   /* r04 r05 r06 r07 */                                                \
   const __m128i r10 = _mm_madd_epi16(AHBG_hi, coeffs_y[0]);            \
   const __m128i r11 = _mm_madd_epi16(CFDE_hi, coeffs_y[1]);            \
   __m128i r1 = _mm_add_epi32(r10, r11);                                \
                                                                        \
   r0 = _mm_add_epi32(r0, round_const_bits);                            \
   r1 = _mm_add_epi32(r1, round_const_bits);                            \
   r0 = _mm_sra_epi32(r0, round_shift_bits);                            \
   r1 = _mm_sra_epi32(r1, round_shift_bits);                            \
                                                                        \
   /* r00 ... r07 (8 values of each 16bit) */                           \
   const __m128i res_16b = _mm_packs_epi32(r0, r1);                     \
   /* r00 ... r07 | r00 ... r07 (16 values of each 8bit) */             \
   const __m128i res_8b0 = _mm_packus_epi16(res_16b, res_16b);          \
                                                                        \
   __m128i res = _mm_min_epu8(res_8b0, clip_pixel);                     \
   res = _mm_max_epu8(res, zero);                                       \
   _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + j], res); \
                                                                        \
   l0 = l2;                                                             \
   l1 = l3;                                                             \
   l2 = l4;                                                             \
   l3 = l5;                                                             \
   l4 = l6;                                                             \
   l5 = l7;                                                             \
   data += 2 * stride;

 // Expands the four half-filter taps at 'filter' into the two coefficient
 // vectors consumed by the vertical resize kernel.
 //   filter: points to at least four 16-bit taps f0 f1 f2 f3 (8 bytes read).
 //   coeffs: two-element output array;
 //           coeffs[0] holds the 16-bit pair (f3, f2) in every 32-bit lane,
 //           coeffs[1] holds the 16-bit pair (f1, f0) in every 32-bit lane.
 static INLINE void prepare_filter_coeffs(const int16_t *filter,
                                          __m128i *const coeffs /* [2] */) {
   // Low 64 bits: f0 f1 f2 f3; the load zeroes the upper 64 bits.
   const __m128i half_taps = _mm_loadl_epi64((const __m128i *)filter);

   // Swap neighbouring 16-bit taps in the low half: f1 f0 f3 f2.
   const __m128i swapped_pairs =
       _mm_shufflelo_epi16(half_taps, _MM_SHUFFLE(2, 3, 0, 1));

   // Broadcast 32-bit lane 0 (f1 f0) into coeffs[1] and 32-bit lane 1 (f3 f2)
   // into coeffs[0].
   coeffs[1] = _mm_shuffle_epi32(swapped_pairs, _MM_SHUFFLE(0, 0, 0, 0));
   coeffs[0] = _mm_shuffle_epi32(swapped_pairs, _MM_SHUFFLE(1, 1, 1, 1));
 }

 // SSE2 vertical 2:1 downscale of the columns [start_col, stride) of 'intbuf'.
 //
 // Every pair of input rows yields one output row, written to
 // output[(row / 2) * out_stride + col]. Columns are handled eight at a time;
 // columns that do not fill a complete 8-wide group are delegated to
 // av1_resize_vert_dir_c().
 //
 //   intbuf:     source pixels; consecutive rows are 'stride' bytes apart.
 //   output:     destination pixels; consecutive rows are 'out_stride' bytes
 //               apart.
 //   height:     number of input rows.
 //   height2:    only forwarded to av1_resize_vert_dir_c() (presumably the
 //               number of output rows -- confirm against the C version).
 //   stride:     row pitch of 'intbuf' and exclusive upper bound of the
 //               columns processed.
 //   start_col:  first column to process.
 //
 // Returns the result of av1_resize_vert_dir_c() whenever that function is
 // used (odd or small heights, leftover columns); otherwise returns true.
 bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride,
                               int height, int height2, int stride,
                               int start_col) {
   // For the GM tool, the input layer height or width is assured to be an even
   // number. Hence the function 'down2_symodd()' is not invoked and SIMD
   // optimization of the same is not implemented.
   // When the input height is less than 8 and even, the potential input
   // heights are limited to 2, 4, or 6. These scenarios require separate
   // handling due to padding requirements. Invoking the C function here will
   // eliminate the need for conditional statements within the subsequent SIMD
   // code to manage these cases.
   if (height & 1 || height < 8) {
     return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
                                  stride, start_col);
   }

   __m128i coeffs_y[2];
   const int bits = FILTER_BITS;
   const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
   const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
   const uint8_t max_pixel = 255;
   const __m128i clip_pixel = _mm_set1_epi8(max_pixel);
   const __m128i zero = _mm_setzero_si128();
   prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);

   // Count the leftover columns relative to start_col rather than to column 0.
   // Otherwise a start_col that is not a multiple of 8 would let the last
   // 8-wide group read and write past 'stride'. For a start_col that is a
   // multiple of 8 this is the same value as stride % 8.
   const int cols_left = stride - start_col;
   const int remain_col = cols_left > 0 ? cols_left % 8 : 0;
   // First column that the 8-wide SIMD loop does not cover.
   const int simd_end = stride - remain_col;

   for (int j = start_col; j < simd_end; j += 8) {
     uint8_t *data = &intbuf[j];
     // d0 ... d7
     const __m128i l8_3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride));
     // Padding top 3 rows with the last available row at the top.
     // a0 ... a7
     const __m128i l8_0 = l8_3;
     // b0 ... b7
     const __m128i l8_1 = l8_3;
     // c0 ... c7
     const __m128i l8_2 = l8_3;
     // e0 ... e7
     const __m128i l8_4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride));
     // f0 ... f7
     const __m128i l8_5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride));

     // Convert to 16bit as addition of 2 source pixel crosses 8 bit.
     __m128i l0 = _mm_unpacklo_epi8(l8_0, zero);  // A(128bit) = a0 - a7(16 bit)
     __m128i l1 = _mm_unpacklo_epi8(l8_1, zero);  // B(128bit) = b0 - b7(16 bit)
     __m128i l2 = _mm_unpacklo_epi8(l8_2, zero);  // C(128bit) = c0 - c7(16 bit)
     __m128i l3 = _mm_unpacklo_epi8(l8_3, zero);  // D(128bit) = d0 - d7(16 bit)
     __m128i l4 = _mm_unpacklo_epi8(l8_4, zero);  // E(128bit) = e0 - e7(16 bit)
     __m128i l5 = _mm_unpacklo_epi8(l8_5, zero);  // F(128bit) = f0 - f7(16 bit)

     // Increment the pointer such that the loading starts from row G.
     data = data + 3 * stride;
     // The core vertical SIMD processes 2 input rows simultaneously to generate
     // output corresponding to 1 row. To streamline the core loop and eliminate
     // the need for conditional checks, the remaining 4 rows are processed
     // separately.
     for (int i = 0; i < height - 4; i += 2) {
       // g0 ... g7
       __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
       // h0 ... h7
       __m128i l8_7 = _mm_loadl_epi64((__m128i *)(data + stride));
       __m128i l6 = _mm_unpacklo_epi8(l8_6, zero);  // G(128bit):g0-g7(16b)
       __m128i l7 = _mm_unpacklo_epi8(l8_7, zero);  // H(128bit):h0-h7(16b)

       PROCESS_RESIZE_Y_WD8
     }

     // 'data' now points at the last input row. That row is loaded once and
     // reused as bottom padding for the rows the filter needs below the image.
     __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
     __m128i l6 = _mm_unpacklo_epi8(l8_6, zero);
     // Process the last 4 input rows here.
     for (int i = height - 4; i < height; i += 2) {
       __m128i l7 = l6;
       PROCESS_RESIZE_Y_WD8
     }
   }

   // Columns that did not fill a complete 8-wide group go through the C path.
   if (remain_col) {
     return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
                                  stride, simd_end);
   }

   return true;
 }
	/*
	* Copyright (c) 2024, Alliance for Open Media. All rights reserved
	*
	* This source code is subject to the terms of the BSD 2 Clause License and
	* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
	* was not distributed with this source code in the LICENSE file, you can
	* obtain it at www.aomedia.org/license/software. If the Alliance for Open
	* Media Patent License 1.0 was not distributed with this source code in the
	* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
	*/
	#include <immintrin.h>

	#include "config/av1_rtcd.h"

	#include "av1/common/resize.h"

	#include "aom_dsp/x86/synonyms.h"

	// Produces one 8-pixel-wide output row from the eight 16-bit input rows
	// l0..l7 of the enclosing scope, then slides the row window down by two rows.
	//
	// Rows are summed in mirrored pairs (l0+l7, l1+l6, l2+l5, l3+l4). The pair
	// sums are multiply-accumulated against coeffs_y[0] and coeffs_y[1], and the
	// 32-bit results are offset by round_const_bits, arithmetically shifted right
	// by round_shift_bits and packed with saturation down to 8 bits before being
	// stored at output[(i / 2) * out_stride + j].
	//
	// Besides l0..l7 the expansion site must provide: coeffs_y, round_const_bits,
	// round_shift_bits, clip_pixel, zero, output, out_stride, stride, i, j and
	// data. l0..l5 are overwritten, l6 and l7 are only read, and data is advanced
	// by 2 * stride. The macro declares locals, so it can be expanded only once
	// per scope.
	#define PROCESS_RESIZE_Y_WD8 \
	  /* ah0 ah1 ... ah7 */ \
	  const __m128i AH = _mm_add_epi16(l0, l7); \
	  /* bg0 bg1 ... bg7 */ \
	  const __m128i BG = _mm_add_epi16(l1, l6); \
	  /* cf0 cf1 ... cf7 */ \
	  const __m128i CF = _mm_add_epi16(l2, l5); \
	  /* de0 de1 ... de7 */ \
	  const __m128i DE = _mm_add_epi16(l3, l4); \
	  \
	  /* ah0 bg0 ... ah3 bg3 */ \
	  const __m128i AHBG_low = _mm_unpacklo_epi16(AH, BG); \
	  /* cf0 de0 ... cf3 de3 */ \
	  const __m128i CFDE_low = _mm_unpacklo_epi16(CF, DE); \
	  \
	  /* ah4 bg4... ah7 bg7 */ \
	  const __m128i AHBG_hi = _mm_unpackhi_epi16(AH, BG); \
	  /* cf4 de4... cf7 de7 */ \
	  const __m128i CFDE_hi = _mm_unpackhi_epi16(CF, DE); \
	  \
	  /* r00 r01 r02 r03 */ \
	  const __m128i r00 = _mm_madd_epi16(AHBG_low, coeffs_y[0]); \
	  const __m128i r01 = _mm_madd_epi16(CFDE_low, coeffs_y[1]); \
	  __m128i r0 = _mm_add_epi32(r00, r01); \
	  /* r04 r05 r06 r07 */ \
	  const __m128i r10 = _mm_madd_epi16(AHBG_hi, coeffs_y[0]); \
	  const __m128i r11 = _mm_madd_epi16(CFDE_hi, coeffs_y[1]); \
	  __m128i r1 = _mm_add_epi32(r10, r11); \
	  \
	  /* Add the rounding offset, then arithmetic shift right. */ \
	  r0 = _mm_add_epi32(r0, round_const_bits); \
	  r1 = _mm_add_epi32(r1, round_const_bits); \
	  r0 = _mm_sra_epi32(r0, round_shift_bits); \
	  r1 = _mm_sra_epi32(r1, round_shift_bits); \
	  \
	  /* r00 ... r07 (8 values of each 16bit) */ \
	  const __m128i res_16b = _mm_packs_epi32(r0, r1); \
	  /* r00 ... r07 | r00 ... r07 (16 values of each 8bit) */ \
	  const __m128i res_8b0 = _mm_packus_epi16(res_16b, res_16b); \
	  \
	  /* Clamp against clip_pixel and zero from the expansion site. */ \
	  __m128i res = _mm_min_epu8(res_8b0, clip_pixel); \
	  res = _mm_max_epu8(res, zero); \
	  _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + j], res); \
	  \
	  /* Slide the six-row history down by two input rows. */ \
	  l0 = l2; \
	  l1 = l3; \
	  l2 = l4; \
	  l3 = l5; \
	  l4 = l6; \
	  l5 = l7; \
	  data += 2 * stride;

	// Expands the four half-filter taps at 'filter' into the two coefficient
	// vectors consumed by the vertical resize kernel.
	//   filter: points to at least four 16-bit taps f0 f1 f2 f3 (8 bytes read).
	//   coeffs: two-element output array;
	//           coeffs[0] holds the 16-bit pair (f3, f2) in every 32-bit lane,
	//           coeffs[1] holds the 16-bit pair (f1, f0) in every 32-bit lane.
	static INLINE void prepare_filter_coeffs(const int16_t *filter,
	                                         __m128i *const coeffs /* [2] */) {
	  // f0 f1 f2 f3 x x x x
	  const __m128i sym_even_filter = _mm_loadl_epi64((const __m128i *)filter);

	  // f1 f0 f3 f2 x x x x
	  const __m128i tmp1 = _mm_shufflelo_epi16(sym_even_filter, 0xb1);

	  // f3 f2 f3 f2 ...
	  coeffs[0] = _mm_shuffle_epi32(tmp1, 0x55);
	  // f1 f0 f1 f0 ...
	  coeffs[1] = _mm_shuffle_epi32(tmp1, 0x00);
	}

	// SSE2 vertical 2:1 downscale of the columns [start_col, stride) of 'intbuf'.
	//
	// Every pair of input rows yields one output row, written to
	// output[(row / 2) * out_stride + col]. Columns are handled eight at a time;
	// columns that do not fill a complete 8-wide group are delegated to
	// av1_resize_vert_dir_c().
	//
	//   intbuf:     source pixels; consecutive rows are 'stride' bytes apart.
	//   output:     destination pixels; consecutive rows are 'out_stride' bytes
	//               apart.
	//   height:     number of input rows.
	//   height2:    only forwarded to av1_resize_vert_dir_c() (presumably the
	//               number of output rows -- confirm against the C version).
	//   stride:     row pitch of 'intbuf' and exclusive upper bound of the
	//               columns processed.
	//   start_col:  first column to process.
	//
	// Returns the result of av1_resize_vert_dir_c() whenever that function is
	// used (odd or small heights, leftover columns); otherwise returns true.
	bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride,
	                              int height, int height2, int stride,
	                              int start_col) {
	  // For the GM tool, the input layer height or width is assured to be an even
	  // number. Hence the function 'down2_symodd()' is not invoked and SIMD
	  // optimization of the same is not implemented.
	  // When the input height is less than 8 and even, the potential input
	  // heights are limited to 2, 4, or 6. These scenarios require separate
	  // handling due to padding requirements. Invoking the C function here will
	  // eliminate the need for conditional statements within the subsequent SIMD
	  // code to manage these cases.
	  if (height & 1 || height < 8) {
	    return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
	                                 stride, start_col);
	  }

	  __m128i coeffs_y[2];
	  const int bits = FILTER_BITS;
	  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
	  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
	  const uint8_t max_pixel = 255;
	  const __m128i clip_pixel = _mm_set1_epi8(max_pixel);
	  const __m128i zero = _mm_setzero_si128();
	  prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);

	  // Count the leftover columns relative to start_col rather than to column 0.
	  // Otherwise a start_col that is not a multiple of 8 would let the last
	  // 8-wide group read and write past 'stride'. For a start_col that is a
	  // multiple of 8 this is the same value as stride % 8.
	  const int cols_left = stride - start_col;
	  const int remain_col = cols_left > 0 ? cols_left % 8 : 0;
	  // First column that the 8-wide SIMD loop does not cover.
	  const int simd_end = stride - remain_col;

	  for (int j = start_col; j < simd_end; j += 8) {
	    uint8_t *data = &intbuf[j];
	    // d0 ... d7
	    const __m128i l8_3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride));
	    // Padding top 3 rows with the last available row at the top.
	    // a0 ... a7
	    const __m128i l8_0 = l8_3;
	    // b0 ... b7
	    const __m128i l8_1 = l8_3;
	    // c0 ... c7
	    const __m128i l8_2 = l8_3;
	    // e0 ... e7
	    const __m128i l8_4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride));
	    // f0 ... f7
	    const __m128i l8_5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride));

	    // Convert to 16bit as addition of 2 source pixel crosses 8 bit.
	    __m128i l0 = _mm_unpacklo_epi8(l8_0, zero);  // A(128bit) = a0 - a7(16 bit)
	    __m128i l1 = _mm_unpacklo_epi8(l8_1, zero);  // B(128bit) = b0 - b7(16 bit)
	    __m128i l2 = _mm_unpacklo_epi8(l8_2, zero);  // C(128bit) = c0 - c7(16 bit)
	    __m128i l3 = _mm_unpacklo_epi8(l8_3, zero);  // D(128bit) = d0 - d7(16 bit)
	    __m128i l4 = _mm_unpacklo_epi8(l8_4, zero);  // E(128bit) = e0 - e7(16 bit)
	    __m128i l5 = _mm_unpacklo_epi8(l8_5, zero);  // F(128bit) = f0 - f7(16 bit)

	    // Increment the pointer such that the loading starts from row G.
	    data = data + 3 * stride;
	    // The core vertical SIMD processes 2 input rows simultaneously to generate
	    // output corresponding to 1 row. To streamline the core loop and eliminate
	    // the need for conditional checks, the remaining 4 rows are processed
	    // separately.
	    for (int i = 0; i < height - 4; i += 2) {
	      // g0 ... g7
	      __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
	      // h0 ... h7
	      __m128i l8_7 = _mm_loadl_epi64((__m128i *)(data + stride));
	      __m128i l6 = _mm_unpacklo_epi8(l8_6, zero);  // G(128bit):g0-g7(16b)
	      __m128i l7 = _mm_unpacklo_epi8(l8_7, zero);  // H(128bit):h0-h7(16b)

	      PROCESS_RESIZE_Y_WD8
	    }

	    // 'data' now points at the last input row. That row is loaded once and
	    // reused as bottom padding for the rows the filter needs below the image.
	    __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
	    __m128i l6 = _mm_unpacklo_epi8(l8_6, zero);
	    // Process the last 4 input rows here.
	    for (int i = height - 4; i < height; i += 2) {
	      __m128i l7 = l6;
	      PROCESS_RESIZE_Y_WD8
	    }
	  }

	  // Columns that did not fill a complete 8-wide group go through the C path.
	  if (remain_col) {
	    return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
	                                 stride, simd_end);
	  }

	  return true;
	}