// av1/common/x86/av1_txfm_sse2.h - avm - Git at Google

 /*
  * Copyright (c) 2021, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 3-Clause Clear License
  * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
  * License was not distributed with this source code in the LICENSE file, you
  * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
  * Alliance for Open Media Patent License 1.0 was not distributed with this
  * source code in the PATENTS file, you can obtain it at
  * aomedia.org/license/patent-license/.
  */
 #ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
 #define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_

 #include <emmintrin.h>  // SSE2

 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"

 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/transpose_sse2.h"
 #include "aom_dsp/x86/txfm_common_sse2.h"
 #include "av1/common/av1_txfm.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 // Butterfly on the low four 16-bit lanes of *in0 and *in1.
 // The two inputs are interleaved, multiply-accumulated against the paired
 // weights *w0 and *w1, offset by __rounding, shifted right arithmetically by
 // cos_bit and packed back to 16 bits with signed saturation.
 //   *out0: both 64-bit halves hold the *w0 result.
 //   *out1: low half holds the *w1 result, high half holds the *w0 result.
 // All inputs are read before either output is written.
 // NOTE(review): the btf_16_4p_sse2 macro fills both halves of out1 with the
 // w1 result, whereas this helper puts the w0 result in the high half;
 // presumably callers only consume the low half -- confirm before changing.
 static INLINE void btf_16_w4_sse2(
     const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
     const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
     __m128i *const out0, __m128i *const out1) {
   const __m128i interleaved = _mm_unpacklo_epi16(*in0, *in1);
   const __m128i res_w0 = _mm_srai_epi32(
       _mm_add_epi32(_mm_madd_epi16(interleaved, *w0), __rounding), cos_bit);
   const __m128i res_w1 = _mm_srai_epi32(
       _mm_add_epi32(_mm_madd_epi16(interleaved, *w1), __rounding), cos_bit);

   *out0 = _mm_packs_epi32(res_w0, res_w0);
   *out1 = _mm_packs_epi32(res_w1, res_w0);
 }

 // Butterfly macro for the low four 16-bit lanes of in0 / in1.
 // Expands to a brace-enclosed block that interleaves in0 with in1, forms the
 // multiply-add of that vector against w0 and against w1, adds __rounding,
 // shifts right arithmetically by cos_bit and packs each 32-bit result to
 // 16 bits with signed saturation, duplicated into both halves of out0 / out1.
 // `__rounding` and `cos_bit` are NOT macro parameters: they must be visible
 // in the scope where the macro is expanded.
 // The block declares locals t0, u0, v0, a0, b0, c0, d0, so arguments using
 // those names would be shadowed.
 #define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \
   {                                                  \
     __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
     __m128i u0 = _mm_madd_epi16(t0, w0);             \
     __m128i v0 = _mm_madd_epi16(t0, w1);             \
                                                      \
     __m128i a0 = _mm_add_epi32(u0, __rounding);      \
     __m128i b0 = _mm_add_epi32(v0, __rounding);      \
                                                      \
     __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
     __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
                                                      \
     out0 = _mm_packs_epi32(c0, c0);                  \
     out1 = _mm_packs_epi32(d0, d0);                  \
   }

 // Butterfly macro over all eight 16-bit lanes of in0 / in1.
 // Expands to a brace-enclosed block that interleaves the low and the high
 // halves of in0 and in1, forms the multiply-add of each against w0 and w1,
 // adds __rounding, shifts right arithmetically by cos_bit and packs the
 // low/high 32-bit results back into one 16-bit vector per output with signed
 // saturation (out0 from w0, out1 from w1).
 // `__rounding` and `cos_bit` are NOT macro parameters: they must be visible
 // in the scope where the macro is expanded.
 // in0, in1, w0 and w1 each appear twice in the expansion, so arguments with
 // side effects are evaluated twice. The block declares locals t0/t1, u0/u1,
 // v0/v1, a0/a1, b0/b1, c0/c1, d0/d1, which would shadow same-named arguments.
 #define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
   {                                               \
     __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
     __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
     __m128i u0 = _mm_madd_epi16(t0, w0);          \
     __m128i u1 = _mm_madd_epi16(t1, w0);          \
     __m128i v0 = _mm_madd_epi16(t0, w1);          \
     __m128i v1 = _mm_madd_epi16(t1, w1);          \
                                                   \
     __m128i a0 = _mm_add_epi32(u0, __rounding);   \
     __m128i a1 = _mm_add_epi32(u1, __rounding);   \
     __m128i b0 = _mm_add_epi32(v0, __rounding);   \
     __m128i b1 = _mm_add_epi32(v1, __rounding);   \
                                                   \
     __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
     __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
     __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
     __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
                                                   \
     out0 = _mm_packs_epi32(c0, c1);               \
     out1 = _mm_packs_epi32(d0, d1);               \
   }

 // Reads eight consecutive int16_t values starting at `a` using an unaligned
 // 128-bit load and returns them as one vector.
 static INLINE __m128i load_16bit_to_16bit(const int16_t *a) {
   const __m128i *const src = (const __m128i *)a;
   return _mm_loadu_si128(src);
 }

 // Reads eight consecutive int32_t values starting at `a` and narrows them to
 // eight int16_t lanes with signed saturation (a[0..3] in the low half,
 // a[4..7] in the high half).
 // Both halves are fetched with unaligned loads. The upper half used to be
 // read through a plain __m128i dereference, which demands that a + 4 be
 // 16-byte aligned even though the lower half was already loaded with
 // _mm_loadu_si128; the two accesses now share the same (no-alignment)
 // requirement.
 static INLINE __m128i load_32bit_to_16bit(const int32_t *a) {
   const __m128i a_low = _mm_loadu_si128((const __m128i *)a);
   const __m128i a_high = _mm_loadu_si128((const __m128i *)(a + 4));
   return _mm_packs_epi32(a_low, a_high);
 }

 // Reads four int32_t values from `a` (unaligned load) and narrows them to
 // int16_t with signed saturation; the same four results fill both 64-bit
 // halves of the returned vector.
 static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) {
   const __m128i four_i32 = _mm_loadu_si128((const __m128i *)a);
   return _mm_packs_epi32(four_i32, four_i32);
 }

 // Sign-extends the low four 16-bit lanes of `a` to 32 bits and writes them
 // to `b` with an aligned 128-bit store (so `b` must be 16-byte aligned).
 static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) {
   // Interleaving `a` with itself puts each value in the top half of a 32-bit
   // slot; the arithmetic shift by 16 then yields the sign-extended value.
   const __m128i widened = _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
   _mm_store_si128((__m128i *)b, widened);
 }

 // Sign-extends all eight 16-bit lanes of `a` to 32 bits and writes them to
 // b[0..7] with two aligned 128-bit stores (so `b` must be 16-byte aligned).
 static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) {
   __m128i *const dst = (__m128i *)b;
   // Self-interleave followed by an arithmetic shift by 16 sign-extends.
   const __m128i lanes_0_3 = _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
   const __m128i lanes_4_7 = _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
   _mm_store_si128(dst, lanes_0_3);
   _mm_store_si128(dst + 1, lanes_4_7);  // dst + 1 is the address of b + 4
 }

 // Multiply-adds the 16-bit pairs of `a` against the pair
 // (scale, 1 << (NewSqrt2Bits - 1)) and shifts each 32-bit sum right
 // arithmetically by NewSqrt2Bits.
 // NOTE(review): presumably pair_set_epi16 puts `scale` in the even lanes and
 // the rounding constant in the odd lanes -- confirm in txfm_common_sse2.h.
 static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) {
   const __m128i weights = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1));
   return _mm_srai_epi32(_mm_madd_epi16(a, weights), NewSqrt2Bits);
 }

 // Scales the low four 16-bit lanes of `a` through scale_round_sse2 with
 // NewSqrt2 and writes the four 32-bit results to `b` with an aligned
 // 128-bit store (so `b` must be 16-byte aligned).
 static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a,
                                                 int32_t *const b) {
   // Each sample is paired with a constant 1 before the multiply-add inside
   // scale_round_sse2; presumably the 1 picks up the rounding term there.
   const __m128i paired = _mm_unpacklo_epi16(a, _mm_set1_epi16(1));
   _mm_store_si128((__m128i *)b, scale_round_sse2(paired, NewSqrt2));
 }

 // Scales all eight 16-bit lanes of `a` through scale_round_sse2 with
 // NewSqrt2 and writes the eight 32-bit results to b[0..7] with two aligned
 // 128-bit stores (so `b` must be 16-byte aligned).
 static INLINE void store_rect_16bit_to_32bit(const __m128i a,
                                              int32_t *const b) {
   // Each sample is paired with a constant 1 before the multiply-add inside
   // scale_round_sse2; presumably the 1 picks up the rounding term there.
   const __m128i ones = _mm_set1_epi16(1);
   const __m128i scaled_lo =
       scale_round_sse2(_mm_unpacklo_epi16(a, ones), NewSqrt2);
   const __m128i scaled_hi =
       scale_round_sse2(_mm_unpackhi_epi16(a, ones), NewSqrt2);
   _mm_store_si128((__m128i *)b, scaled_lo);
   _mm_store_si128((__m128i *)(b + 4), scaled_hi);
 }

 // Fills out[0..out_size-1]: entry r receives the four int16_t values found at
 // in + r * stride, loaded as 64 bits into the low half of the vector (the
 // upper half is zeroed by _mm_loadl_epi64).
 static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
                                                  const int stride,
                                                  __m128i *const out,
                                                  const int out_size) {
   int row = 0;
   while (row < out_size) {
     const int16_t *const src = in + row * stride;
     out[row] = _mm_loadl_epi64((const __m128i *)src);
     ++row;
   }
 }

 // Same 64-bit row loads as the non-flipped w4 loader, but stored in reverse:
 // the row at in + r * stride lands in out[out_size - 1 - r].
 static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
                                                       const int stride,
                                                       __m128i *const out,
                                                       const int out_size) {
   for (int row = 0; row < out_size; ++row) {
     const int dst = out_size - 1 - row;
     const int16_t *const src = in + row * stride;
     out[dst] = _mm_loadl_epi64((const __m128i *)src);
   }
 }

 // Fills out[0..out_size-1]: entry r receives the eight int16_t values found
 // at in + r * stride, via load_16bit_to_16bit (an unaligned 128-bit load).
 static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
                                               __m128i *out, int out_size) {
   int row = 0;
   while (row < out_size) {
     const int16_t *const src = in + row * stride;
     out[row] = load_16bit_to_16bit(src);
     ++row;
   }
 }

 // Loads out_size rows of eight int16_t values with load_16bit_to_16bit and
 // stores them in reverse order: the row at in + r * stride lands in
 // out[out_size - 1 - r].
 static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
                                                    int stride, __m128i *out,
                                                    int out_size) {
   for (int row = 0; row < out_size; ++row) {
     const int dst = out_size - 1 - row;
     out[dst] = load_16bit_to_16bit(in + row * stride);
   }
 }

 // Fills out[0..out_size-1]: entry r receives the eight int32_t values found
 // at in + r * stride, narrowed to 16 bits by load_32bit_to_16bit.
 static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride,
                                               __m128i *out, int out_size) {
   int row = 0;
   while (row < out_size) {
     const int32_t *const src = in + row * stride;
     out[row] = load_32bit_to_16bit(src);
     ++row;
   }
 }

 // Fills out[0..out_size-1]: entry r receives the four int32_t values found at
 // in + r * stride, narrowed to 16 bits by load_32bit_to_16bit_w4.
 static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride,
                                                  __m128i *out, int out_size) {
   int row = 0;
   while (row < out_size) {
     const int32_t *const src = in + row * stride;
     out[row] = load_32bit_to_16bit_w4(src);
     ++row;
   }
 }

 // Loads out_size rows of eight int32_t values, narrowed to 16 bits by
 // load_32bit_to_16bit, and stores them in reverse order: the row at
 // in + r * stride lands in out[out_size - 1 - r].
 static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in,
                                                    int stride, __m128i *out,
                                                    int out_size) {
   for (int row = 0; row < out_size; ++row) {
     const int dst = out_size - 1 - row;
     out[dst] = load_32bit_to_16bit(in + row * stride);
   }
 }

 // For each r in [0, out_size): writes the low four lanes of in[r],
 // sign-extended to 32 bits by store_16bit_to_32bit_w4, at out + r * stride.
 static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in,
                                                   int32_t *const out,
                                                   const int stride,
                                                   const int out_size) {
   int row = 0;
   while (row < out_size) {
     int32_t *const dst = out + row * stride;
     store_16bit_to_32bit_w4(in[row], dst);
     ++row;
   }
 }

 // For each r in [0, out_size): writes all eight lanes of in[r],
 // sign-extended to 32 bits by store_16bit_to_32bit, at out + r * stride.
 static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in,
                                                   int32_t *const out,
                                                   const int stride,
                                                   const int out_size) {
   int row = 0;
   while (row < out_size) {
     int32_t *const dst = out + row * stride;
     store_16bit_to_32bit(in[row], dst);
     ++row;
   }
 }

 // For each r in [0, out_size): passes in[r] to store_rect_16bit_to_32bit_w4,
 // which writes four scaled 32-bit values at out + r * stride.
 static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in,
                                                        int32_t *const out,
                                                        const int stride,
                                                        const int out_size) {
   int row = 0;
   while (row < out_size) {
     int32_t *const dst = out + row * stride;
     store_rect_16bit_to_32bit_w4(in[row], dst);
     ++row;
   }
 }

 // For each r in [0, out_size): passes in[r] to store_rect_16bit_to_32bit,
 // which writes eight scaled 32-bit values at out + r * stride.
 static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in,
                                                        int32_t *const out,
                                                        const int stride,
                                                        const int out_size) {
   int row = 0;
   while (row < out_size) {
     int32_t *const dst = out + row * stride;
     store_rect_16bit_to_32bit(in[row], dst);
     ++row;
   }
 }

 // Writes the eight vectors in[0..7] as eight rows of eight 16-bit values;
 // row r goes to out + r * stride using an aligned 128-bit store, so every
 // row address must be 16-byte aligned.
 static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in,
                                                    uint16_t *out,
                                                    const int stride) {
   int row = 0;
   while (row < 8) {
     __m128i *const dst = (__m128i *)(out + row * stride);
     _mm_store_si128(dst, in[row]);
     ++row;
   }
 }

 // Shifts every 16-bit lane of in[0..size-1] in place.
 //   bit > 0: logical left shift by `bit`.
 //   bit < 0: adds 1 << (-bit - 1) with signed saturation, then shifts right
 //            arithmetically by -bit (a rounding right shift).
 //   bit == 0: leaves the buffer untouched.
 static INLINE void round_shift_16bit(__m128i *in, int size, int bit) {
   if (bit == 0) {
     return;
   }

   if (bit > 0) {
     for (int idx = 0; idx < size; ++idx) {
       in[idx] = _mm_slli_epi16(in[idx], bit);
     }
     return;
   }

   const int shift = -bit;
   const __m128i half = _mm_set1_epi16(1 << (shift - 1));
   for (int idx = 0; idx < size; ++idx) {
     const __m128i biased = _mm_adds_epi16(in[idx], half);
     in[idx] = _mm_srai_epi16(biased, shift);
   }
 }

 // Copies in[0..size-1] into `out` in reverse order: in[k] is written to
 // out[size - 1 - k]. Elements are processed with k ascending.
 static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
   int src = 0;
   while (src < size) {
     const int dst = size - 1 - src;
     out[dst] = in[src];
     ++src;
   }
 }

 #if CONFIG_ADST_TUNED
 // Interleaves in0 with in1 and w0 with w1 (low halves, then high halves) and
 // multiply-adds the matching interleaved vectors:
 //   *out0 = madd(unpacklo(in0, in1), unpacklo(w0, w1))
 //   *out1 = madd(unpackhi(in0, in1), unpackhi(w0, w1))
 // Each output holds four 32-bit sums; no rounding or shifting is applied.
 static INLINE void matrix_coef_mult_sse2(const __m128i w0, const __m128i w1,
                                          const __m128i in0, const __m128i in1,
                                          __m128i *out0, __m128i *out1) {
   const __m128i samples_lo = _mm_unpacklo_epi16(in0, in1);
   const __m128i weights_lo = _mm_unpacklo_epi16(w0, w1);
   const __m128i samples_hi = _mm_unpackhi_epi16(in0, in1);
   const __m128i weights_hi = _mm_unpackhi_epi16(w0, w1);

   *out0 = _mm_madd_epi16(samples_lo, weights_lo);
   *out1 = _mm_madd_epi16(samples_hi, weights_hi);
 }
 #endif  // CONFIG_ADST_TUNED

 // Declarations of the av1_lowbd_fwd_txfm2d_<A>x<B>_sse2 entry points; the
 // definitions are not part of this header. Every function shares one
 // signature: a const int16_t input pointer, an int32_t output pointer, an
 // int stride, a TX_TYPE selector and an int bd.
 // NOTE(review): presumably <A>x<B> is the transform block size, `stride` is
 // the input row pitch in samples and `bd` the bit depth -- confirm against
 // the definitions.
 void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
                                      int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
                                      int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
                                      int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
                                      int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
                                      int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
                                      int stride, TX_TYPE tx_type, int bd);

 // The following sizes are declared only when CONFIG_FLEX_PARTITION is set.
 #if CONFIG_FLEX_PARTITION
 void av1_lowbd_fwd_txfm2d_4x32_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_32x4_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_8x64_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_64x8_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_4x64_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

 void av1_lowbd_fwd_txfm2d_64x4_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);
 #endif  // CONFIG_FLEX_PARTITION

 // Pointer to a 1-D transform kernel: reads an array of __m128i from `input`,
 // writes an array of __m128i to `output`, and takes an int8_t `cos_bit`.
 // NOTE(review): presumably cos_bit is the right-shift amount used by the
 // butterfly helpers in this header -- confirm against the kernels.
 typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output,
                                   int8_t cos_bit);

 // Pair of 1-D kernels making up a 2-D transform.
 typedef struct {
   transform_1d_sse2 col, row;  // vertical and horizontal
 } transform_2d_sse2;

 #ifdef __cplusplus
 }
 #endif  // __cplusplus
 #endif  // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
	/*
	* Copyright (c) 2021, Alliance for Open Media. All rights reserved
	*
	* This source code is subject to the terms of the BSD 3-Clause Clear License
	* and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
	* License was not distributed with this source code in the LICENSE file, you
	* can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
	* Alliance for Open Media Patent License 1.0 was not distributed with this
	* source code in the PATENTS file, you can obtain it at
	* aomedia.org/license/patent-license/.
	*/
	#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
	#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_

	#include <emmintrin.h> // SSE2

	#include "config/aom_config.h"
	#include "config/av1_rtcd.h"

	#include "aom/aom_integer.h"
	#include "aom_dsp/x86/transpose_sse2.h"
	#include "aom_dsp/x86/txfm_common_sse2.h"
	#include "av1/common/av1_txfm.h"

	#ifdef __cplusplus
	extern "C" {
	#endif

	// Butterfly on the low four 16-bit lanes of *in0 and *in1.
	// The two inputs are interleaved, multiply-accumulated against the paired
	// weights *w0 and *w1, offset by __rounding, shifted right arithmetically by
	// cos_bit and packed back to 16 bits with signed saturation.
	//   *out0: both 64-bit halves hold the *w0 result.
	//   *out1: low half holds the *w1 result, high half holds the *w0 result.
	// The pointer declarators on w0, w1, in0, in1, out0, out1 and the
	// dereferences of in0 / in1 had been stripped from this copy; they are
	// restored here so the body type-checks.
	// NOTE(review): the btf_16_4p_sse2 macro fills both halves of out1 with the
	// w1 result, whereas this helper puts the w0 result in the high half;
	// presumably callers only consume the low half -- confirm before changing.
	static INLINE void btf_16_w4_sse2(
	    const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
	    const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
	    __m128i *const out0, __m128i *const out1) {
	  const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1);
	  const __m128i u0 = _mm_madd_epi16(t0, *w0);
	  const __m128i v0 = _mm_madd_epi16(t0, *w1);
	  const __m128i a0 = _mm_add_epi32(u0, __rounding);
	  const __m128i b0 = _mm_add_epi32(v0, __rounding);
	  const __m128i c0 = _mm_srai_epi32(a0, cos_bit);
	  const __m128i d0 = _mm_srai_epi32(b0, cos_bit);

	  *out0 = _mm_packs_epi32(c0, c0);
	  *out1 = _mm_packs_epi32(d0, c0);
	}

	// Butterfly macro for the low four 16-bit lanes of in0 / in1.
	// Expands to a brace-enclosed block that interleaves in0 with in1, forms the
	// multiply-add of that vector against w0 and against w1, adds __rounding,
	// shifts right arithmetically by cos_bit and packs each 32-bit result to
	// 16 bits with signed saturation, duplicated into both halves of out0 / out1.
	// `__rounding` and `cos_bit` are NOT macro parameters: they must be visible
	// in the scope where the macro is expanded.
	// The block declares locals t0, u0, v0, a0, b0, c0, d0, so arguments using
	// those names would be shadowed.
	#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \
	{ \
	__m128i t0 = _mm_unpacklo_epi16(in0, in1); \
	__m128i u0 = _mm_madd_epi16(t0, w0); \
	__m128i v0 = _mm_madd_epi16(t0, w1); \
	\
	__m128i a0 = _mm_add_epi32(u0, __rounding); \
	__m128i b0 = _mm_add_epi32(v0, __rounding); \
	\
	__m128i c0 = _mm_srai_epi32(a0, cos_bit); \
	__m128i d0 = _mm_srai_epi32(b0, cos_bit); \
	\
	out0 = _mm_packs_epi32(c0, c0); \
	out1 = _mm_packs_epi32(d0, d0); \
	}

	// Butterfly macro over all eight 16-bit lanes of in0 / in1.
	// Expands to a brace-enclosed block that interleaves the low and the high
	// halves of in0 and in1, forms the multiply-add of each against w0 and w1,
	// adds __rounding, shifts right arithmetically by cos_bit and packs the
	// low/high 32-bit results back into one 16-bit vector per output with signed
	// saturation (out0 from w0, out1 from w1).
	// `__rounding` and `cos_bit` are NOT macro parameters: they must be visible
	// in the scope where the macro is expanded.
	// in0, in1, w0 and w1 each appear twice in the expansion, so arguments with
	// side effects are evaluated twice. The block declares locals t0/t1, u0/u1,
	// v0/v1, a0/a1, b0/b1, c0/c1, d0/d1, which would shadow same-named arguments.
	#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
	{ \
	__m128i t0 = _mm_unpacklo_epi16(in0, in1); \
	__m128i t1 = _mm_unpackhi_epi16(in0, in1); \
	__m128i u0 = _mm_madd_epi16(t0, w0); \
	__m128i u1 = _mm_madd_epi16(t1, w0); \
	__m128i v0 = _mm_madd_epi16(t0, w1); \
	__m128i v1 = _mm_madd_epi16(t1, w1); \
	\
	__m128i a0 = _mm_add_epi32(u0, __rounding); \
	__m128i a1 = _mm_add_epi32(u1, __rounding); \
	__m128i b0 = _mm_add_epi32(v0, __rounding); \
	__m128i b1 = _mm_add_epi32(v1, __rounding); \
	\
	__m128i c0 = _mm_srai_epi32(a0, cos_bit); \
	__m128i c1 = _mm_srai_epi32(a1, cos_bit); \
	__m128i d0 = _mm_srai_epi32(b0, cos_bit); \
	__m128i d1 = _mm_srai_epi32(b1, cos_bit); \
	\
	out0 = _mm_packs_epi32(c0, c1); \
	out1 = _mm_packs_epi32(d0, d1); \
	}

	// Reads eight consecutive int16_t values starting at `a` using an unaligned
	// 128-bit load and returns them as one vector.
	static INLINE __m128i load_16bit_to_16bit(const int16_t *a) {
	  const __m128i *const src = (const __m128i *)a;
	  return _mm_loadu_si128(src);
	}

	// Reads eight consecutive int32_t values starting at `a` and narrows them to
	// eight int16_t lanes with signed saturation (a[0..3] in the low half,
	// a[4..7] in the high half).
	// The second operand had degenerated into `(const __m128i )(a + 4)`, a cast
	// of a pointer to a vector type, which does not compile. The upper half is
	// now fetched with an explicit unaligned load, matching the lower half, so
	// neither access requires 16-byte alignment.
	static INLINE __m128i load_32bit_to_16bit(const int32_t *a) {
	  const __m128i a_low = _mm_loadu_si128((const __m128i *)a);
	  const __m128i a_high = _mm_loadu_si128((const __m128i *)(a + 4));
	  return _mm_packs_epi32(a_low, a_high);
	}

	// Reads four int32_t values from `a` (unaligned load) and narrows them to
	// int16_t with signed saturation; the same four results fill both 64-bit
	// halves of the returned vector.
	static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) {
	  const __m128i four_i32 = _mm_loadu_si128((const __m128i *)a);
	  return _mm_packs_epi32(four_i32, four_i32);
	}

	// Sign-extends the low four 16-bit lanes of `a` to 32 bits and writes them
	// to `b` with an aligned 128-bit store (so `b` must be 16-byte aligned).
	static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) {
	  // Interleaving `a` with itself puts each value in the top half of a 32-bit
	  // slot; the arithmetic shift by 16 then yields the sign-extended value.
	  const __m128i widened = _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
	  _mm_store_si128((__m128i *)b, widened);
	}

	// Sign-extends all eight 16-bit lanes of `a` to 32 bits and writes them to
	// b[0..7] with two aligned 128-bit stores (so `b` must be 16-byte aligned).
	static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) {
	  __m128i *const dst = (__m128i *)b;
	  // Self-interleave followed by an arithmetic shift by 16 sign-extends.
	  const __m128i lanes_0_3 = _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
	  const __m128i lanes_4_7 = _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
	  _mm_store_si128(dst, lanes_0_3);
	  _mm_store_si128(dst + 1, lanes_4_7);  // dst + 1 is the address of b + 4
	}

	// Multiply-adds the 16-bit pairs of `a` against the pair
	// (scale, 1 << (NewSqrt2Bits - 1)) and shifts each 32-bit sum right
	// arithmetically by NewSqrt2Bits.
	// NOTE(review): presumably pair_set_epi16 puts `scale` in the even lanes and
	// the rounding constant in the odd lanes -- confirm in txfm_common_sse2.h.
	static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) {
	  const __m128i weights = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1));
	  return _mm_srai_epi32(_mm_madd_epi16(a, weights), NewSqrt2Bits);
	}

	// Scales the low four 16-bit lanes of `a` through scale_round_sse2 with
	// NewSqrt2 and writes the four 32-bit results to `b` with an aligned
	// 128-bit store (so `b` must be 16-byte aligned).
	static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a,
	                                                int32_t *const b) {
	  // Each sample is paired with a constant 1 before the multiply-add inside
	  // scale_round_sse2; presumably the 1 picks up the rounding term there.
	  const __m128i paired = _mm_unpacklo_epi16(a, _mm_set1_epi16(1));
	  _mm_store_si128((__m128i *)b, scale_round_sse2(paired, NewSqrt2));
	}

	// Scales all eight 16-bit lanes of `a` through scale_round_sse2 with
	// NewSqrt2 and writes the eight 32-bit results to b[0..7] with two aligned
	// 128-bit stores (so `b` must be 16-byte aligned).
	static INLINE void store_rect_16bit_to_32bit(const __m128i a,
	                                             int32_t *const b) {
	  // Each sample is paired with a constant 1 before the multiply-add inside
	  // scale_round_sse2; presumably the 1 picks up the rounding term there.
	  const __m128i ones = _mm_set1_epi16(1);
	  const __m128i scaled_lo =
	      scale_round_sse2(_mm_unpacklo_epi16(a, ones), NewSqrt2);
	  const __m128i scaled_hi =
	      scale_round_sse2(_mm_unpackhi_epi16(a, ones), NewSqrt2);
	  _mm_store_si128((__m128i *)b, scaled_lo);
	  _mm_store_si128((__m128i *)(b + 4), scaled_hi);
	}

	// Fills out[0..out_size-1]: entry i receives the four int16_t values found at
	// in + i * stride, loaded as 64 bits into the low half of the vector (the
	// upper half is zeroed by _mm_loadl_epi64).
	// The load argument had lost both the pointer in its cast and the
	// multiplication in `i * stride`; both are restored.
	static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
	                                                 const int stride,
	                                                 __m128i *const out,
	                                                 const int out_size) {
	  for (int i = 0; i < out_size; ++i) {
	    out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
	  }
	}

	// Loads out_size rows of four int16_t values (64-bit loads into the low half
	// of each vector) and stores them in reverse order: the row at
	// in + i * stride lands in out[out_size - 1 - i].
	// The load argument had lost both the pointer in its cast and the
	// multiplication in `i * stride`; both are restored.
	static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
	                                                      const int stride,
	                                                      __m128i *const out,
	                                                      const int out_size) {
	  for (int i = 0; i < out_size; ++i) {
	    out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
	  }
	}

	// Fills out[0..out_size-1]: entry r receives the eight int16_t values found
	// at in + r * stride, via load_16bit_to_16bit (an unaligned 128-bit load).
	static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
	                                              __m128i *out, int out_size) {
	  int row = 0;
	  while (row < out_size) {
	    const int16_t *const src = in + row * stride;
	    out[row] = load_16bit_to_16bit(src);
	    ++row;
	  }
	}

	// Loads out_size rows of eight int16_t values with load_16bit_to_16bit and
	// stores them in reverse order: the row at in + r * stride lands in
	// out[out_size - 1 - r].
	static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
	                                                   int stride, __m128i *out,
	                                                   int out_size) {
	  for (int row = 0; row < out_size; ++row) {
	    const int dst = out_size - 1 - row;
	    out[dst] = load_16bit_to_16bit(in + row * stride);
	  }
	}

	// Fills out[0..out_size-1]: entry r receives the eight int32_t values found
	// at in + r * stride, narrowed to 16 bits by load_32bit_to_16bit.
	static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride,
	                                              __m128i *out, int out_size) {
	  int row = 0;
	  while (row < out_size) {
	    const int32_t *const src = in + row * stride;
	    out[row] = load_32bit_to_16bit(src);
	    ++row;
	  }
	}

	// Fills out[0..out_size-1]: entry r receives the four int32_t values found at
	// in + r * stride, narrowed to 16 bits by load_32bit_to_16bit_w4.
	static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride,
	                                                 __m128i *out, int out_size) {
	  int row = 0;
	  while (row < out_size) {
	    const int32_t *const src = in + row * stride;
	    out[row] = load_32bit_to_16bit_w4(src);
	    ++row;
	  }
	}

	// Loads out_size rows of eight int32_t values, narrowed to 16 bits by
	// load_32bit_to_16bit, and stores them in reverse order: the row at
	// in + r * stride lands in out[out_size - 1 - r].
	static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in,
	                                                   int stride, __m128i *out,
	                                                   int out_size) {
	  for (int row = 0; row < out_size; ++row) {
	    const int dst = out_size - 1 - row;
	    out[dst] = load_32bit_to_16bit(in + row * stride);
	  }
	}

	// For each r in [0, out_size): writes the low four lanes of in[r],
	// sign-extended to 32 bits by store_16bit_to_32bit_w4, at out + r * stride.
	static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in,
	                                                  int32_t *const out,
	                                                  const int stride,
	                                                  const int out_size) {
	  int row = 0;
	  while (row < out_size) {
	    int32_t *const dst = out + row * stride;
	    store_16bit_to_32bit_w4(in[row], dst);
	    ++row;
	  }
	}

	// For each r in [0, out_size): writes all eight lanes of in[r],
	// sign-extended to 32 bits by store_16bit_to_32bit, at out + r * stride.
	static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in,
	                                                  int32_t *const out,
	                                                  const int stride,
	                                                  const int out_size) {
	  int row = 0;
	  while (row < out_size) {
	    int32_t *const dst = out + row * stride;
	    store_16bit_to_32bit(in[row], dst);
	    ++row;
	  }
	}

	// For each r in [0, out_size): passes in[r] to store_rect_16bit_to_32bit_w4,
	// which writes four scaled 32-bit values at out + r * stride.
	static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in,
	                                                       int32_t *const out,
	                                                       const int stride,
	                                                       const int out_size) {
	  int row = 0;
	  while (row < out_size) {
	    int32_t *const dst = out + row * stride;
	    store_rect_16bit_to_32bit_w4(in[row], dst);
	    ++row;
	  }
	}

	// For each r in [0, out_size): passes in[r] to store_rect_16bit_to_32bit,
	// which writes eight scaled 32-bit values at out + r * stride.
	static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in,
	                                                       int32_t *const out,
	                                                       const int stride,
	                                                       const int out_size) {
	  int row = 0;
	  while (row < out_size) {
	    int32_t *const dst = out + row * stride;
	    store_rect_16bit_to_32bit(in[row], dst);
	    ++row;
	  }
	}

	// Writes the eight vectors in[0..7] as eight rows of eight 16-bit values;
	// row i goes to out + i * stride using an aligned 128-bit store, so every
	// row address must be 16-byte aligned.
	// The store destination had lost both the pointer in its cast and the
	// multiplication in `i * stride`; both are restored.
	static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in,
	                                                   uint16_t *out,
	                                                   const int stride) {
	  for (int i = 0; i < 8; ++i) {
	    _mm_store_si128((__m128i *)(out + i * stride), in[i]);
	  }
	}

	// Shifts every 16-bit lane of in[0..size-1] in place.
	//   bit > 0: logical left shift by `bit`.
	//   bit < 0: adds 1 << (-bit - 1) with signed saturation, then shifts right
	//            arithmetically by -bit (a rounding right shift).
	//   bit == 0: leaves the buffer untouched.
	static INLINE void round_shift_16bit(__m128i *in, int size, int bit) {
	  if (bit == 0) {
	    return;
	  }

	  if (bit > 0) {
	    for (int idx = 0; idx < size; ++idx) {
	      in[idx] = _mm_slli_epi16(in[idx], bit);
	    }
	    return;
	  }

	  const int shift = -bit;
	  const __m128i half = _mm_set1_epi16(1 << (shift - 1));
	  for (int idx = 0; idx < size; ++idx) {
	    const __m128i biased = _mm_adds_epi16(in[idx], half);
	    in[idx] = _mm_srai_epi16(biased, shift);
	  }
	}

	// Copies in[0..size-1] into `out` in reverse order: in[i] is written to
	// out[size - 1 - i], with i ascending.
	// `in` and `out` had lost their pointer declarators, so the body indexed
	// plain vectors; they are pointers again.
	static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
	  for (int i = 0; i < size; ++i) {
	    out[size - i - 1] = in[i];
	  }
	}

	#if CONFIG_ADST_TUNED
	// Interleaves in0 with in1 and w0 with w1 (low halves, then high halves) and
	// multiply-adds the matching interleaved vectors:
	//   *out0 = madd(unpacklo(in0, in1), unpacklo(w0, w1))
	//   *out1 = madd(unpackhi(in0, in1), unpackhi(w0, w1))
	// Each output holds four 32-bit sums; no rounding or shifting is applied.
	// `out0` / `out1` had lost their pointer declarators while the body still
	// dereferenced them; they are pointers again.
	static INLINE void matrix_coef_mult_sse2(const __m128i w0, const __m128i w1,
	                                         const __m128i in0, const __m128i in1,
	                                         __m128i *out0, __m128i *out1) {
	  __m128i t0 = _mm_unpacklo_epi16(in0, in1);
	  __m128i t1 = _mm_unpackhi_epi16(in0, in1);

	  __m128i v0 = _mm_unpacklo_epi16(w0, w1);
	  __m128i v1 = _mm_unpackhi_epi16(w0, w1);

	  *out0 = _mm_madd_epi16(t0, v0);
	  *out1 = _mm_madd_epi16(t1, v1);
	}
	#endif  // CONFIG_ADST_TUNED

	// Declarations of the av1_lowbd_fwd_txfm2d_<A>x<B>_sse2 entry points; the
	// definitions are not part of this header. Every function shares one
	// signature: a const int16_t input pointer, an int32_t output pointer, an
	// int stride, a TX_TYPE selector and an int bd.
	// `input` and `output` had lost their pointer declarators in this copy
	// (leaving by-value scalars); they are pointers again.
	// NOTE(review): presumably <A>x<B> is the transform block size, `stride` is
	// the input row pitch in samples and `bd` the bit depth -- confirm against
	// the definitions.
	void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
	                                   int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
	                                   int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
	                                    int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
	                                   int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
	                                   int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
	                                    int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
	                                    int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
	                                    int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
	                                    int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
	                                     int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
	                                     int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
	                                    int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
	                                     int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
	                                     int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
	                                     int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
	                                     int stride, TX_TYPE tx_type, int bd);

	// The following sizes are declared only when CONFIG_FLEX_PARTITION is set.
	#if CONFIG_FLEX_PARTITION
	void av1_lowbd_fwd_txfm2d_4x32_sse2(const int16_t *input, int32_t *output,
	                                    int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_32x4_sse2(const int16_t *input, int32_t *output,
	                                    int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_8x64_sse2(const int16_t *input, int32_t *output,
	                                    int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_64x8_sse2(const int16_t *input, int32_t *output,
	                                    int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_4x64_sse2(const int16_t *input, int32_t *output,
	                                    int stride, TX_TYPE tx_type, int bd);

	void av1_lowbd_fwd_txfm2d_64x4_sse2(const int16_t *input, int32_t *output,
	                                    int stride, TX_TYPE tx_type, int bd);
	#endif  // CONFIG_FLEX_PARTITION

	// Pointer to a 1-D transform kernel: reads an array of __m128i from `input`,
	// writes an array of __m128i to `output`, and takes an int8_t `cos_bit`.
	// The `(*name)` function-pointer declarator and the pointer on `input` had
	// been stripped, which turned this into a function type that cannot be used
	// as a struct member; both are restored.
	// NOTE(review): presumably cos_bit is the right-shift amount used by the
	// butterfly helpers in this header -- confirm against the kernels.
	typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output,
	                                  int8_t cos_bit);

	// Pair of 1-D kernels making up a 2-D transform.
	typedef struct {
	  transform_1d_sse2 col, row;  // vertical and horizontal
	} transform_2d_sse2;

	#ifdef __cplusplus
	}
	#endif // __cplusplus
	#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_