Yi Luo | e8e8cd8 | 2016-09-21 10:45:01 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
| 3 | * |
| 4 | * This source code is subject to the terms of the BSD 2 Clause License and |
| 5 | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 6 | * was not distributed with this source code in the LICENSE file, you can |
| 7 | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 8 | * Media Patent License 1.0 was not distributed with this source code in the |
| 9 | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| 10 | */ |
| 11 | |
| 12 | #ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H |
| 13 | #define AOM_DSP_X86_TXFM_COMMON_AVX2_H |
| 14 | |
| 15 | #include <immintrin.h> |
| 16 | |
Yi Luo | 7317200 | 2016-10-28 10:52:04 -0700 | [diff] [blame] | 17 | #include "aom_dsp/txfm_common.h" |
Yi Luo | 6ae0054 | 2017-08-03 17:08:20 -0700 | [diff] [blame] | 18 | #include "aom_dsp/x86/common_avx2.h" |
Yi Luo | 7317200 | 2016-10-28 10:52:04 -0700 | [diff] [blame] | 19 | |
// Builds a __m256i whose sixteen 16-bit elements alternate a, b, a, b, ...
// with `a` in the lowest (even-indexed) element of every pair and `b` in the
// element above it.
//
// Both arguments are truncated to 16 bits, packed into one 32-bit word
// (a in the low half, b in the high half) and broadcast to all eight 32-bit
// elements. Each argument is expanded exactly once, so expressions with side
// effects are evaluated a single time.
#define pair256_set_epi16(a, b)                              \
  _mm256_set1_epi32((int)((uint32_t)(uint16_t)(int16_t)(a) | \
                          ((uint32_t)(uint16_t)(int16_t)(b) << 16)))
| 25 | |
// Builds a __m256i whose eight 32-bit elements alternate a, b, a, b, ...
// with `a` in the lowest element. _mm256_set_epi32 takes its arguments from
// the highest element down, hence the (b, a) ordering below.
// NOTE: each argument appears four times in the expansion, so avoid passing
// expressions with side effects.
#define pair256_set_epi32(a, b)        \
  _mm256_set_epi32((int)(b), (int)(a), \
                   (int)(b), (int)(a), \
                   (int)(b), (int)(a), \
                   (int)(b), (int)(a))
| 29 | |
Yi Luo | 7317200 | 2016-10-28 10:52:04 -0700 | [diff] [blame] | 30 | static INLINE void mm256_reverse_epi16(__m256i *u) { |
| 31 | const __m256i control = _mm256_set_epi16( |
| 32 | 0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100, |
| 33 | 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E); |
| 34 | __m256i v = _mm256_shuffle_epi8(*u, control); |
| 35 | *u = _mm256_permute2x128_si256(v, v, 1); |
| 36 | } |
| 37 | |
Yi Luo | aaa65f2 | 2017-05-16 09:24:18 -0700 | [diff] [blame] | 38 | static INLINE __m256i butter_fly(const __m256i *a0, const __m256i *a1, |
| 39 | const __m256i *cospi) { |
Yi Luo | 7317200 | 2016-10-28 10:52:04 -0700 | [diff] [blame] | 40 | const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); |
Yi Luo | aaa65f2 | 2017-05-16 09:24:18 -0700 | [diff] [blame] | 41 | __m256i y0 = _mm256_madd_epi16(*a0, *cospi); |
| 42 | __m256i y1 = _mm256_madd_epi16(*a1, *cospi); |
Yi Luo | 7317200 | 2016-10-28 10:52:04 -0700 | [diff] [blame] | 43 | |
| 44 | y0 = _mm256_add_epi32(y0, dct_rounding); |
| 45 | y1 = _mm256_add_epi32(y1, dct_rounding); |
| 46 | y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS); |
| 47 | y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS); |
| 48 | |
| 49 | return _mm256_packs_epi32(y0, y1); |
| 50 | } |
| 51 | |
| 52 | static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) { |
| 53 | const __m256i zero = _mm256_setzero_si256(); |
| 54 | const __m256i sqrt2_epi16 = _mm256_set1_epi16(c); |
| 55 | const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); |
| 56 | __m256i u0, u1; |
| 57 | int i = 0; |
| 58 | |
| 59 | while (i < 16) { |
| 60 | in[i] = _mm256_slli_epi16(in[i], 1); |
| 61 | |
| 62 | u0 = _mm256_unpacklo_epi16(zero, in[i]); |
| 63 | u1 = _mm256_unpackhi_epi16(zero, in[i]); |
| 64 | |
| 65 | u0 = _mm256_madd_epi16(u0, sqrt2_epi16); |
| 66 | u1 = _mm256_madd_epi16(u1, sqrt2_epi16); |
| 67 | |
| 68 | u0 = _mm256_add_epi32(u0, dct_const_rounding); |
| 69 | u1 = _mm256_add_epi32(u1, dct_const_rounding); |
| 70 | |
| 71 | u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS); |
| 72 | u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS); |
| 73 | in[i] = _mm256_packs_epi32(u0, u1); |
| 74 | i++; |
| 75 | } |
| 76 | } |
| 77 | |
Yi Luo | e8e8cd8 | 2016-09-21 10:45:01 -0700 | [diff] [blame] | 78 | #endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H |