blob: 1a8fed71061cf28a9c2c987c6333a28a14559c23 [file] [log] [blame]
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
12#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H
13#define AOM_DSP_X86_TXFM_COMMON_AVX2_H
14
15#include <immintrin.h>
16
Yi Luo73172002016-10-28 10:52:04 -070017#include "aom_dsp/txfm_common.h"
Yi Luo6ae00542017-08-03 17:08:20 -070018#include "aom_dsp/x86/common_avx2.h"
Yi Luo73172002016-10-28 10:52:04 -070019
// Builds a 256-bit vector holding eight copies of the 16-bit pair (a, b):
// a occupies every even-indexed element (starting at element 0) and b every
// odd-indexed element. Both arguments are cast to int16_t. Each argument is
// expanded eight times, so pass only side-effect-free expressions.
#define pair256_set_epi16(a, b)                                            \
  _mm256_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(a), (int16_t)(b), \
                    (int16_t)(a), (int16_t)(b), (int16_t)(a), (int16_t)(b), \
                    (int16_t)(a), (int16_t)(b), (int16_t)(a), (int16_t)(b), \
                    (int16_t)(a), (int16_t)(b), (int16_t)(a), (int16_t)(b))
25
// Builds a 256-bit vector holding four copies of the 32-bit pair (a, b):
// a occupies every even-indexed element (starting at element 0) and b every
// odd-indexed element. Both arguments are cast to int. Each argument is
// expanded four times, so pass only side-effect-free expressions.
#define pair256_set_epi32(a, b)                                     \
  _mm256_setr_epi32((int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
                    (int)(b), (int)(a), (int)(b))
29
Yi Luo73172002016-10-28 10:52:04 -070030static INLINE void mm256_reverse_epi16(__m256i *u) {
31 const __m256i control = _mm256_set_epi16(
32 0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100,
33 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E);
34 __m256i v = _mm256_shuffle_epi8(*u, control);
35 *u = _mm256_permute2x128_si256(v, v, 1);
36}
37
Yi Luoaaa65f22017-05-16 09:24:18 -070038static INLINE __m256i butter_fly(const __m256i *a0, const __m256i *a1,
39 const __m256i *cospi) {
Yi Luo73172002016-10-28 10:52:04 -070040 const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
Yi Luoaaa65f22017-05-16 09:24:18 -070041 __m256i y0 = _mm256_madd_epi16(*a0, *cospi);
42 __m256i y1 = _mm256_madd_epi16(*a1, *cospi);
Yi Luo73172002016-10-28 10:52:04 -070043
44 y0 = _mm256_add_epi32(y0, dct_rounding);
45 y1 = _mm256_add_epi32(y1, dct_rounding);
46 y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS);
47 y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS);
48
49 return _mm256_packs_epi32(y0, y1);
50}
51
52static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) {
53 const __m256i zero = _mm256_setzero_si256();
54 const __m256i sqrt2_epi16 = _mm256_set1_epi16(c);
55 const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
56 __m256i u0, u1;
57 int i = 0;
58
59 while (i < 16) {
60 in[i] = _mm256_slli_epi16(in[i], 1);
61
62 u0 = _mm256_unpacklo_epi16(zero, in[i]);
63 u1 = _mm256_unpackhi_epi16(zero, in[i]);
64
65 u0 = _mm256_madd_epi16(u0, sqrt2_epi16);
66 u1 = _mm256_madd_epi16(u1, sqrt2_epi16);
67
68 u0 = _mm256_add_epi32(u0, dct_const_rounding);
69 u1 = _mm256_add_epi32(u1, dct_const_rounding);
70
71 u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
72 u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
73 in[i] = _mm256_packs_epi32(u0, u1);
74 i++;
75 }
76}
77
Yi Luoe8e8cd82016-09-21 10:45:01 -070078#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H