/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "./aom_dsp_rtcd.h"
#include "aom_dsp/inv_txfm.h"
#include "aom_dsp/x86/inv_txfm_common_avx2.h"
#include "aom_dsp/x86/txfm_common_avx2.h"

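// The 16x16 inverse transforms below follow the usual two-pass pattern:
// transpose the coefficient tile, run a 1-D IDCT on the rows (i.e. the
// original columns), transpose back, run the 1-D IDCT again on the rows,
// then add the reconstructed residual to dest.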
void aom_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  __m256i in[16];
  load_buffer_16x16(input, in);
  mm256_transpose_16x16(in, in);
  av1_idct16_avx2(in);
  mm256_transpose_16x16(in, in);
  av1_idct16_avx2(in);
  store_buffer_16xN(in, stride, dest, 16);
}

static INLINE void transpose_col_to_row_nz4x4(__m256i *in /*in[4]*/) {
  const __m256i u0 = _mm256_unpacklo_epi16(in[0], in[1]);
  const __m256i u1 = _mm256_unpacklo_epi16(in[2], in[3]);
  const __m256i v0 = _mm256_unpacklo_epi32(u0, u1);
  const __m256i v1 = _mm256_unpackhi_epi32(u0, u1);
  in[0] = _mm256_permute4x64_epi64(v0, 0xA8);
  in[1] = _mm256_permute4x64_epi64(v0, 0xA9);
  in[2] = _mm256_permute4x64_epi64(v1, 0xA8);
  in[3] = _mm256_permute4x64_epi64(v1, 0xA9);
}

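// AVX2 has no 64-bit lane shuffle for integer vectors, so the operands are
// reinterpreted as packed doubles and shuffled with _mm256_shuffle_pd: the
// result interleaves 64-bit elements from x0 and x1, with each imm8 bit
// selecting the low or high 64-bit half within a 128-bit lane.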
#define MM256_SHUFFLE_EPI64(x0, x1, imm8)                        \
  _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(x0), \
                                        _mm256_castsi256_pd(x1), imm8))
Yi Luof6176ab2017-04-28 15:48:56 -070044
45static INLINE void transpose_col_to_row_nz4x16(__m256i *in /*in[16]*/) {
46 int i;
47 for (i = 0; i < 16; i += 4) {
48 transpose_col_to_row_nz4x4(&in[i]);
49 }
50
51 for (i = 0; i < 4; ++i) {
Yi Luo40f22ef2017-05-08 16:29:39 -070052 in[i] = MM256_SHUFFLE_EPI64(in[i], in[i + 4], 0);
53 in[i + 8] = MM256_SHUFFLE_EPI64(in[i + 8], in[i + 12], 0);
Yi Luof6176ab2017-04-28 15:48:56 -070054 }
55
56 for (i = 0; i < 4; ++i) {
57 in[i] = _mm256_permute2x128_si256(in[i], in[i + 8], 0x20);
58 }
59}
60
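// Throughout this file _mm256_mulhrs_epi16(x, 2 * cospi_N_64) is used as a
// cheap constant multiply: mulhrs computes (x * c + (1 << 14)) >> 15, so with
// c = 2 * cospi_N_64 the result equals ROUND_POWER_OF_TWO(x * cospi_N_64, 14),
// i.e. the usual DCT_CONST_BITS rounding without a widening multiply.
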
// Coefficients 0-7 before the final butterfly
static INLINE void idct16_10_first_half(const __m256i *in, __m256i *out) {
  const __m256i c2p28 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
  const __m256i c2p04 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
  const __m256i v4 = _mm256_mulhrs_epi16(in[2], c2p28);
  const __m256i v7 = _mm256_mulhrs_epi16(in[2], c2p04);

  const __m256i c2p16 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
  const __m256i v0 = _mm256_mulhrs_epi16(in[0], c2p16);
  const __m256i v1 = v0;

  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
  __m256i v5, v6;
  unpack_butter_fly(&v7, &v4, &cospi_p16_m16, &cospi_p16_p16, &v5, &v6);

  out[0] = _mm256_add_epi16(v0, v7);
  out[1] = _mm256_add_epi16(v1, v6);
  out[2] = _mm256_add_epi16(v1, v5);
  out[3] = _mm256_add_epi16(v0, v4);
  out[4] = _mm256_sub_epi16(v0, v4);
  out[5] = _mm256_sub_epi16(v1, v5);
  out[6] = _mm256_sub_epi16(v1, v6);
  out[7] = _mm256_sub_epi16(v0, v7);
}

// Coefficients 8-15 before the final butterfly
static INLINE void idct16_10_second_half(const __m256i *in, __m256i *out) {
  const __m256i c2p30 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
  const __m256i c2p02 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
  const __m256i t0 = _mm256_mulhrs_epi16(in[1], c2p30);
  const __m256i t7 = _mm256_mulhrs_epi16(in[1], c2p02);

  const __m256i c2m26 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
  const __m256i c2p06 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
  const __m256i t3 = _mm256_mulhrs_epi16(in[3], c2m26);
  const __m256i t4 = _mm256_mulhrs_epi16(in[3], c2p06);

  const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
  const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
  const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);

  __m256i t1, t2, t5, t6;
  unpack_butter_fly(&t0, &t7, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
  unpack_butter_fly(&t3, &t4, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);

  out[0] = _mm256_add_epi16(t0, t3);
  out[1] = _mm256_add_epi16(t1, t2);
  out[6] = _mm256_add_epi16(t6, t5);
  out[7] = _mm256_add_epi16(t7, t4);

  const __m256i v2 = _mm256_sub_epi16(t1, t2);
  const __m256i v3 = _mm256_sub_epi16(t0, t3);
  const __m256i v4 = _mm256_sub_epi16(t7, t4);
  const __m256i v5 = _mm256_sub_epi16(t6, t5);
  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
  unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &out[2], &out[5]);
  unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &out[3], &out[4]);
}

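// Final add/sub stage of the IDCT: out[i] = in[i] + in[size - 1 - i] and
// out[size - 1 - i] = in[i] - in[size - 1 - i] for the first size/2 pairs.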
static INLINE void add_sub_butterfly(const __m256i *in, __m256i *out,
                                     int size) {
  int i = 0;
  const int num = size >> 1;
  const int bound = size - 1;
  while (i < num) {
    out[i] = _mm256_add_epi16(in[i], in[bound - i]);
    out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]);
    i++;
  }
}

static INLINE void idct16_10(__m256i *in /*in[16]*/) {
  __m256i out[16];
  idct16_10_first_half(in, out);
  idct16_10_second_half(in, &out[8]);
  add_sub_butterfly(out, in, 16);
}

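// Assumes non-zero coefficients only in the upper-left 4x4 block (at most the
// first 10 in scan order), so only four input rows are loaded and the 4x4/4x16
// transposes are used between the two passes.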
void aom_idct16x16_10_add_avx2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  __m256i in[16];

  load_coeff(input, &in[0]);
  load_coeff(input + 16, &in[1]);
  load_coeff(input + 32, &in[2]);
  load_coeff(input + 48, &in[3]);

  transpose_col_to_row_nz4x4(in);
  idct16_10(in);

  transpose_col_to_row_nz4x16(in);
  idct16_10(in);

  store_buffer_16xN(in, stride, dest, 16);
}

// Note:
// For a 16x16 int16_t matrix, transpose the first 8 columns into the first
// 8 rows. Since only the upper-left 8x8 block is non-zero, the input is the
// first 8 rows (in[8]). After transposing, the 8 row vectors are in in[8].
void transpose_col_to_row_nz8x8(__m256i *in /*in[8]*/) {
  __m256i u0 = _mm256_unpacklo_epi16(in[0], in[1]);
  __m256i u1 = _mm256_unpackhi_epi16(in[0], in[1]);
  __m256i u2 = _mm256_unpacklo_epi16(in[2], in[3]);
  __m256i u3 = _mm256_unpackhi_epi16(in[2], in[3]);

  const __m256i v0 = _mm256_unpacklo_epi32(u0, u2);
  const __m256i v1 = _mm256_unpackhi_epi32(u0, u2);
  const __m256i v2 = _mm256_unpacklo_epi32(u1, u3);
  const __m256i v3 = _mm256_unpackhi_epi32(u1, u3);

  u0 = _mm256_unpacklo_epi16(in[4], in[5]);
  u1 = _mm256_unpackhi_epi16(in[4], in[5]);
  u2 = _mm256_unpacklo_epi16(in[6], in[7]);
  u3 = _mm256_unpackhi_epi16(in[6], in[7]);

  const __m256i v4 = _mm256_unpacklo_epi32(u0, u2);
  const __m256i v5 = _mm256_unpackhi_epi32(u0, u2);
  const __m256i v6 = _mm256_unpacklo_epi32(u1, u3);
  const __m256i v7 = _mm256_unpackhi_epi32(u1, u3);

  in[0] = MM256_SHUFFLE_EPI64(v0, v4, 0);
  in[1] = MM256_SHUFFLE_EPI64(v0, v4, 3);
  in[2] = MM256_SHUFFLE_EPI64(v1, v5, 0);
  in[3] = MM256_SHUFFLE_EPI64(v1, v5, 3);
  in[4] = MM256_SHUFFLE_EPI64(v2, v6, 0);
  in[5] = MM256_SHUFFLE_EPI64(v2, v6, 3);
  in[6] = MM256_SHUFFLE_EPI64(v3, v7, 0);
  in[7] = MM256_SHUFFLE_EPI64(v3, v7, 3);
}

// Note:
// For a 16x16 int16_t matrix, transpose the first 8 columns into the first
// 8 rows. Since only the left 8x16 of the matrix is non-zero, the input is
// all 16 rows (in[16]). After transposing, the 8 row vectors are in in[8];
// everything else is zero.
static INLINE void transpose_col_to_row_nz8x16(__m256i *in /*in[16]*/) {
  transpose_col_to_row_nz8x8(in);
  transpose_col_to_row_nz8x8(&in[8]);

  int i;
  for (i = 0; i < 8; ++i) {
    in[i] = _mm256_permute2x128_si256(in[i], in[i + 8], 0x20);
  }
}

static INLINE void idct16_38_first_half(const __m256i *in, __m256i *out) {
  const __m256i c2p28 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
  const __m256i c2p04 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
  __m256i t4 = _mm256_mulhrs_epi16(in[2], c2p28);
  __m256i t7 = _mm256_mulhrs_epi16(in[2], c2p04);

  const __m256i c2m20 = pair256_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
  const __m256i c2p12 = pair256_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
  __m256i t5 = _mm256_mulhrs_epi16(in[6], c2m20);
  __m256i t6 = _mm256_mulhrs_epi16(in[6], c2p12);

  const __m256i c2p16 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
  const __m256i c2p24 = pair256_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
  const __m256i c2p08 = pair256_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
  const __m256i u0 = _mm256_mulhrs_epi16(in[0], c2p16);
  const __m256i u1 = _mm256_mulhrs_epi16(in[0], c2p16);
  const __m256i u2 = _mm256_mulhrs_epi16(in[4], c2p24);
  const __m256i u3 = _mm256_mulhrs_epi16(in[4], c2p08);

  const __m256i u4 = _mm256_add_epi16(t4, t5);
  const __m256i u5 = _mm256_sub_epi16(t4, t5);
  const __m256i u6 = _mm256_sub_epi16(t7, t6);
  const __m256i u7 = _mm256_add_epi16(t7, t6);

  const __m256i t0 = _mm256_add_epi16(u0, u3);
  const __m256i t1 = _mm256_add_epi16(u1, u2);
  const __m256i t2 = _mm256_sub_epi16(u1, u2);
  const __m256i t3 = _mm256_sub_epi16(u0, u3);

  t4 = u4;
  t7 = u7;

  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
  unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6);

  out[0] = _mm256_add_epi16(t0, t7);
  out[1] = _mm256_add_epi16(t1, t6);
  out[2] = _mm256_add_epi16(t2, t5);
  out[3] = _mm256_add_epi16(t3, t4);
  out[4] = _mm256_sub_epi16(t3, t4);
  out[5] = _mm256_sub_epi16(t2, t5);
  out[6] = _mm256_sub_epi16(t1, t6);
  out[7] = _mm256_sub_epi16(t0, t7);
}

static INLINE void idct16_38_second_half(const __m256i *in, __m256i *out) {
  const __m256i c2p30 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
  const __m256i c2p02 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
  __m256i t0 = _mm256_mulhrs_epi16(in[1], c2p30);
  __m256i t7 = _mm256_mulhrs_epi16(in[1], c2p02);

  const __m256i c2m18 = pair256_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
  const __m256i c2p14 = pair256_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
  __m256i t1 = _mm256_mulhrs_epi16(in[7], c2m18);
  __m256i t6 = _mm256_mulhrs_epi16(in[7], c2p14);

  const __m256i c2p22 = pair256_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
  const __m256i c2p10 = pair256_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
  __m256i t2 = _mm256_mulhrs_epi16(in[5], c2p22);
  __m256i t5 = _mm256_mulhrs_epi16(in[5], c2p10);

  const __m256i c2m26 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
  const __m256i c2p06 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
  __m256i t3 = _mm256_mulhrs_epi16(in[3], c2m26);
  __m256i t4 = _mm256_mulhrs_epi16(in[3], c2p06);

  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
  v0 = _mm256_add_epi16(t0, t1);
  v1 = _mm256_sub_epi16(t0, t1);
  v2 = _mm256_sub_epi16(t3, t2);
  v3 = _mm256_add_epi16(t2, t3);
  v4 = _mm256_add_epi16(t4, t5);
  v5 = _mm256_sub_epi16(t4, t5);
  v6 = _mm256_sub_epi16(t7, t6);
  v7 = _mm256_add_epi16(t6, t7);

  t0 = v0;
  t7 = v7;
  t3 = v3;
  t4 = v4;
  const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
  const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
  const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
  unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
  unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);

  v0 = _mm256_add_epi16(t0, t3);
  v1 = _mm256_add_epi16(t1, t2);
  v2 = _mm256_sub_epi16(t1, t2);
  v3 = _mm256_sub_epi16(t0, t3);
  v4 = _mm256_sub_epi16(t7, t4);
  v5 = _mm256_sub_epi16(t6, t5);
  v6 = _mm256_add_epi16(t6, t5);
  v7 = _mm256_add_epi16(t7, t4);

  // stage 6, (8-15)
  out[0] = v0;
  out[1] = v1;
  out[6] = v6;
  out[7] = v7;
  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
  unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &out[2], &out[5]);
  unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &out[3], &out[4]);
}

static INLINE void idct16_38(__m256i *in /*in[16]*/) {
  __m256i out[16];
  idct16_38_first_half(in, out);
  idct16_38_second_half(in, &out[8]);
  add_sub_butterfly(out, in, 16);
}

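// Assumes non-zero coefficients only in the upper-left 8x8 block, so eight
// input rows are loaded and the 8x8/8x16 transposes are used between the two
// passes.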
void aom_idct16x16_38_add_avx2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  __m256i in[16];

  int i;
  for (i = 0; i < 8; ++i) {
    load_coeff(input + (i << 4), &in[i]);
  }

  transpose_col_to_row_nz8x8(in);
  idct16_38(in);

  transpose_col_to_row_nz8x16(in);
  idct16_38(in);

  store_buffer_16xN(in, stride, dest, 16);
}

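// DC-only output value: scale the DC coefficient by cospi_16_64 once per
// transform dimension (with dct_const_round_shift), then apply the final
// rounding shift by IDCT_ROUNDING_POS to get the constant residual that is
// added to every output pixel.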
static INLINE int calculate_dc(const tran_low_t *input) {
  int dc = (int)dct_const_round_shift(input[0] * cospi_16_64);
  dc = (int)dct_const_round_shift(dc * cospi_16_64);
  dc = ROUND_POWER_OF_TWO(dc, IDCT_ROUNDING_POS);
  return dc;
}

void aom_idct16x16_1_add_avx2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  const int dc = calculate_dc(input);
  if (dc == 0) return;

  const __m256i dc_value = _mm256_set1_epi16(dc);

  int i;
  for (i = 0; i < 16; ++i) {
    recon_and_store(&dc_value, dest);
    dest += stride;
  }
}

// -----------------------------------------------------------------------------
// 32x32 partial IDCT

void aom_idct32x32_1_add_avx2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  const int dc = calculate_dc(input);
  if (dc == 0) return;

  const __m256i dc_value = _mm256_set1_epi16(dc);

  int i;
  for (i = 0; i < 32; ++i) {
    recon_and_store(&dc_value, dest);
    recon_and_store(&dc_value, dest + 16);
    dest += stride;
  }
}

static void load_buffer_32x16(const tran_low_t *input, __m256i *in /*in[32]*/) {
  int i;
  for (i = 0; i < 16; ++i) {
    load_coeff(input, &in[i]);
    load_coeff(input + 16, &in[i + 16]);
    input += 32;
  }
}

// Note:
// We extend the SSSE3 implementation to AVX2. Instead of operating on
// __m128i, we operate on coefficients held in __m256i registers, so each
// instruction processes twice as many coefficients.
#define BUTTERFLY_PAIR(x0, x1, co0, co1)            \
  do {                                              \
    tmp0 = _mm256_madd_epi16(x0, co0);              \
    tmp1 = _mm256_madd_epi16(x1, co0);              \
    tmp2 = _mm256_madd_epi16(x0, co1);              \
    tmp3 = _mm256_madd_epi16(x1, co1);              \
    tmp0 = _mm256_add_epi32(tmp0, rounding);        \
    tmp1 = _mm256_add_epi32(tmp1, rounding);        \
    tmp2 = _mm256_add_epi32(tmp2, rounding);        \
    tmp3 = _mm256_add_epi32(tmp3, rounding);        \
    tmp0 = _mm256_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm256_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm256_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm256_srai_epi32(tmp3, DCT_CONST_BITS); \
  } while (0)

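// butterfly() computes a rotation on each pair of 16-bit elements:
//   y0[i] = (x0[i] * a0 + x1[i] * b0 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS
//   y1[i] = (x0[i] * a1 + x1[i] * b1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS
// where c0 = pair256_set_epi16(a0, b0) and c1 = pair256_set_epi16(a1, b1).
// butterfly_self() is the in-place variant.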
static INLINE void butterfly(const __m256i *x0, const __m256i *x1,
                             const __m256i *c0, const __m256i *c1, __m256i *y0,
                             __m256i *y1) {
  __m256i tmp0, tmp1, tmp2, tmp3, u0, u1;
  const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);

  u0 = _mm256_unpacklo_epi16(*x0, *x1);
  u1 = _mm256_unpackhi_epi16(*x0, *x1);
  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
  *y0 = _mm256_packs_epi32(tmp0, tmp1);
  *y1 = _mm256_packs_epi32(tmp2, tmp3);
}

static INLINE void butterfly_self(__m256i *x0, __m256i *x1, const __m256i *c0,
                                  const __m256i *c1) {
  __m256i tmp0, tmp1, tmp2, tmp3, u0, u1;
  const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);

  u0 = _mm256_unpacklo_epi16(*x0, *x1);
  u1 = _mm256_unpackhi_epi16(*x0, *x1);
  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
  *x0 = _mm256_packs_epi32(tmp0, tmp1);
  *x1 = _mm256_packs_epi32(tmp2, tmp3);
}

// For each 16x32 block __m256i in[32],
// input with indices 2, 6, 10, 14, 18, 22, 26, 30,
// output pixels: 8-15 in __m256i in[32]
static void idct32_full_16x32_quarter_2(const __m256i *in /*in[32]*/,
                                        __m256i *out /*out[16]*/) {
  __m256i u8, u9, u10, u11, u12, u13, u14, u15;  // stp2_
  __m256i v8, v9, v10, v11, v12, v13, v14, v15;  // stp1_

  {
    const __m256i stg2_0 = pair256_set_epi16(cospi_30_64, -cospi_2_64);
    const __m256i stg2_1 = pair256_set_epi16(cospi_2_64, cospi_30_64);
    const __m256i stg2_2 = pair256_set_epi16(cospi_14_64, -cospi_18_64);
    const __m256i stg2_3 = pair256_set_epi16(cospi_18_64, cospi_14_64);
    butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
    butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
  }

  v8 = _mm256_add_epi16(u8, u9);
  v9 = _mm256_sub_epi16(u8, u9);
  v14 = _mm256_sub_epi16(u15, u14);
  v15 = _mm256_add_epi16(u15, u14);

  {
    const __m256i stg2_4 = pair256_set_epi16(cospi_22_64, -cospi_10_64);
    const __m256i stg2_5 = pair256_set_epi16(cospi_10_64, cospi_22_64);
    const __m256i stg2_6 = pair256_set_epi16(cospi_6_64, -cospi_26_64);
    const __m256i stg2_7 = pair256_set_epi16(cospi_26_64, cospi_6_64);
    butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
    butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
  }

  v10 = _mm256_sub_epi16(u11, u10);
  v11 = _mm256_add_epi16(u11, u10);
  v12 = _mm256_add_epi16(u12, u13);
  v13 = _mm256_sub_epi16(u12, u13);

  {
    const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
    const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
    const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
  }

  out[0] = _mm256_add_epi16(v8, v11);
  out[1] = _mm256_add_epi16(v9, v10);
  out[6] = _mm256_add_epi16(v14, v13);
  out[7] = _mm256_add_epi16(v15, v12);

  out[2] = _mm256_sub_epi16(v9, v10);
  out[3] = _mm256_sub_epi16(v8, v11);
  out[4] = _mm256_sub_epi16(v15, v12);
  out[5] = _mm256_sub_epi16(v14, v13);

  {
    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
    const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
  }
}

// For each 8x32 block __m256i in[32],
// input with indices 0, 4, 8, 12, 16, 20, 24, 28,
// output pixels: 0-7 in __m256i in[32]
static void idct32_full_16x32_quarter_1(const __m256i *in /*in[32]*/,
                                        __m256i *out /*out[8]*/) {
  __m256i u0, u1, u2, u3, u4, u5, u6, u7;  // stp1_
  __m256i v0, v1, v2, v3, v4, v5, v6, v7;  // stp2_

  {
    const __m256i stg3_0 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
    const __m256i stg3_1 = pair256_set_epi16(cospi_4_64, cospi_28_64);
    const __m256i stg3_2 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
    const __m256i stg3_3 = pair256_set_epi16(cospi_20_64, cospi_12_64);
    butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
    butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
  }

  v4 = _mm256_add_epi16(u4, u5);
  v5 = _mm256_sub_epi16(u4, u5);
  v6 = _mm256_sub_epi16(u7, u6);
  v7 = _mm256_add_epi16(u7, u6);

  {
    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
    const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
    const __m256i stg4_2 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
    const __m256i stg4_3 = pair256_set_epi16(cospi_8_64, cospi_24_64);
    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);

    butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
    butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
  }

  v0 = _mm256_add_epi16(u0, u3);
  v1 = _mm256_add_epi16(u1, u2);
  v2 = _mm256_sub_epi16(u1, u2);
  v3 = _mm256_sub_epi16(u0, u3);

  out[0] = _mm256_add_epi16(v0, v7);
  out[1] = _mm256_add_epi16(v1, v6);
  out[2] = _mm256_add_epi16(v2, v5);
  out[3] = _mm256_add_epi16(v3, v4);
  out[4] = _mm256_sub_epi16(v3, v4);
  out[5] = _mm256_sub_epi16(v2, v5);
  out[6] = _mm256_sub_epi16(v1, v6);
  out[7] = _mm256_sub_epi16(v0, v7);
}

// For each 8x32 block __m256i in[32],
// input with odd indices 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27,
// 29, 31,
// output pixels: 16-23, 24-31 in __m256i in[32].
// To avoid hiding an offset of 16 inside this function, the outputs are
// written to indices 0-15 of out[16].
static void idct32_full_16x32_quarter_3_4(const __m256i *in /*in[32]*/,
                                          __m256i *out /*out[16]*/) {
  __m256i v16, v17, v18, v19, v20, v21, v22, v23;
  __m256i v24, v25, v26, v27, v28, v29, v30, v31;
  __m256i u16, u17, u18, u19, u20, u21, u22, u23;
  __m256i u24, u25, u26, u27, u28, u29, u30, u31;

  {
    const __m256i stg1_0 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
    const __m256i stg1_1 = pair256_set_epi16(cospi_1_64, cospi_31_64);
    const __m256i stg1_2 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
    const __m256i stg1_3 = pair256_set_epi16(cospi_17_64, cospi_15_64);
    const __m256i stg1_4 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
    const __m256i stg1_5 = pair256_set_epi16(cospi_9_64, cospi_23_64);
    const __m256i stg1_6 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
    const __m256i stg1_7 = pair256_set_epi16(cospi_25_64, cospi_7_64);
    const __m256i stg1_8 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
    const __m256i stg1_9 = pair256_set_epi16(cospi_5_64, cospi_27_64);
    const __m256i stg1_10 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
    const __m256i stg1_11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
    const __m256i stg1_12 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
    const __m256i stg1_13 = pair256_set_epi16(cospi_13_64, cospi_19_64);
    const __m256i stg1_14 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
    const __m256i stg1_15 = pair256_set_epi16(cospi_29_64, cospi_3_64);
    butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
    butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
    butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
    butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);

    butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
    butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);

    butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
    butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
  }

  v16 = _mm256_add_epi16(u16, u17);
  v17 = _mm256_sub_epi16(u16, u17);
  v18 = _mm256_sub_epi16(u19, u18);
  v19 = _mm256_add_epi16(u19, u18);

  v20 = _mm256_add_epi16(u20, u21);
  v21 = _mm256_sub_epi16(u20, u21);
  v22 = _mm256_sub_epi16(u23, u22);
  v23 = _mm256_add_epi16(u23, u22);

  v24 = _mm256_add_epi16(u24, u25);
  v25 = _mm256_sub_epi16(u24, u25);
  v26 = _mm256_sub_epi16(u27, u26);
  v27 = _mm256_add_epi16(u27, u26);

  v28 = _mm256_add_epi16(u28, u29);
  v29 = _mm256_sub_epi16(u28, u29);
  v30 = _mm256_sub_epi16(u31, u30);
  v31 = _mm256_add_epi16(u31, u30);

  {
    const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
    const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64);
    const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
    const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
    const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64);
    const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
  }

  u16 = _mm256_add_epi16(v16, v19);
  u17 = _mm256_add_epi16(v17, v18);
  u18 = _mm256_sub_epi16(v17, v18);
  u19 = _mm256_sub_epi16(v16, v19);
  u20 = _mm256_sub_epi16(v23, v20);
  u21 = _mm256_sub_epi16(v22, v21);
  u22 = _mm256_add_epi16(v22, v21);
  u23 = _mm256_add_epi16(v23, v20);

  u24 = _mm256_add_epi16(v24, v27);
  u25 = _mm256_add_epi16(v25, v26);
  u26 = _mm256_sub_epi16(v25, v26);
  u27 = _mm256_sub_epi16(v24, v27);

  u28 = _mm256_sub_epi16(v31, v28);
  u29 = _mm256_sub_epi16(v30, v29);
  u30 = _mm256_add_epi16(v29, v30);
  u31 = _mm256_add_epi16(v28, v31);

  {
    const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
    const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
    const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
  }

  out[0] = _mm256_add_epi16(u16, u23);
  out[1] = _mm256_add_epi16(u17, u22);
  out[2] = _mm256_add_epi16(u18, u21);
  out[3] = _mm256_add_epi16(u19, u20);
  out[4] = _mm256_sub_epi16(u19, u20);
  out[5] = _mm256_sub_epi16(u18, u21);
  out[6] = _mm256_sub_epi16(u17, u22);
  out[7] = _mm256_sub_epi16(u16, u23);

  out[8] = _mm256_sub_epi16(u31, u24);
  out[9] = _mm256_sub_epi16(u30, u25);
  out[10] = _mm256_sub_epi16(u29, u26);
  out[11] = _mm256_sub_epi16(u28, u27);
  out[12] = _mm256_add_epi16(u27, u28);
  out[13] = _mm256_add_epi16(u26, u29);
  out[14] = _mm256_add_epi16(u25, u30);
  out[15] = _mm256_add_epi16(u24, u31);

  {
    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
    const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
    butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
    butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
    butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
    butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
  }
}

static void idct32_full_16x32_quarter_1_2(const __m256i *in /*in[32]*/,
                                          __m256i *out /*out[32]*/) {
  __m256i temp[16];
  idct32_full_16x32_quarter_1(in, temp);
  idct32_full_16x32_quarter_2(in, &temp[8]);
  add_sub_butterfly(temp, out, 16);
}

static void idct32_16x32(const __m256i *in /*in[32]*/,
                         __m256i *out /*out[32]*/) {
  __m256i temp[32];
  idct32_full_16x32_quarter_1_2(in, temp);
  idct32_full_16x32_quarter_3_4(in, &temp[16]);
  add_sub_butterfly(temp, out, 32);
}

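// Full 32x32 inverse transform. The first loop runs the 1-D transform on one
// 16-row half of the input per iteration, accumulating results in col[64];
// the second loop transposes, runs the second 1-D pass, and writes each
// 16-column half of the reconstruction to dest.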
void aom_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
  __m256i col[64], in[32];
  int i;

  for (i = 0; i < 2; ++i) {
    load_buffer_32x16(input, in);
    input += 32 << 4;

    mm256_transpose_16x16(in, in);
    mm256_transpose_16x16(&in[16], &in[16]);
    idct32_16x32(in, col + (i << 5));
  }

  for (i = 0; i < 2; ++i) {
    int j = i << 4;
    mm256_transpose_16x16(col + j, in);
    mm256_transpose_16x16(col + j + 32, &in[16]);
    idct32_16x32(in, in);
    store_buffer_16xN(in, stride, dest, 32);
    dest += 16;
  }
}

// Group the coefficient calculation into smaller functions to keep register
// pressure down and prevent spilling to the stack:
// quarter_1: 0-7
// quarter_2: 8-15
// quarter_3_4: 16-23, 24-31
static void idct32_16x32_135_quarter_1(const __m256i *in /*in[16]*/,
                                       __m256i *out /*out[8]*/) {
  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  __m256i v0, v1, v2, v3, v4, v5, v6, v7;

  {
    const __m256i stk4_0 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
    const __m256i stk4_2 = pair256_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
    const __m256i stk4_3 = pair256_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
    u0 = _mm256_mulhrs_epi16(in[0], stk4_0);
    u2 = _mm256_mulhrs_epi16(in[8], stk4_2);
    u3 = _mm256_mulhrs_epi16(in[8], stk4_3);
    u1 = u0;
  }

  v0 = _mm256_add_epi16(u0, u3);
  v1 = _mm256_add_epi16(u1, u2);
  v2 = _mm256_sub_epi16(u1, u2);
  v3 = _mm256_sub_epi16(u0, u3);

  {
    const __m256i stk3_0 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
    const __m256i stk3_1 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
    const __m256i stk3_2 =
        pair256_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
    const __m256i stk3_3 = pair256_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
    u4 = _mm256_mulhrs_epi16(in[4], stk3_0);
    u7 = _mm256_mulhrs_epi16(in[4], stk3_1);
    u5 = _mm256_mulhrs_epi16(in[12], stk3_2);
    u6 = _mm256_mulhrs_epi16(in[12], stk3_3);
  }

  v4 = _mm256_add_epi16(u4, u5);
  v5 = _mm256_sub_epi16(u4, u5);
  v6 = _mm256_sub_epi16(u7, u6);
  v7 = _mm256_add_epi16(u7, u6);

  {
    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
    const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
  }

  out[0] = _mm256_add_epi16(v0, v7);
  out[1] = _mm256_add_epi16(v1, v6);
  out[2] = _mm256_add_epi16(v2, v5);
  out[3] = _mm256_add_epi16(v3, v4);
  out[4] = _mm256_sub_epi16(v3, v4);
  out[5] = _mm256_sub_epi16(v2, v5);
  out[6] = _mm256_sub_epi16(v1, v6);
  out[7] = _mm256_sub_epi16(v0, v7);
}

static void idct32_16x32_135_quarter_2(const __m256i *in /*in[16]*/,
                                       __m256i *out /*out[8]*/) {
  __m256i u8, u9, u10, u11, u12, u13, u14, u15;
  __m256i v8, v9, v10, v11, v12, v13, v14, v15;

  {
    const __m256i stk2_0 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
    const __m256i stk2_1 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
    const __m256i stk2_2 =
        pair256_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
    const __m256i stk2_3 = pair256_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
    const __m256i stk2_4 = pair256_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
    const __m256i stk2_5 = pair256_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
    const __m256i stk2_6 =
        pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
    const __m256i stk2_7 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
    u8 = _mm256_mulhrs_epi16(in[2], stk2_0);
    u15 = _mm256_mulhrs_epi16(in[2], stk2_1);
    u9 = _mm256_mulhrs_epi16(in[14], stk2_2);
    u14 = _mm256_mulhrs_epi16(in[14], stk2_3);
    u10 = _mm256_mulhrs_epi16(in[10], stk2_4);
    u13 = _mm256_mulhrs_epi16(in[10], stk2_5);
    u11 = _mm256_mulhrs_epi16(in[6], stk2_6);
    u12 = _mm256_mulhrs_epi16(in[6], stk2_7);
  }

  v8 = _mm256_add_epi16(u8, u9);
  v9 = _mm256_sub_epi16(u8, u9);
  v10 = _mm256_sub_epi16(u11, u10);
  v11 = _mm256_add_epi16(u11, u10);
  v12 = _mm256_add_epi16(u12, u13);
  v13 = _mm256_sub_epi16(u12, u13);
  v14 = _mm256_sub_epi16(u15, u14);
  v15 = _mm256_add_epi16(u15, u14);

  {
    const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
    const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
    const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
  }

  out[0] = _mm256_add_epi16(v8, v11);
  out[1] = _mm256_add_epi16(v9, v10);
  out[2] = _mm256_sub_epi16(v9, v10);
  out[3] = _mm256_sub_epi16(v8, v11);
  out[4] = _mm256_sub_epi16(v15, v12);
  out[5] = _mm256_sub_epi16(v14, v13);
  out[6] = _mm256_add_epi16(v14, v13);
  out[7] = _mm256_add_epi16(v15, v12);

  {
    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
    const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
  }
}

// For an 8x32 block: the 8 even-indexed inputs of in[16] produce the first
// 16 outputs of out[32].
static void idct32_16x32_quarter_1_2(const __m256i *in /*in[16]*/,
                                     __m256i *out /*out[32]*/) {
  __m256i temp[16];
  idct32_16x32_135_quarter_1(in, temp);
  idct32_16x32_135_quarter_2(in, &temp[8]);
  add_sub_butterfly(temp, out, 16);
}

// For an 8x32 block: the 8 odd-indexed inputs of in[16] produce the second
// 16 outputs of out[32].
static void idct32_16x32_quarter_3_4(const __m256i *in /*in[16]*/,
                                     __m256i *out /*out[32]*/) {
  __m256i v16, v17, v18, v19, v20, v21, v22, v23;
  __m256i v24, v25, v26, v27, v28, v29, v30, v31;
  __m256i u16, u17, u18, u19, u20, u21, u22, u23;
  __m256i u24, u25, u26, u27, u28, u29, u30, u31;

  {
    const __m256i stk1_0 = pair256_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
    const __m256i stk1_1 = pair256_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
    const __m256i stk1_2 =
        pair256_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64);
    const __m256i stk1_3 = pair256_set_epi16(2 * cospi_15_64, 2 * cospi_15_64);

    const __m256i stk1_4 = pair256_set_epi16(2 * cospi_23_64, 2 * cospi_23_64);
    const __m256i stk1_5 = pair256_set_epi16(2 * cospi_9_64, 2 * cospi_9_64);
    const __m256i stk1_6 =
        pair256_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
    const __m256i stk1_7 = pair256_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
    const __m256i stk1_8 = pair256_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
    const __m256i stk1_9 = pair256_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
    const __m256i stk1_10 =
        pair256_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64);
    const __m256i stk1_11 = pair256_set_epi16(2 * cospi_11_64, 2 * cospi_11_64);

    const __m256i stk1_12 = pair256_set_epi16(2 * cospi_19_64, 2 * cospi_19_64);
    const __m256i stk1_13 = pair256_set_epi16(2 * cospi_13_64, 2 * cospi_13_64);
    const __m256i stk1_14 =
        pair256_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
    const __m256i stk1_15 = pair256_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
    u16 = _mm256_mulhrs_epi16(in[1], stk1_0);
    u31 = _mm256_mulhrs_epi16(in[1], stk1_1);
    u17 = _mm256_mulhrs_epi16(in[15], stk1_2);
    u30 = _mm256_mulhrs_epi16(in[15], stk1_3);

    u18 = _mm256_mulhrs_epi16(in[9], stk1_4);
    u29 = _mm256_mulhrs_epi16(in[9], stk1_5);
    u19 = _mm256_mulhrs_epi16(in[7], stk1_6);
    u28 = _mm256_mulhrs_epi16(in[7], stk1_7);

    u20 = _mm256_mulhrs_epi16(in[5], stk1_8);
    u27 = _mm256_mulhrs_epi16(in[5], stk1_9);
    u21 = _mm256_mulhrs_epi16(in[11], stk1_10);
    u26 = _mm256_mulhrs_epi16(in[11], stk1_11);

    u22 = _mm256_mulhrs_epi16(in[13], stk1_12);
    u25 = _mm256_mulhrs_epi16(in[13], stk1_13);
    u23 = _mm256_mulhrs_epi16(in[3], stk1_14);
    u24 = _mm256_mulhrs_epi16(in[3], stk1_15);
  }

  v16 = _mm256_add_epi16(u16, u17);
  v17 = _mm256_sub_epi16(u16, u17);
  v18 = _mm256_sub_epi16(u19, u18);
  v19 = _mm256_add_epi16(u19, u18);

  v20 = _mm256_add_epi16(u20, u21);
  v21 = _mm256_sub_epi16(u20, u21);
  v22 = _mm256_sub_epi16(u23, u22);
  v23 = _mm256_add_epi16(u23, u22);

  v24 = _mm256_add_epi16(u24, u25);
  v25 = _mm256_sub_epi16(u24, u25);
  v26 = _mm256_sub_epi16(u27, u26);
  v27 = _mm256_add_epi16(u27, u26);

  v28 = _mm256_add_epi16(u28, u29);
  v29 = _mm256_sub_epi16(u28, u29);
  v30 = _mm256_sub_epi16(u31, u30);
  v31 = _mm256_add_epi16(u31, u30);

  {
    const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
    const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64);
    const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
    const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
    const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64);
    const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);

    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
  }

  u16 = _mm256_add_epi16(v16, v19);
  u17 = _mm256_add_epi16(v17, v18);
  u18 = _mm256_sub_epi16(v17, v18);
  u19 = _mm256_sub_epi16(v16, v19);
  u20 = _mm256_sub_epi16(v23, v20);
  u21 = _mm256_sub_epi16(v22, v21);
  u22 = _mm256_add_epi16(v22, v21);
  u23 = _mm256_add_epi16(v23, v20);

  u24 = _mm256_add_epi16(v24, v27);
  u25 = _mm256_add_epi16(v25, v26);
  u26 = _mm256_sub_epi16(v25, v26);
  u27 = _mm256_sub_epi16(v24, v27);
  u28 = _mm256_sub_epi16(v31, v28);
  u29 = _mm256_sub_epi16(v30, v29);
  u30 = _mm256_add_epi16(v29, v30);
  u31 = _mm256_add_epi16(v28, v31);

  {
    const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
    const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
    const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
  }

  out[0] = _mm256_add_epi16(u16, u23);
  out[1] = _mm256_add_epi16(u17, u22);
  out[2] = _mm256_add_epi16(u18, u21);
  out[3] = _mm256_add_epi16(u19, u20);
  v20 = _mm256_sub_epi16(u19, u20);
  v21 = _mm256_sub_epi16(u18, u21);
  v22 = _mm256_sub_epi16(u17, u22);
  v23 = _mm256_sub_epi16(u16, u23);

  v24 = _mm256_sub_epi16(u31, u24);
  v25 = _mm256_sub_epi16(u30, u25);
  v26 = _mm256_sub_epi16(u29, u26);
  v27 = _mm256_sub_epi16(u28, u27);
  out[12] = _mm256_add_epi16(u27, u28);
  out[13] = _mm256_add_epi16(u26, u29);
  out[14] = _mm256_add_epi16(u25, u30);
  out[15] = _mm256_add_epi16(u24, u31);

  {
    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
    const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
    butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]);
    butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]);
    butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]);
    butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]);
  }
}

// 16x16 block input __m256i in[32], output 16x32 __m256i in[32]
static void idct32_16x32_135(__m256i *in /*in[32]*/) {
  __m256i out[32];
  idct32_16x32_quarter_1_2(in, out);
  idct32_16x32_quarter_3_4(in, &out[16]);
  add_sub_butterfly(out, in, 32);
}

static INLINE void load_buffer_from_32x32(const tran_low_t *coeff, __m256i *in,
                                          int size) {
  int i = 0;
  while (i < size) {
    load_coeff(coeff + (i << 5), &in[i]);
    i += 1;
  }
}

static INLINE void zero_buffer(__m256i *in, int num) {
  int i;
  for (i = 0; i < num; ++i) {
    in[i] = _mm256_setzero_si256();
  }
}

// Only upper-left 16x16 has non-zero coeff
void aom_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  __m256i in[32];
  zero_buffer(in, 32);
  load_buffer_from_32x32(input, in, 16);
  mm256_transpose_16x16(in, in);
  idct32_16x32_135(in);

  __m256i out[32];
  mm256_transpose_16x16(in, out);
  idct32_16x32_135(out);
  store_buffer_16xN(out, stride, dest, 32);
  mm256_transpose_16x16(&in[16], in);
  idct32_16x32_135(in);
  store_buffer_16xN(in, stride, dest + 16, 32);
}

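// The _34 path below assumes non-zero coefficients only in the upper-left
// 8x8 block: first_half consumes the even input rows in[0], in[2], in[4],
// in[6], and second_half consumes the odd rows in[1], in[3], in[5], in[7].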
static void idct32_34_first_half(const __m256i *in, __m256i *stp1) {
  const __m256i stk2_0 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
  const __m256i stk2_1 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
  const __m256i stk2_6 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
  const __m256i stk2_7 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);

  const __m256i stk3_0 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
  const __m256i stk3_1 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);

  const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
  const __m256i stk4_0 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
  const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
  const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
  const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
  const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  __m256i x0, x1, x4, x5, x6, x7;
  __m256i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;

  // phase 1

  // 0, 15
  u2 = _mm256_mulhrs_epi16(in[2], stk2_1);  // stp2_15
  u3 = _mm256_mulhrs_epi16(in[6], stk2_7);  // stp2_12
  v15 = _mm256_add_epi16(u2, u3);
  // in[0], in[4]
  x0 = _mm256_mulhrs_epi16(in[0], stk4_0);  // stp1[0]
  x7 = _mm256_mulhrs_epi16(in[4], stk3_1);  // stp1[7]
  v0 = _mm256_add_epi16(x0, x7);            // stp2_0
  stp1[0] = _mm256_add_epi16(v0, v15);
  stp1[15] = _mm256_sub_epi16(v0, v15);

  // in[2], in[6]
  u0 = _mm256_mulhrs_epi16(in[2], stk2_0);          // stp2_8
  u1 = _mm256_mulhrs_epi16(in[6], stk2_6);          // stp2_11
  butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5);  // stp2_9, stp2_14
  butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7);  // stp2_10, stp2_13

  v8 = _mm256_add_epi16(u0, u1);
  v9 = _mm256_add_epi16(u4, u6);
  v10 = _mm256_sub_epi16(u4, u6);
  v11 = _mm256_sub_epi16(u0, u1);
  v12 = _mm256_sub_epi16(u2, u3);
  v13 = _mm256_sub_epi16(u5, u7);
  v14 = _mm256_add_epi16(u5, u7);

  butterfly_self(&v10, &v13, &stg6_0, &stg4_0);
  butterfly_self(&v11, &v12, &stg6_0, &stg4_0);

  // 1, 14
  x1 = _mm256_mulhrs_epi16(in[0], stk4_0);  // stp1[1], stk4_1 = stk4_0
  // stp1[2] = stp1[0], stp1[3] = stp1[1]
  x4 = _mm256_mulhrs_epi16(in[4], stk3_0);  // stp1[4]
  butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6);
  v1 = _mm256_add_epi16(x1, x6);  // stp2_1
  v2 = _mm256_add_epi16(x0, x5);  // stp2_2
  stp1[1] = _mm256_add_epi16(v1, v14);
  stp1[14] = _mm256_sub_epi16(v1, v14);

  stp1[2] = _mm256_add_epi16(v2, v13);
  stp1[13] = _mm256_sub_epi16(v2, v13);

  v3 = _mm256_add_epi16(x1, x4);  // stp2_3
  v4 = _mm256_sub_epi16(x1, x4);  // stp2_4

  v5 = _mm256_sub_epi16(x0, x5);  // stp2_5

  v6 = _mm256_sub_epi16(x1, x6);  // stp2_6
  v7 = _mm256_sub_epi16(x0, x7);  // stp2_7
  stp1[3] = _mm256_add_epi16(v3, v12);
  stp1[12] = _mm256_sub_epi16(v3, v12);

  stp1[6] = _mm256_add_epi16(v6, v9);
  stp1[9] = _mm256_sub_epi16(v6, v9);

  stp1[7] = _mm256_add_epi16(v7, v8);
  stp1[8] = _mm256_sub_epi16(v7, v8);

  stp1[4] = _mm256_add_epi16(v4, v11);
  stp1[11] = _mm256_sub_epi16(v4, v11);

  stp1[5] = _mm256_add_epi16(v5, v10);
  stp1[10] = _mm256_sub_epi16(v5, v10);
}

static void idct32_34_second_half(const __m256i *in, __m256i *stp1) {
  const __m256i stk1_0 = pair256_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
  const __m256i stk1_1 = pair256_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
  const __m256i stk1_6 = pair256_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
  const __m256i stk1_7 = pair256_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
  const __m256i stk1_8 = pair256_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
  const __m256i stk1_9 = pair256_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
  const __m256i stk1_14 = pair256_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
  const __m256i stk1_15 = pair256_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
  const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
  const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64);
  const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
  const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64);
  const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
  const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
  const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
  const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
  __m256i v16, v17, v18, v19, v20, v21, v22, v23;
  __m256i v24, v25, v26, v27, v28, v29, v30, v31;
  __m256i u16, u17, u18, u19, u20, u21, u22, u23;
  __m256i u24, u25, u26, u27, u28, u29, u30, u31;

  v16 = _mm256_mulhrs_epi16(in[1], stk1_0);
  v31 = _mm256_mulhrs_epi16(in[1], stk1_1);

  v19 = _mm256_mulhrs_epi16(in[7], stk1_6);
  v28 = _mm256_mulhrs_epi16(in[7], stk1_7);

  v20 = _mm256_mulhrs_epi16(in[5], stk1_8);
  v27 = _mm256_mulhrs_epi16(in[5], stk1_9);

  v23 = _mm256_mulhrs_epi16(in[3], stk1_14);
  v24 = _mm256_mulhrs_epi16(in[3], stk1_15);

  butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30);
  butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29);
  butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26);
  butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25);

  u16 = _mm256_add_epi16(v16, v19);
  u17 = _mm256_add_epi16(v17, v18);
  u18 = _mm256_sub_epi16(v17, v18);
  u19 = _mm256_sub_epi16(v16, v19);
  u20 = _mm256_sub_epi16(v23, v20);
  u21 = _mm256_sub_epi16(v22, v21);
  u22 = _mm256_add_epi16(v22, v21);
  u23 = _mm256_add_epi16(v23, v20);
  u24 = _mm256_add_epi16(v24, v27);
  u27 = _mm256_sub_epi16(v24, v27);
  u25 = _mm256_add_epi16(v25, v26);
  u26 = _mm256_sub_epi16(v25, v26);
  u28 = _mm256_sub_epi16(v31, v28);
  u31 = _mm256_add_epi16(v28, v31);
  u29 = _mm256_sub_epi16(v30, v29);
  u30 = _mm256_add_epi16(v29, v30);

  butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
  butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
  butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
  butterfly_self(&u21, &u26, &stg4_6, &stg4_4);

  stp1[0] = _mm256_add_epi16(u16, u23);
  stp1[7] = _mm256_sub_epi16(u16, u23);

  stp1[1] = _mm256_add_epi16(u17, u22);
  stp1[6] = _mm256_sub_epi16(u17, u22);

  stp1[2] = _mm256_add_epi16(u18, u21);
  stp1[5] = _mm256_sub_epi16(u18, u21);

  stp1[3] = _mm256_add_epi16(u19, u20);
  stp1[4] = _mm256_sub_epi16(u19, u20);

  stp1[8] = _mm256_sub_epi16(u31, u24);
  stp1[15] = _mm256_add_epi16(u24, u31);

  stp1[9] = _mm256_sub_epi16(u30, u25);
  stp1[14] = _mm256_add_epi16(u25, u30);

  stp1[10] = _mm256_sub_epi16(u29, u26);
  stp1[13] = _mm256_add_epi16(u26, u29);

  stp1[11] = _mm256_sub_epi16(u28, u27);
  stp1[12] = _mm256_add_epi16(u27, u28);

  butterfly_self(&stp1[4], &stp1[11], &stg6_0, &stg4_0);
  butterfly_self(&stp1[5], &stp1[10], &stg6_0, &stg4_0);
  butterfly_self(&stp1[6], &stp1[9], &stg6_0, &stg4_0);
  butterfly_self(&stp1[7], &stp1[8], &stg6_0, &stg4_0);
}

// 16x16 block input __m256i in[32], output 16x32 __m256i in[32]
static void idct32_16x32_34(__m256i *in /*in[32]*/) {
  __m256i out[32];
  idct32_34_first_half(in, out);
  idct32_34_second_half(in, &out[16]);
  add_sub_butterfly(out, in, 32);
}

// Only upper-left 8x8 has non-zero coeff
void aom_idct32x32_34_add_avx2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  __m256i in[32];
  zero_buffer(in, 32);
  load_buffer_from_32x32(input, in, 8);
  mm256_transpose_16x16(in, in);
  idct32_16x32_34(in);

  __m256i out[32];
  mm256_transpose_16x16(in, out);
  idct32_16x32_34(out);
  store_buffer_16xN(out, stride, dest, 32);
  mm256_transpose_16x16(&in[16], in);
  idct32_16x32_34(in);
  store_buffer_16xN(in, stride, dest + 16, 32);
}