blob: 6c424d968457fab62637a079c6ae69a8d9988550 [file] [log] [blame]
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
#include <assert.h>
#include <stdlib.h>

#include "av1/common/av1_inv_txfm1d.h"
/*
 * range_check(stage, input, buf, size, bit)
 *
 * When CONFIG_COEFFICIENT_RANGE_CHECKING is enabled, walks the first `size`
 * entries of `buf` and, for each one, compares get_max_bit(abs(value)) + 1
 * against `bit`. On the first entry that exceeds the limit it prints the
 * stage, the offending node and all `size` entries of `input`, then fires
 * assert(0). When the option is disabled the macro only marks its arguments
 * as used.
 *
 * Both variants expand to a single `do { ... } while (0)` statement so the
 * macro is safe in unbraced if/else bodies; the call site supplies the
 * trailing semicolon. `stage`, `size` and `bit` are evaluated exactly once.
 */
#if CONFIG_COEFFICIENT_RANGE_CHECKING
#define range_check(stage, input, buf, size, bit)                           \
  do {                                                                      \
    const int rc_stage = (int)(stage);                                      \
    const int rc_size = (int)(size);                                        \
    const int rc_limit = (int)(bit);                                        \
    int rc_i, rc_j;                                                         \
    for (rc_i = 0; rc_i < rc_size; ++rc_i) {                                \
      int rc_bits = get_max_bit(abs((buf)[rc_i])) + 1;                      \
      if (rc_bits > rc_limit) {                                             \
        printf("======== %s %d overflow ========\n", __FILE__, __LINE__);   \
        printf("stage: %d node: %d\n", rc_stage, rc_i);                     \
        printf("bit: %d buf_bit: %d buf[i]: %d\n", rc_limit, rc_bits,       \
               (int)(buf)[rc_i]);                                           \
        printf("input:\n");                                                 \
        for (rc_j = 0; rc_j < rc_size; rc_j++) {                            \
          printf("%d,", (int)(input)[rc_j]);                                \
        }                                                                   \
        printf("\n");                                                       \
        assert(0);                                                          \
      }                                                                     \
    }                                                                       \
  } while (0)
#else
#define range_check(stage, input, buf, size, bit) \
  do {                                            \
    (void)(stage);                                \
    (void)(input);                                \
    (void)(buf);                                  \
    (void)(size);                                 \
    (void)(bit);                                  \
  } while (0)
#endif
43
Angie Chiang792519b2016-10-18 12:24:20 -070044// TODO(angiebird): Make 1-d txfm functions static
Yaowu Xuf883b422016-08-30 14:01:10 -070045void av1_idct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
46 const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -070047 const int32_t size = 4;
48 const int32_t *cospi;
49
50 int32_t stage = 0;
51 int32_t *bf0, *bf1;
52 int32_t step[4];
53
54 // stage 0;
55 range_check(stage, input, input, size, stage_range[stage]);
56
57 // stage 1;
58 stage++;
59 bf1 = output;
60 bf1[0] = input[0];
61 bf1[1] = input[2];
62 bf1[2] = input[1];
63 bf1[3] = input[3];
64 range_check(stage, input, bf1, size, stage_range[stage]);
65
66 // stage 2
67 stage++;
68 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
69 bf0 = output;
70 bf1 = step;
71 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
72 bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
73 bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
74 bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
75 range_check(stage, input, bf1, size, stage_range[stage]);
76
77 // stage 3
78 stage++;
79 bf0 = step;
80 bf1 = output;
81 bf1[0] = bf0[0] + bf0[3];
82 bf1[1] = bf0[1] + bf0[2];
83 bf1[2] = bf0[1] - bf0[2];
84 bf1[3] = bf0[0] - bf0[3];
85 range_check(stage, input, bf1, size, stage_range[stage]);
86}
87
Yaowu Xuf883b422016-08-30 14:01:10 -070088void av1_idct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
89 const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -070090 const int32_t size = 8;
91 const int32_t *cospi;
92
93 int32_t stage = 0;
94 int32_t *bf0, *bf1;
95 int32_t step[8];
96
97 // stage 0;
98 range_check(stage, input, input, size, stage_range[stage]);
99
100 // stage 1;
101 stage++;
102 bf1 = output;
103 bf1[0] = input[0];
104 bf1[1] = input[4];
105 bf1[2] = input[2];
106 bf1[3] = input[6];
107 bf1[4] = input[1];
108 bf1[5] = input[5];
109 bf1[6] = input[3];
110 bf1[7] = input[7];
111 range_check(stage, input, bf1, size, stage_range[stage]);
112
113 // stage 2
114 stage++;
115 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
116 bf0 = output;
117 bf1 = step;
118 bf1[0] = bf0[0];
119 bf1[1] = bf0[1];
120 bf1[2] = bf0[2];
121 bf1[3] = bf0[3];
122 bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
123 bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
124 bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
125 bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
126 range_check(stage, input, bf1, size, stage_range[stage]);
127
128 // stage 3
129 stage++;
130 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
131 bf0 = step;
132 bf1 = output;
133 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
134 bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
135 bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
136 bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
137 bf1[4] = bf0[4] + bf0[5];
138 bf1[5] = bf0[4] - bf0[5];
139 bf1[6] = -bf0[6] + bf0[7];
140 bf1[7] = bf0[6] + bf0[7];
141 range_check(stage, input, bf1, size, stage_range[stage]);
142
143 // stage 4
144 stage++;
145 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
146 bf0 = output;
147 bf1 = step;
148 bf1[0] = bf0[0] + bf0[3];
149 bf1[1] = bf0[1] + bf0[2];
150 bf1[2] = bf0[1] - bf0[2];
151 bf1[3] = bf0[0] - bf0[3];
152 bf1[4] = bf0[4];
153 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
154 bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
155 bf1[7] = bf0[7];
156 range_check(stage, input, bf1, size, stage_range[stage]);
157
158 // stage 5
159 stage++;
160 bf0 = step;
161 bf1 = output;
162 bf1[0] = bf0[0] + bf0[7];
163 bf1[1] = bf0[1] + bf0[6];
164 bf1[2] = bf0[2] + bf0[5];
165 bf1[3] = bf0[3] + bf0[4];
166 bf1[4] = bf0[3] - bf0[4];
167 bf1[5] = bf0[2] - bf0[5];
168 bf1[6] = bf0[1] - bf0[6];
169 bf1[7] = bf0[0] - bf0[7];
170 range_check(stage, input, bf1, size, stage_range[stage]);
171}
172
Yaowu Xuf883b422016-08-30 14:01:10 -0700173void av1_idct16_new(const int32_t *input, int32_t *output,
174 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700175 const int32_t size = 16;
176 const int32_t *cospi;
177
178 int32_t stage = 0;
179 int32_t *bf0, *bf1;
180 int32_t step[16];
181
182 // stage 0;
183 range_check(stage, input, input, size, stage_range[stage]);
184
185 // stage 1;
186 stage++;
187 bf1 = output;
188 bf1[0] = input[0];
189 bf1[1] = input[8];
190 bf1[2] = input[4];
191 bf1[3] = input[12];
192 bf1[4] = input[2];
193 bf1[5] = input[10];
194 bf1[6] = input[6];
195 bf1[7] = input[14];
196 bf1[8] = input[1];
197 bf1[9] = input[9];
198 bf1[10] = input[5];
199 bf1[11] = input[13];
200 bf1[12] = input[3];
201 bf1[13] = input[11];
202 bf1[14] = input[7];
203 bf1[15] = input[15];
204 range_check(stage, input, bf1, size, stage_range[stage]);
205
206 // stage 2
207 stage++;
208 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
209 bf0 = output;
210 bf1 = step;
211 bf1[0] = bf0[0];
212 bf1[1] = bf0[1];
213 bf1[2] = bf0[2];
214 bf1[3] = bf0[3];
215 bf1[4] = bf0[4];
216 bf1[5] = bf0[5];
217 bf1[6] = bf0[6];
218 bf1[7] = bf0[7];
219 bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
220 bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
221 bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
222 bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
223 bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
224 bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
225 bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
226 bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
227 range_check(stage, input, bf1, size, stage_range[stage]);
228
229 // stage 3
230 stage++;
231 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
232 bf0 = step;
233 bf1 = output;
234 bf1[0] = bf0[0];
235 bf1[1] = bf0[1];
236 bf1[2] = bf0[2];
237 bf1[3] = bf0[3];
238 bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
239 bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
240 bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
241 bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
242 bf1[8] = bf0[8] + bf0[9];
243 bf1[9] = bf0[8] - bf0[9];
244 bf1[10] = -bf0[10] + bf0[11];
245 bf1[11] = bf0[10] + bf0[11];
246 bf1[12] = bf0[12] + bf0[13];
247 bf1[13] = bf0[12] - bf0[13];
248 bf1[14] = -bf0[14] + bf0[15];
249 bf1[15] = bf0[14] + bf0[15];
250 range_check(stage, input, bf1, size, stage_range[stage]);
251
252 // stage 4
253 stage++;
254 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
255 bf0 = output;
256 bf1 = step;
257 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
258 bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
259 bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
260 bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
261 bf1[4] = bf0[4] + bf0[5];
262 bf1[5] = bf0[4] - bf0[5];
263 bf1[6] = -bf0[6] + bf0[7];
264 bf1[7] = bf0[6] + bf0[7];
265 bf1[8] = bf0[8];
266 bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
267 bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
268 bf1[11] = bf0[11];
269 bf1[12] = bf0[12];
270 bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
271 bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
272 bf1[15] = bf0[15];
273 range_check(stage, input, bf1, size, stage_range[stage]);
274
275 // stage 5
276 stage++;
277 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
278 bf0 = step;
279 bf1 = output;
280 bf1[0] = bf0[0] + bf0[3];
281 bf1[1] = bf0[1] + bf0[2];
282 bf1[2] = bf0[1] - bf0[2];
283 bf1[3] = bf0[0] - bf0[3];
284 bf1[4] = bf0[4];
285 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
286 bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
287 bf1[7] = bf0[7];
288 bf1[8] = bf0[8] + bf0[11];
289 bf1[9] = bf0[9] + bf0[10];
290 bf1[10] = bf0[9] - bf0[10];
291 bf1[11] = bf0[8] - bf0[11];
292 bf1[12] = -bf0[12] + bf0[15];
293 bf1[13] = -bf0[13] + bf0[14];
294 bf1[14] = bf0[13] + bf0[14];
295 bf1[15] = bf0[12] + bf0[15];
296 range_check(stage, input, bf1, size, stage_range[stage]);
297
298 // stage 6
299 stage++;
300 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
301 bf0 = output;
302 bf1 = step;
303 bf1[0] = bf0[0] + bf0[7];
304 bf1[1] = bf0[1] + bf0[6];
305 bf1[2] = bf0[2] + bf0[5];
306 bf1[3] = bf0[3] + bf0[4];
307 bf1[4] = bf0[3] - bf0[4];
308 bf1[5] = bf0[2] - bf0[5];
309 bf1[6] = bf0[1] - bf0[6];
310 bf1[7] = bf0[0] - bf0[7];
311 bf1[8] = bf0[8];
312 bf1[9] = bf0[9];
313 bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
314 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
315 bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
316 bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
317 bf1[14] = bf0[14];
318 bf1[15] = bf0[15];
319 range_check(stage, input, bf1, size, stage_range[stage]);
320
321 // stage 7
322 stage++;
323 bf0 = step;
324 bf1 = output;
325 bf1[0] = bf0[0] + bf0[15];
326 bf1[1] = bf0[1] + bf0[14];
327 bf1[2] = bf0[2] + bf0[13];
328 bf1[3] = bf0[3] + bf0[12];
329 bf1[4] = bf0[4] + bf0[11];
330 bf1[5] = bf0[5] + bf0[10];
331 bf1[6] = bf0[6] + bf0[9];
332 bf1[7] = bf0[7] + bf0[8];
333 bf1[8] = bf0[7] - bf0[8];
334 bf1[9] = bf0[6] - bf0[9];
335 bf1[10] = bf0[5] - bf0[10];
336 bf1[11] = bf0[4] - bf0[11];
337 bf1[12] = bf0[3] - bf0[12];
338 bf1[13] = bf0[2] - bf0[13];
339 bf1[14] = bf0[1] - bf0[14];
340 bf1[15] = bf0[0] - bf0[15];
341 range_check(stage, input, bf1, size, stage_range[stage]);
342}
343
Yaowu Xuf883b422016-08-30 14:01:10 -0700344void av1_idct32_new(const int32_t *input, int32_t *output,
345 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700346 const int32_t size = 32;
347 const int32_t *cospi;
348
349 int32_t stage = 0;
350 int32_t *bf0, *bf1;
351 int32_t step[32];
352
353 // stage 0;
354 range_check(stage, input, input, size, stage_range[stage]);
355
356 // stage 1;
357 stage++;
358 bf1 = output;
359 bf1[0] = input[0];
360 bf1[1] = input[16];
361 bf1[2] = input[8];
362 bf1[3] = input[24];
363 bf1[4] = input[4];
364 bf1[5] = input[20];
365 bf1[6] = input[12];
366 bf1[7] = input[28];
367 bf1[8] = input[2];
368 bf1[9] = input[18];
369 bf1[10] = input[10];
370 bf1[11] = input[26];
371 bf1[12] = input[6];
372 bf1[13] = input[22];
373 bf1[14] = input[14];
374 bf1[15] = input[30];
375 bf1[16] = input[1];
376 bf1[17] = input[17];
377 bf1[18] = input[9];
378 bf1[19] = input[25];
379 bf1[20] = input[5];
380 bf1[21] = input[21];
381 bf1[22] = input[13];
382 bf1[23] = input[29];
383 bf1[24] = input[3];
384 bf1[25] = input[19];
385 bf1[26] = input[11];
386 bf1[27] = input[27];
387 bf1[28] = input[7];
388 bf1[29] = input[23];
389 bf1[30] = input[15];
390 bf1[31] = input[31];
391 range_check(stage, input, bf1, size, stage_range[stage]);
392
393 // stage 2
394 stage++;
395 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
396 bf0 = output;
397 bf1 = step;
398 bf1[0] = bf0[0];
399 bf1[1] = bf0[1];
400 bf1[2] = bf0[2];
401 bf1[3] = bf0[3];
402 bf1[4] = bf0[4];
403 bf1[5] = bf0[5];
404 bf1[6] = bf0[6];
405 bf1[7] = bf0[7];
406 bf1[8] = bf0[8];
407 bf1[9] = bf0[9];
408 bf1[10] = bf0[10];
409 bf1[11] = bf0[11];
410 bf1[12] = bf0[12];
411 bf1[13] = bf0[13];
412 bf1[14] = bf0[14];
413 bf1[15] = bf0[15];
414 bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit[stage]);
415 bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit[stage]);
416 bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit[stage]);
417 bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit[stage]);
418 bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit[stage]);
419 bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit[stage]);
420 bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit[stage]);
421 bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit[stage]);
422 bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit[stage]);
423 bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit[stage]);
424 bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit[stage]);
425 bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit[stage]);
426 bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit[stage]);
427 bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit[stage]);
428 bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit[stage]);
429 bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit[stage]);
430 range_check(stage, input, bf1, size, stage_range[stage]);
431
432 // stage 3
433 stage++;
434 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
435 bf0 = step;
436 bf1 = output;
437 bf1[0] = bf0[0];
438 bf1[1] = bf0[1];
439 bf1[2] = bf0[2];
440 bf1[3] = bf0[3];
441 bf1[4] = bf0[4];
442 bf1[5] = bf0[5];
443 bf1[6] = bf0[6];
444 bf1[7] = bf0[7];
445 bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
446 bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
447 bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
448 bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
449 bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
450 bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
451 bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
452 bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
453 bf1[16] = bf0[16] + bf0[17];
454 bf1[17] = bf0[16] - bf0[17];
455 bf1[18] = -bf0[18] + bf0[19];
456 bf1[19] = bf0[18] + bf0[19];
457 bf1[20] = bf0[20] + bf0[21];
458 bf1[21] = bf0[20] - bf0[21];
459 bf1[22] = -bf0[22] + bf0[23];
460 bf1[23] = bf0[22] + bf0[23];
461 bf1[24] = bf0[24] + bf0[25];
462 bf1[25] = bf0[24] - bf0[25];
463 bf1[26] = -bf0[26] + bf0[27];
464 bf1[27] = bf0[26] + bf0[27];
465 bf1[28] = bf0[28] + bf0[29];
466 bf1[29] = bf0[28] - bf0[29];
467 bf1[30] = -bf0[30] + bf0[31];
468 bf1[31] = bf0[30] + bf0[31];
469 range_check(stage, input, bf1, size, stage_range[stage]);
470
471 // stage 4
472 stage++;
473 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
474 bf0 = output;
475 bf1 = step;
476 bf1[0] = bf0[0];
477 bf1[1] = bf0[1];
478 bf1[2] = bf0[2];
479 bf1[3] = bf0[3];
480 bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
481 bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
482 bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
483 bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
484 bf1[8] = bf0[8] + bf0[9];
485 bf1[9] = bf0[8] - bf0[9];
486 bf1[10] = -bf0[10] + bf0[11];
487 bf1[11] = bf0[10] + bf0[11];
488 bf1[12] = bf0[12] + bf0[13];
489 bf1[13] = bf0[12] - bf0[13];
490 bf1[14] = -bf0[14] + bf0[15];
491 bf1[15] = bf0[14] + bf0[15];
492 bf1[16] = bf0[16];
493 bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
494 bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
495 bf1[19] = bf0[19];
496 bf1[20] = bf0[20];
497 bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
498 bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
499 bf1[23] = bf0[23];
500 bf1[24] = bf0[24];
501 bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit[stage]);
502 bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit[stage]);
503 bf1[27] = bf0[27];
504 bf1[28] = bf0[28];
505 bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit[stage]);
506 bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit[stage]);
507 bf1[31] = bf0[31];
508 range_check(stage, input, bf1, size, stage_range[stage]);
509
510 // stage 5
511 stage++;
512 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
513 bf0 = step;
514 bf1 = output;
515 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
516 bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
517 bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
518 bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
519 bf1[4] = bf0[4] + bf0[5];
520 bf1[5] = bf0[4] - bf0[5];
521 bf1[6] = -bf0[6] + bf0[7];
522 bf1[7] = bf0[6] + bf0[7];
523 bf1[8] = bf0[8];
524 bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
525 bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
526 bf1[11] = bf0[11];
527 bf1[12] = bf0[12];
528 bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
529 bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
530 bf1[15] = bf0[15];
531 bf1[16] = bf0[16] + bf0[19];
532 bf1[17] = bf0[17] + bf0[18];
533 bf1[18] = bf0[17] - bf0[18];
534 bf1[19] = bf0[16] - bf0[19];
535 bf1[20] = -bf0[20] + bf0[23];
536 bf1[21] = -bf0[21] + bf0[22];
537 bf1[22] = bf0[21] + bf0[22];
538 bf1[23] = bf0[20] + bf0[23];
539 bf1[24] = bf0[24] + bf0[27];
540 bf1[25] = bf0[25] + bf0[26];
541 bf1[26] = bf0[25] - bf0[26];
542 bf1[27] = bf0[24] - bf0[27];
543 bf1[28] = -bf0[28] + bf0[31];
544 bf1[29] = -bf0[29] + bf0[30];
545 bf1[30] = bf0[29] + bf0[30];
546 bf1[31] = bf0[28] + bf0[31];
547 range_check(stage, input, bf1, size, stage_range[stage]);
548
549 // stage 6
550 stage++;
551 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
552 bf0 = output;
553 bf1 = step;
554 bf1[0] = bf0[0] + bf0[3];
555 bf1[1] = bf0[1] + bf0[2];
556 bf1[2] = bf0[1] - bf0[2];
557 bf1[3] = bf0[0] - bf0[3];
558 bf1[4] = bf0[4];
559 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
560 bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
561 bf1[7] = bf0[7];
562 bf1[8] = bf0[8] + bf0[11];
563 bf1[9] = bf0[9] + bf0[10];
564 bf1[10] = bf0[9] - bf0[10];
565 bf1[11] = bf0[8] - bf0[11];
566 bf1[12] = -bf0[12] + bf0[15];
567 bf1[13] = -bf0[13] + bf0[14];
568 bf1[14] = bf0[13] + bf0[14];
569 bf1[15] = bf0[12] + bf0[15];
570 bf1[16] = bf0[16];
571 bf1[17] = bf0[17];
572 bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
573 bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
574 bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
575 bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
576 bf1[22] = bf0[22];
577 bf1[23] = bf0[23];
578 bf1[24] = bf0[24];
579 bf1[25] = bf0[25];
580 bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit[stage]);
581 bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit[stage]);
582 bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit[stage]);
583 bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit[stage]);
584 bf1[30] = bf0[30];
585 bf1[31] = bf0[31];
586 range_check(stage, input, bf1, size, stage_range[stage]);
587
588 // stage 7
589 stage++;
590 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
591 bf0 = step;
592 bf1 = output;
593 bf1[0] = bf0[0] + bf0[7];
594 bf1[1] = bf0[1] + bf0[6];
595 bf1[2] = bf0[2] + bf0[5];
596 bf1[3] = bf0[3] + bf0[4];
597 bf1[4] = bf0[3] - bf0[4];
598 bf1[5] = bf0[2] - bf0[5];
599 bf1[6] = bf0[1] - bf0[6];
600 bf1[7] = bf0[0] - bf0[7];
601 bf1[8] = bf0[8];
602 bf1[9] = bf0[9];
603 bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
604 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
605 bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
606 bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
607 bf1[14] = bf0[14];
608 bf1[15] = bf0[15];
609 bf1[16] = bf0[16] + bf0[23];
610 bf1[17] = bf0[17] + bf0[22];
611 bf1[18] = bf0[18] + bf0[21];
612 bf1[19] = bf0[19] + bf0[20];
613 bf1[20] = bf0[19] - bf0[20];
614 bf1[21] = bf0[18] - bf0[21];
615 bf1[22] = bf0[17] - bf0[22];
616 bf1[23] = bf0[16] - bf0[23];
617 bf1[24] = -bf0[24] + bf0[31];
618 bf1[25] = -bf0[25] + bf0[30];
619 bf1[26] = -bf0[26] + bf0[29];
620 bf1[27] = -bf0[27] + bf0[28];
621 bf1[28] = bf0[27] + bf0[28];
622 bf1[29] = bf0[26] + bf0[29];
623 bf1[30] = bf0[25] + bf0[30];
624 bf1[31] = bf0[24] + bf0[31];
625 range_check(stage, input, bf1, size, stage_range[stage]);
626
627 // stage 8
628 stage++;
629 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
630 bf0 = output;
631 bf1 = step;
632 bf1[0] = bf0[0] + bf0[15];
633 bf1[1] = bf0[1] + bf0[14];
634 bf1[2] = bf0[2] + bf0[13];
635 bf1[3] = bf0[3] + bf0[12];
636 bf1[4] = bf0[4] + bf0[11];
637 bf1[5] = bf0[5] + bf0[10];
638 bf1[6] = bf0[6] + bf0[9];
639 bf1[7] = bf0[7] + bf0[8];
640 bf1[8] = bf0[7] - bf0[8];
641 bf1[9] = bf0[6] - bf0[9];
642 bf1[10] = bf0[5] - bf0[10];
643 bf1[11] = bf0[4] - bf0[11];
644 bf1[12] = bf0[3] - bf0[12];
645 bf1[13] = bf0[2] - bf0[13];
646 bf1[14] = bf0[1] - bf0[14];
647 bf1[15] = bf0[0] - bf0[15];
648 bf1[16] = bf0[16];
649 bf1[17] = bf0[17];
650 bf1[18] = bf0[18];
651 bf1[19] = bf0[19];
652 bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
653 bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
654 bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
655 bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
656 bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
657 bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
658 bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
659 bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
660 bf1[28] = bf0[28];
661 bf1[29] = bf0[29];
662 bf1[30] = bf0[30];
663 bf1[31] = bf0[31];
664 range_check(stage, input, bf1, size, stage_range[stage]);
665
666 // stage 9
667 stage++;
668 bf0 = step;
669 bf1 = output;
670 bf1[0] = bf0[0] + bf0[31];
671 bf1[1] = bf0[1] + bf0[30];
672 bf1[2] = bf0[2] + bf0[29];
673 bf1[3] = bf0[3] + bf0[28];
674 bf1[4] = bf0[4] + bf0[27];
675 bf1[5] = bf0[5] + bf0[26];
676 bf1[6] = bf0[6] + bf0[25];
677 bf1[7] = bf0[7] + bf0[24];
678 bf1[8] = bf0[8] + bf0[23];
679 bf1[9] = bf0[9] + bf0[22];
680 bf1[10] = bf0[10] + bf0[21];
681 bf1[11] = bf0[11] + bf0[20];
682 bf1[12] = bf0[12] + bf0[19];
683 bf1[13] = bf0[13] + bf0[18];
684 bf1[14] = bf0[14] + bf0[17];
685 bf1[15] = bf0[15] + bf0[16];
686 bf1[16] = bf0[15] - bf0[16];
687 bf1[17] = bf0[14] - bf0[17];
688 bf1[18] = bf0[13] - bf0[18];
689 bf1[19] = bf0[12] - bf0[19];
690 bf1[20] = bf0[11] - bf0[20];
691 bf1[21] = bf0[10] - bf0[21];
692 bf1[22] = bf0[9] - bf0[22];
693 bf1[23] = bf0[8] - bf0[23];
694 bf1[24] = bf0[7] - bf0[24];
695 bf1[25] = bf0[6] - bf0[25];
696 bf1[26] = bf0[5] - bf0[26];
697 bf1[27] = bf0[4] - bf0[27];
698 bf1[28] = bf0[3] - bf0[28];
699 bf1[29] = bf0[2] - bf0[29];
700 bf1[30] = bf0[1] - bf0[30];
701 bf1[31] = bf0[0] - bf0[31];
702 range_check(stage, input, bf1, size, stage_range[stage]);
703}
704
Yaowu Xuf883b422016-08-30 14:01:10 -0700705void av1_iadst4_new(const int32_t *input, int32_t *output,
706 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700707 const int32_t size = 4;
708 const int32_t *cospi;
709
710 int32_t stage = 0;
711 int32_t *bf0, *bf1;
712 int32_t step[4];
713
714 // stage 0;
715 range_check(stage, input, input, size, stage_range[stage]);
716
717 // stage 1;
718 stage++;
719 bf1 = output;
720 bf1[0] = input[0];
721 bf1[1] = -input[3];
722 bf1[2] = -input[1];
723 bf1[3] = input[2];
724 range_check(stage, input, bf1, size, stage_range[stage]);
725
726 // stage 2
727 stage++;
728 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
729 bf0 = output;
730 bf1 = step;
731 bf1[0] = bf0[0];
732 bf1[1] = bf0[1];
733 bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
734 bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
735 range_check(stage, input, bf1, size, stage_range[stage]);
736
737 // stage 3
738 stage++;
Yaowu Xuc27fc142016-08-22 16:08:15 -0700739 bf0 = step;
740 bf1 = output;
741 bf1[0] = bf0[0] + bf0[2];
742 bf1[1] = bf0[1] + bf0[3];
743 bf1[2] = bf0[0] - bf0[2];
744 bf1[3] = bf0[1] - bf0[3];
745 range_check(stage, input, bf1, size, stage_range[stage]);
746
747 // stage 4
748 stage++;
749 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
750 bf0 = output;
751 bf1 = step;
752 bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]);
753 bf1[1] = half_btf(cospi[56], bf0[0], -cospi[8], bf0[1], cos_bit[stage]);
754 bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]);
755 bf1[3] = half_btf(cospi[24], bf0[2], -cospi[40], bf0[3], cos_bit[stage]);
756 range_check(stage, input, bf1, size, stage_range[stage]);
757
758 // stage 5
759 stage++;
760 bf0 = step;
761 bf1 = output;
762 bf1[0] = bf0[1];
763 bf1[1] = bf0[2];
764 bf1[2] = bf0[3];
765 bf1[3] = bf0[0];
766 range_check(stage, input, bf1, size, stage_range[stage]);
767}
768
Yaowu Xuf883b422016-08-30 14:01:10 -0700769void av1_iadst8_new(const int32_t *input, int32_t *output,
770 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700771 const int32_t size = 8;
772 const int32_t *cospi;
773
774 int32_t stage = 0;
775 int32_t *bf0, *bf1;
776 int32_t step[8];
777
778 // stage 0;
779 range_check(stage, input, input, size, stage_range[stage]);
780
781 // stage 1;
782 stage++;
783 bf1 = output;
784 bf1[0] = input[0];
785 bf1[1] = -input[7];
786 bf1[2] = -input[3];
787 bf1[3] = input[4];
788 bf1[4] = -input[1];
789 bf1[5] = input[6];
790 bf1[6] = input[2];
791 bf1[7] = -input[5];
792 range_check(stage, input, bf1, size, stage_range[stage]);
793
794 // stage 2
795 stage++;
796 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
797 bf0 = output;
798 bf1 = step;
799 bf1[0] = bf0[0];
800 bf1[1] = bf0[1];
801 bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
802 bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
803 bf1[4] = bf0[4];
804 bf1[5] = bf0[5];
805 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
806 bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
807 range_check(stage, input, bf1, size, stage_range[stage]);
808
809 // stage 3
810 stage++;
Yaowu Xuc27fc142016-08-22 16:08:15 -0700811 bf0 = step;
812 bf1 = output;
813 bf1[0] = bf0[0] + bf0[2];
814 bf1[1] = bf0[1] + bf0[3];
815 bf1[2] = bf0[0] - bf0[2];
816 bf1[3] = bf0[1] - bf0[3];
817 bf1[4] = bf0[4] + bf0[6];
818 bf1[5] = bf0[5] + bf0[7];
819 bf1[6] = bf0[4] - bf0[6];
820 bf1[7] = bf0[5] - bf0[7];
821 range_check(stage, input, bf1, size, stage_range[stage]);
822
823 // stage 4
824 stage++;
825 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
826 bf0 = output;
827 bf1 = step;
828 bf1[0] = bf0[0];
829 bf1[1] = bf0[1];
830 bf1[2] = bf0[2];
831 bf1[3] = bf0[3];
832 bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
833 bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
834 bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
835 bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
836 range_check(stage, input, bf1, size, stage_range[stage]);
837
838 // stage 5
839 stage++;
Yaowu Xuc27fc142016-08-22 16:08:15 -0700840 bf0 = step;
841 bf1 = output;
842 bf1[0] = bf0[0] + bf0[4];
843 bf1[1] = bf0[1] + bf0[5];
844 bf1[2] = bf0[2] + bf0[6];
845 bf1[3] = bf0[3] + bf0[7];
846 bf1[4] = bf0[0] - bf0[4];
847 bf1[5] = bf0[1] - bf0[5];
848 bf1[6] = bf0[2] - bf0[6];
849 bf1[7] = bf0[3] - bf0[7];
850 range_check(stage, input, bf1, size, stage_range[stage]);
851
852 // stage 6
853 stage++;
854 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
855 bf0 = output;
856 bf1 = step;
857 bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit[stage]);
858 bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit[stage]);
859 bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit[stage]);
860 bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit[stage]);
861 bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit[stage]);
862 bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit[stage]);
863 bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit[stage]);
864 bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit[stage]);
865 range_check(stage, input, bf1, size, stage_range[stage]);
866
867 // stage 7
868 stage++;
869 bf0 = step;
870 bf1 = output;
871 bf1[0] = bf0[1];
872 bf1[1] = bf0[6];
873 bf1[2] = bf0[3];
874 bf1[3] = bf0[4];
875 bf1[4] = bf0[5];
876 bf1[5] = bf0[2];
877 bf1[6] = bf0[7];
878 bf1[7] = bf0[0];
879 range_check(stage, input, bf1, size, stage_range[stage]);
880}
881
Yaowu Xuf883b422016-08-30 14:01:10 -0700882void av1_iadst16_new(const int32_t *input, int32_t *output,
883 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700884 const int32_t size = 16;
885 const int32_t *cospi;
886
887 int32_t stage = 0;
888 int32_t *bf0, *bf1;
889 int32_t step[16];
890
891 // stage 0;
892 range_check(stage, input, input, size, stage_range[stage]);
893
894 // stage 1;
895 stage++;
896 bf1 = output;
897 bf1[0] = input[0];
898 bf1[1] = -input[15];
899 bf1[2] = -input[7];
900 bf1[3] = input[8];
901 bf1[4] = -input[3];
902 bf1[5] = input[12];
903 bf1[6] = input[4];
904 bf1[7] = -input[11];
905 bf1[8] = -input[1];
906 bf1[9] = input[14];
907 bf1[10] = input[6];
908 bf1[11] = -input[9];
909 bf1[12] = input[2];
910 bf1[13] = -input[13];
911 bf1[14] = -input[5];
912 bf1[15] = input[10];
913 range_check(stage, input, bf1, size, stage_range[stage]);
914
915 // stage 2
916 stage++;
917 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
918 bf0 = output;
919 bf1 = step;
920 bf1[0] = bf0[0];
921 bf1[1] = bf0[1];
922 bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
923 bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
924 bf1[4] = bf0[4];
925 bf1[5] = bf0[5];
926 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
927 bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
928 bf1[8] = bf0[8];
929 bf1[9] = bf0[9];
930 bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
931 bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit[stage]);
932 bf1[12] = bf0[12];
933 bf1[13] = bf0[13];
934 bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
935 bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit[stage]);
936 range_check(stage, input, bf1, size, stage_range[stage]);
937
938 // stage 3
939 stage++;
Yaowu Xuc27fc142016-08-22 16:08:15 -0700940 bf0 = step;
941 bf1 = output;
942 bf1[0] = bf0[0] + bf0[2];
943 bf1[1] = bf0[1] + bf0[3];
944 bf1[2] = bf0[0] - bf0[2];
945 bf1[3] = bf0[1] - bf0[3];
946 bf1[4] = bf0[4] + bf0[6];
947 bf1[5] = bf0[5] + bf0[7];
948 bf1[6] = bf0[4] - bf0[6];
949 bf1[7] = bf0[5] - bf0[7];
950 bf1[8] = bf0[8] + bf0[10];
951 bf1[9] = bf0[9] + bf0[11];
952 bf1[10] = bf0[8] - bf0[10];
953 bf1[11] = bf0[9] - bf0[11];
954 bf1[12] = bf0[12] + bf0[14];
955 bf1[13] = bf0[13] + bf0[15];
956 bf1[14] = bf0[12] - bf0[14];
957 bf1[15] = bf0[13] - bf0[15];
958 range_check(stage, input, bf1, size, stage_range[stage]);
959
960 // stage 4
961 stage++;
962 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
963 bf0 = output;
964 bf1 = step;
965 bf1[0] = bf0[0];
966 bf1[1] = bf0[1];
967 bf1[2] = bf0[2];
968 bf1[3] = bf0[3];
969 bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
970 bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
971 bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
972 bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
973 bf1[8] = bf0[8];
974 bf1[9] = bf0[9];
975 bf1[10] = bf0[10];
976 bf1[11] = bf0[11];
977 bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
978 bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit[stage]);
979 bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
980 bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit[stage]);
981 range_check(stage, input, bf1, size, stage_range[stage]);
982
983 // stage 5
984 stage++;
Yaowu Xuc27fc142016-08-22 16:08:15 -0700985 bf0 = step;
986 bf1 = output;
987 bf1[0] = bf0[0] + bf0[4];
988 bf1[1] = bf0[1] + bf0[5];
989 bf1[2] = bf0[2] + bf0[6];
990 bf1[3] = bf0[3] + bf0[7];
991 bf1[4] = bf0[0] - bf0[4];
992 bf1[5] = bf0[1] - bf0[5];
993 bf1[6] = bf0[2] - bf0[6];
994 bf1[7] = bf0[3] - bf0[7];
995 bf1[8] = bf0[8] + bf0[12];
996 bf1[9] = bf0[9] + bf0[13];
997 bf1[10] = bf0[10] + bf0[14];
998 bf1[11] = bf0[11] + bf0[15];
999 bf1[12] = bf0[8] - bf0[12];
1000 bf1[13] = bf0[9] - bf0[13];
1001 bf1[14] = bf0[10] - bf0[14];
1002 bf1[15] = bf0[11] - bf0[15];
1003 range_check(stage, input, bf1, size, stage_range[stage]);
1004
1005 // stage 6
1006 stage++;
1007 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1008 bf0 = output;
1009 bf1 = step;
1010 bf1[0] = bf0[0];
1011 bf1[1] = bf0[1];
1012 bf1[2] = bf0[2];
1013 bf1[3] = bf0[3];
1014 bf1[4] = bf0[4];
1015 bf1[5] = bf0[5];
1016 bf1[6] = bf0[6];
1017 bf1[7] = bf0[7];
1018 bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
1019 bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit[stage]);
1020 bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
1021 bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit[stage]);
1022 bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
1023 bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit[stage]);
1024 bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
1025 bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit[stage]);
1026 range_check(stage, input, bf1, size, stage_range[stage]);
1027
1028 // stage 7
1029 stage++;
Yaowu Xuc27fc142016-08-22 16:08:15 -07001030 bf0 = step;
1031 bf1 = output;
1032 bf1[0] = bf0[0] + bf0[8];
1033 bf1[1] = bf0[1] + bf0[9];
1034 bf1[2] = bf0[2] + bf0[10];
1035 bf1[3] = bf0[3] + bf0[11];
1036 bf1[4] = bf0[4] + bf0[12];
1037 bf1[5] = bf0[5] + bf0[13];
1038 bf1[6] = bf0[6] + bf0[14];
1039 bf1[7] = bf0[7] + bf0[15];
1040 bf1[8] = bf0[0] - bf0[8];
1041 bf1[9] = bf0[1] - bf0[9];
1042 bf1[10] = bf0[2] - bf0[10];
1043 bf1[11] = bf0[3] - bf0[11];
1044 bf1[12] = bf0[4] - bf0[12];
1045 bf1[13] = bf0[5] - bf0[13];
1046 bf1[14] = bf0[6] - bf0[14];
1047 bf1[15] = bf0[7] - bf0[15];
1048 range_check(stage, input, bf1, size, stage_range[stage]);
1049
1050 // stage 8
1051 stage++;
1052 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1053 bf0 = output;
1054 bf1 = step;
1055 bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit[stage]);
1056 bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit[stage]);
1057 bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit[stage]);
1058 bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit[stage]);
1059 bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit[stage]);
1060 bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit[stage]);
1061 bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit[stage]);
1062 bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit[stage]);
1063 bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit[stage]);
1064 bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit[stage]);
1065 bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit[stage]);
1066 bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit[stage]);
1067 bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit[stage]);
1068 bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit[stage]);
1069 bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit[stage]);
1070 bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit[stage]);
1071 range_check(stage, input, bf1, size, stage_range[stage]);
1072
1073 // stage 9
1074 stage++;
1075 bf0 = step;
1076 bf1 = output;
1077 bf1[0] = bf0[1];
1078 bf1[1] = bf0[14];
1079 bf1[2] = bf0[3];
1080 bf1[3] = bf0[12];
1081 bf1[4] = bf0[5];
1082 bf1[5] = bf0[10];
1083 bf1[6] = bf0[7];
1084 bf1[7] = bf0[8];
1085 bf1[8] = bf0[9];
1086 bf1[9] = bf0[6];
1087 bf1[10] = bf0[11];
1088 bf1[11] = bf0[4];
1089 bf1[12] = bf0[13];
1090 bf1[13] = bf0[2];
1091 bf1[14] = bf0[15];
1092 bf1[15] = bf0[0];
1093 range_check(stage, input, bf1, size, stage_range[stage]);
1094}
1095
Yaowu Xuf883b422016-08-30 14:01:10 -07001096void av1_iadst32_new(const int32_t *input, int32_t *output,
1097 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -07001098 const int32_t size = 32;
1099 const int32_t *cospi;
1100
1101 int32_t stage = 0;
1102 int32_t *bf0, *bf1;
1103 int32_t step[32];
1104
1105 // stage 0;
1106 range_check(stage, input, input, size, stage_range[stage]);
1107
1108 // stage 1;
1109 stage++;
1110 bf1 = output;
1111 bf1[0] = input[0];
1112 bf1[1] = -input[31];
1113 bf1[2] = -input[15];
1114 bf1[3] = input[16];
1115 bf1[4] = -input[7];
1116 bf1[5] = input[24];
1117 bf1[6] = input[8];
1118 bf1[7] = -input[23];
1119 bf1[8] = -input[3];
1120 bf1[9] = input[28];
1121 bf1[10] = input[12];
1122 bf1[11] = -input[19];
1123 bf1[12] = input[4];
1124 bf1[13] = -input[27];
1125 bf1[14] = -input[11];
1126 bf1[15] = input[20];
1127 bf1[16] = -input[1];
1128 bf1[17] = input[30];
1129 bf1[18] = input[14];
1130 bf1[19] = -input[17];
1131 bf1[20] = input[6];
1132 bf1[21] = -input[25];
1133 bf1[22] = -input[9];
1134 bf1[23] = input[22];
1135 bf1[24] = input[2];
1136 bf1[25] = -input[29];
1137 bf1[26] = -input[13];
1138 bf1[27] = input[18];
1139 bf1[28] = -input[5];
1140 bf1[29] = input[26];
1141 bf1[30] = input[10];
1142 bf1[31] = -input[21];
1143 range_check(stage, input, bf1, size, stage_range[stage]);
1144
1145 // stage 2
1146 stage++;
1147 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1148 bf0 = output;
1149 bf1 = step;
1150 bf1[0] = bf0[0];
1151 bf1[1] = bf0[1];
1152 bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
1153 bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
1154 bf1[4] = bf0[4];
1155 bf1[5] = bf0[5];
1156 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
1157 bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
1158 bf1[8] = bf0[8];
1159 bf1[9] = bf0[9];
1160 bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
1161 bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit[stage]);
1162 bf1[12] = bf0[12];
1163 bf1[13] = bf0[13];
1164 bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
1165 bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit[stage]);
1166 bf1[16] = bf0[16];
1167 bf1[17] = bf0[17];
1168 bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit[stage]);
1169 bf1[19] = half_btf(cospi[32], bf0[18], -cospi[32], bf0[19], cos_bit[stage]);
1170 bf1[20] = bf0[20];
1171 bf1[21] = bf0[21];
1172 bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit[stage]);
1173 bf1[23] = half_btf(cospi[32], bf0[22], -cospi[32], bf0[23], cos_bit[stage]);
1174 bf1[24] = bf0[24];
1175 bf1[25] = bf0[25];
1176 bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit[stage]);
1177 bf1[27] = half_btf(cospi[32], bf0[26], -cospi[32], bf0[27], cos_bit[stage]);
1178 bf1[28] = bf0[28];
1179 bf1[29] = bf0[29];
1180 bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit[stage]);
1181 bf1[31] = half_btf(cospi[32], bf0[30], -cospi[32], bf0[31], cos_bit[stage]);
1182 range_check(stage, input, bf1, size, stage_range[stage]);
1183
1184 // stage 3
1185 stage++;
Yaowu Xuc27fc142016-08-22 16:08:15 -07001186 bf0 = step;
1187 bf1 = output;
1188 bf1[0] = bf0[0] + bf0[2];
1189 bf1[1] = bf0[1] + bf0[3];
1190 bf1[2] = bf0[0] - bf0[2];
1191 bf1[3] = bf0[1] - bf0[3];
1192 bf1[4] = bf0[4] + bf0[6];
1193 bf1[5] = bf0[5] + bf0[7];
1194 bf1[6] = bf0[4] - bf0[6];
1195 bf1[7] = bf0[5] - bf0[7];
1196 bf1[8] = bf0[8] + bf0[10];
1197 bf1[9] = bf0[9] + bf0[11];
1198 bf1[10] = bf0[8] - bf0[10];
1199 bf1[11] = bf0[9] - bf0[11];
1200 bf1[12] = bf0[12] + bf0[14];
1201 bf1[13] = bf0[13] + bf0[15];
1202 bf1[14] = bf0[12] - bf0[14];
1203 bf1[15] = bf0[13] - bf0[15];
1204 bf1[16] = bf0[16] + bf0[18];
1205 bf1[17] = bf0[17] + bf0[19];
1206 bf1[18] = bf0[16] - bf0[18];
1207 bf1[19] = bf0[17] - bf0[19];
1208 bf1[20] = bf0[20] + bf0[22];
1209 bf1[21] = bf0[21] + bf0[23];
1210 bf1[22] = bf0[20] - bf0[22];
1211 bf1[23] = bf0[21] - bf0[23];
1212 bf1[24] = bf0[24] + bf0[26];
1213 bf1[25] = bf0[25] + bf0[27];
1214 bf1[26] = bf0[24] - bf0[26];
1215 bf1[27] = bf0[25] - bf0[27];
1216 bf1[28] = bf0[28] + bf0[30];
1217 bf1[29] = bf0[29] + bf0[31];
1218 bf1[30] = bf0[28] - bf0[30];
1219 bf1[31] = bf0[29] - bf0[31];
1220 range_check(stage, input, bf1, size, stage_range[stage]);
1221
1222 // stage 4
1223 stage++;
1224 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1225 bf0 = output;
1226 bf1 = step;
1227 bf1[0] = bf0[0];
1228 bf1[1] = bf0[1];
1229 bf1[2] = bf0[2];
1230 bf1[3] = bf0[3];
1231 bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
1232 bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
1233 bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
1234 bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
1235 bf1[8] = bf0[8];
1236 bf1[9] = bf0[9];
1237 bf1[10] = bf0[10];
1238 bf1[11] = bf0[11];
1239 bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
1240 bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit[stage]);
1241 bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
1242 bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit[stage]);
1243 bf1[16] = bf0[16];
1244 bf1[17] = bf0[17];
1245 bf1[18] = bf0[18];
1246 bf1[19] = bf0[19];
1247 bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit[stage]);
1248 bf1[21] = half_btf(cospi[48], bf0[20], -cospi[16], bf0[21], cos_bit[stage]);
1249 bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit[stage]);
1250 bf1[23] = half_btf(cospi[16], bf0[22], cospi[48], bf0[23], cos_bit[stage]);
1251 bf1[24] = bf0[24];
1252 bf1[25] = bf0[25];
1253 bf1[26] = bf0[26];
1254 bf1[27] = bf0[27];
1255 bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit[stage]);
1256 bf1[29] = half_btf(cospi[48], bf0[28], -cospi[16], bf0[29], cos_bit[stage]);
1257 bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit[stage]);
1258 bf1[31] = half_btf(cospi[16], bf0[30], cospi[48], bf0[31], cos_bit[stage]);
1259 range_check(stage, input, bf1, size, stage_range[stage]);
1260
1261 // stage 5
1262 stage++;
Yaowu Xuc27fc142016-08-22 16:08:15 -07001263 bf0 = step;
1264 bf1 = output;
1265 bf1[0] = bf0[0] + bf0[4];
1266 bf1[1] = bf0[1] + bf0[5];
1267 bf1[2] = bf0[2] + bf0[6];
1268 bf1[3] = bf0[3] + bf0[7];
1269 bf1[4] = bf0[0] - bf0[4];
1270 bf1[5] = bf0[1] - bf0[5];
1271 bf1[6] = bf0[2] - bf0[6];
1272 bf1[7] = bf0[3] - bf0[7];
1273 bf1[8] = bf0[8] + bf0[12];
1274 bf1[9] = bf0[9] + bf0[13];
1275 bf1[10] = bf0[10] + bf0[14];
1276 bf1[11] = bf0[11] + bf0[15];
1277 bf1[12] = bf0[8] - bf0[12];
1278 bf1[13] = bf0[9] - bf0[13];
1279 bf1[14] = bf0[10] - bf0[14];
1280 bf1[15] = bf0[11] - bf0[15];
1281 bf1[16] = bf0[16] + bf0[20];
1282 bf1[17] = bf0[17] + bf0[21];
1283 bf1[18] = bf0[18] + bf0[22];
1284 bf1[19] = bf0[19] + bf0[23];
1285 bf1[20] = bf0[16] - bf0[20];
1286 bf1[21] = bf0[17] - bf0[21];
1287 bf1[22] = bf0[18] - bf0[22];
1288 bf1[23] = bf0[19] - bf0[23];
1289 bf1[24] = bf0[24] + bf0[28];
1290 bf1[25] = bf0[25] + bf0[29];
1291 bf1[26] = bf0[26] + bf0[30];
1292 bf1[27] = bf0[27] + bf0[31];
1293 bf1[28] = bf0[24] - bf0[28];
1294 bf1[29] = bf0[25] - bf0[29];
1295 bf1[30] = bf0[26] - bf0[30];
1296 bf1[31] = bf0[27] - bf0[31];
1297 range_check(stage, input, bf1, size, stage_range[stage]);
1298
1299 // stage 6
1300 stage++;
1301 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1302 bf0 = output;
1303 bf1 = step;
1304 bf1[0] = bf0[0];
1305 bf1[1] = bf0[1];
1306 bf1[2] = bf0[2];
1307 bf1[3] = bf0[3];
1308 bf1[4] = bf0[4];
1309 bf1[5] = bf0[5];
1310 bf1[6] = bf0[6];
1311 bf1[7] = bf0[7];
1312 bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
1313 bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit[stage]);
1314 bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
1315 bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit[stage]);
1316 bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
1317 bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit[stage]);
1318 bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
1319 bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit[stage]);
1320 bf1[16] = bf0[16];
1321 bf1[17] = bf0[17];
1322 bf1[18] = bf0[18];
1323 bf1[19] = bf0[19];
1324 bf1[20] = bf0[20];
1325 bf1[21] = bf0[21];
1326 bf1[22] = bf0[22];
1327 bf1[23] = bf0[23];
1328 bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit[stage]);
1329 bf1[25] = half_btf(cospi[56], bf0[24], -cospi[8], bf0[25], cos_bit[stage]);
1330 bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit[stage]);
1331 bf1[27] = half_btf(cospi[24], bf0[26], -cospi[40], bf0[27], cos_bit[stage]);
1332 bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit[stage]);
1333 bf1[29] = half_btf(cospi[8], bf0[28], cospi[56], bf0[29], cos_bit[stage]);
1334 bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit[stage]);
1335 bf1[31] = half_btf(cospi[40], bf0[30], cospi[24], bf0[31], cos_bit[stage]);
1336 range_check(stage, input, bf1, size, stage_range[stage]);
1337
1338 // stage 7
1339 stage++;
Yaowu Xuc27fc142016-08-22 16:08:15 -07001340 bf0 = step;
1341 bf1 = output;
1342 bf1[0] = bf0[0] + bf0[8];
1343 bf1[1] = bf0[1] + bf0[9];
1344 bf1[2] = bf0[2] + bf0[10];
1345 bf1[3] = bf0[3] + bf0[11];
1346 bf1[4] = bf0[4] + bf0[12];
1347 bf1[5] = bf0[5] + bf0[13];
1348 bf1[6] = bf0[6] + bf0[14];
1349 bf1[7] = bf0[7] + bf0[15];
1350 bf1[8] = bf0[0] - bf0[8];
1351 bf1[9] = bf0[1] - bf0[9];
1352 bf1[10] = bf0[2] - bf0[10];
1353 bf1[11] = bf0[3] - bf0[11];
1354 bf1[12] = bf0[4] - bf0[12];
1355 bf1[13] = bf0[5] - bf0[13];
1356 bf1[14] = bf0[6] - bf0[14];
1357 bf1[15] = bf0[7] - bf0[15];
1358 bf1[16] = bf0[16] + bf0[24];
1359 bf1[17] = bf0[17] + bf0[25];
1360 bf1[18] = bf0[18] + bf0[26];
1361 bf1[19] = bf0[19] + bf0[27];
1362 bf1[20] = bf0[20] + bf0[28];
1363 bf1[21] = bf0[21] + bf0[29];
1364 bf1[22] = bf0[22] + bf0[30];
1365 bf1[23] = bf0[23] + bf0[31];
1366 bf1[24] = bf0[16] - bf0[24];
1367 bf1[25] = bf0[17] - bf0[25];
1368 bf1[26] = bf0[18] - bf0[26];
1369 bf1[27] = bf0[19] - bf0[27];
1370 bf1[28] = bf0[20] - bf0[28];
1371 bf1[29] = bf0[21] - bf0[29];
1372 bf1[30] = bf0[22] - bf0[30];
1373 bf1[31] = bf0[23] - bf0[31];
1374 range_check(stage, input, bf1, size, stage_range[stage]);
1375
1376 // stage 8
1377 stage++;
1378 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1379 bf0 = output;
1380 bf1 = step;
1381 bf1[0] = bf0[0];
1382 bf1[1] = bf0[1];
1383 bf1[2] = bf0[2];
1384 bf1[3] = bf0[3];
1385 bf1[4] = bf0[4];
1386 bf1[5] = bf0[5];
1387 bf1[6] = bf0[6];
1388 bf1[7] = bf0[7];
1389 bf1[8] = bf0[8];
1390 bf1[9] = bf0[9];
1391 bf1[10] = bf0[10];
1392 bf1[11] = bf0[11];
1393 bf1[12] = bf0[12];
1394 bf1[13] = bf0[13];
1395 bf1[14] = bf0[14];
1396 bf1[15] = bf0[15];
1397 bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit[stage]);
1398 bf1[17] = half_btf(cospi[60], bf0[16], -cospi[4], bf0[17], cos_bit[stage]);
1399 bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit[stage]);
1400 bf1[19] = half_btf(cospi[44], bf0[18], -cospi[20], bf0[19], cos_bit[stage]);
1401 bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit[stage]);
1402 bf1[21] = half_btf(cospi[28], bf0[20], -cospi[36], bf0[21], cos_bit[stage]);
1403 bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit[stage]);
1404 bf1[23] = half_btf(cospi[12], bf0[22], -cospi[52], bf0[23], cos_bit[stage]);
1405 bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit[stage]);
1406 bf1[25] = half_btf(cospi[4], bf0[24], cospi[60], bf0[25], cos_bit[stage]);
1407 bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit[stage]);
1408 bf1[27] = half_btf(cospi[20], bf0[26], cospi[44], bf0[27], cos_bit[stage]);
1409 bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit[stage]);
1410 bf1[29] = half_btf(cospi[36], bf0[28], cospi[28], bf0[29], cos_bit[stage]);
1411 bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit[stage]);
1412 bf1[31] = half_btf(cospi[52], bf0[30], cospi[12], bf0[31], cos_bit[stage]);
1413 range_check(stage, input, bf1, size, stage_range[stage]);
1414
1415 // stage 9
1416 stage++;
Yaowu Xuc27fc142016-08-22 16:08:15 -07001417 bf0 = step;
1418 bf1 = output;
1419 bf1[0] = bf0[0] + bf0[16];
1420 bf1[1] = bf0[1] + bf0[17];
1421 bf1[2] = bf0[2] + bf0[18];
1422 bf1[3] = bf0[3] + bf0[19];
1423 bf1[4] = bf0[4] + bf0[20];
1424 bf1[5] = bf0[5] + bf0[21];
1425 bf1[6] = bf0[6] + bf0[22];
1426 bf1[7] = bf0[7] + bf0[23];
1427 bf1[8] = bf0[8] + bf0[24];
1428 bf1[9] = bf0[9] + bf0[25];
1429 bf1[10] = bf0[10] + bf0[26];
1430 bf1[11] = bf0[11] + bf0[27];
1431 bf1[12] = bf0[12] + bf0[28];
1432 bf1[13] = bf0[13] + bf0[29];
1433 bf1[14] = bf0[14] + bf0[30];
1434 bf1[15] = bf0[15] + bf0[31];
1435 bf1[16] = bf0[0] - bf0[16];
1436 bf1[17] = bf0[1] - bf0[17];
1437 bf1[18] = bf0[2] - bf0[18];
1438 bf1[19] = bf0[3] - bf0[19];
1439 bf1[20] = bf0[4] - bf0[20];
1440 bf1[21] = bf0[5] - bf0[21];
1441 bf1[22] = bf0[6] - bf0[22];
1442 bf1[23] = bf0[7] - bf0[23];
1443 bf1[24] = bf0[8] - bf0[24];
1444 bf1[25] = bf0[9] - bf0[25];
1445 bf1[26] = bf0[10] - bf0[26];
1446 bf1[27] = bf0[11] - bf0[27];
1447 bf1[28] = bf0[12] - bf0[28];
1448 bf1[29] = bf0[13] - bf0[29];
1449 bf1[30] = bf0[14] - bf0[30];
1450 bf1[31] = bf0[15] - bf0[31];
1451 range_check(stage, input, bf1, size, stage_range[stage]);
1452
1453 // stage 10
1454 stage++;
1455 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1456 bf0 = output;
1457 bf1 = step;
1458 bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit[stage]);
1459 bf1[1] = half_btf(cospi[63], bf0[0], -cospi[1], bf0[1], cos_bit[stage]);
1460 bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit[stage]);
1461 bf1[3] = half_btf(cospi[59], bf0[2], -cospi[5], bf0[3], cos_bit[stage]);
1462 bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit[stage]);
1463 bf1[5] = half_btf(cospi[55], bf0[4], -cospi[9], bf0[5], cos_bit[stage]);
1464 bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit[stage]);
1465 bf1[7] = half_btf(cospi[51], bf0[6], -cospi[13], bf0[7], cos_bit[stage]);
1466 bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit[stage]);
1467 bf1[9] = half_btf(cospi[47], bf0[8], -cospi[17], bf0[9], cos_bit[stage]);
1468 bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit[stage]);
1469 bf1[11] = half_btf(cospi[43], bf0[10], -cospi[21], bf0[11], cos_bit[stage]);
1470 bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit[stage]);
1471 bf1[13] = half_btf(cospi[39], bf0[12], -cospi[25], bf0[13], cos_bit[stage]);
1472 bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit[stage]);
1473 bf1[15] = half_btf(cospi[35], bf0[14], -cospi[29], bf0[15], cos_bit[stage]);
1474 bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit[stage]);
1475 bf1[17] = half_btf(cospi[31], bf0[16], -cospi[33], bf0[17], cos_bit[stage]);
1476 bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit[stage]);
1477 bf1[19] = half_btf(cospi[27], bf0[18], -cospi[37], bf0[19], cos_bit[stage]);
1478 bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit[stage]);
1479 bf1[21] = half_btf(cospi[23], bf0[20], -cospi[41], bf0[21], cos_bit[stage]);
1480 bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit[stage]);
1481 bf1[23] = half_btf(cospi[19], bf0[22], -cospi[45], bf0[23], cos_bit[stage]);
1482 bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit[stage]);
1483 bf1[25] = half_btf(cospi[15], bf0[24], -cospi[49], bf0[25], cos_bit[stage]);
1484 bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit[stage]);
1485 bf1[27] = half_btf(cospi[11], bf0[26], -cospi[53], bf0[27], cos_bit[stage]);
1486 bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit[stage]);
1487 bf1[29] = half_btf(cospi[7], bf0[28], -cospi[57], bf0[29], cos_bit[stage]);
1488 bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit[stage]);
1489 bf1[31] = half_btf(cospi[3], bf0[30], -cospi[61], bf0[31], cos_bit[stage]);
1490 range_check(stage, input, bf1, size, stage_range[stage]);
1491
1492 // stage 11
1493 stage++;
1494 bf0 = step;
1495 bf1 = output;
1496 bf1[0] = bf0[1];
1497 bf1[1] = bf0[30];
1498 bf1[2] = bf0[3];
1499 bf1[3] = bf0[28];
1500 bf1[4] = bf0[5];
1501 bf1[5] = bf0[26];
1502 bf1[6] = bf0[7];
1503 bf1[7] = bf0[24];
1504 bf1[8] = bf0[9];
1505 bf1[9] = bf0[22];
1506 bf1[10] = bf0[11];
1507 bf1[11] = bf0[20];
1508 bf1[12] = bf0[13];
1509 bf1[13] = bf0[18];
1510 bf1[14] = bf0[15];
1511 bf1[15] = bf0[16];
1512 bf1[16] = bf0[17];
1513 bf1[17] = bf0[14];
1514 bf1[18] = bf0[19];
1515 bf1[19] = bf0[12];
1516 bf1[20] = bf0[21];
1517 bf1[21] = bf0[10];
1518 bf1[22] = bf0[23];
1519 bf1[23] = bf0[8];
1520 bf1[24] = bf0[25];
1521 bf1[25] = bf0[6];
1522 bf1[26] = bf0[27];
1523 bf1[27] = bf0[4];
1524 bf1[28] = bf0[29];
1525 bf1[29] = bf0[2];
1526 bf1[30] = bf0[31];
1527 bf1[31] = bf0[0];
1528 range_check(stage, input, bf1, size, stage_range[stage]);
1529}
Angie Chiang792519b2016-10-18 12:24:20 -07001530
1531#if CONFIG_TX64X64
1532void av1_idct64_new(const int32_t *input, int32_t *output,
1533 const int8_t *cos_bit, const int8_t *stage_range) {
1534 const int32_t size = 64;
1535 const int32_t *cospi;
1536
1537 int32_t stage = 0;
1538 int32_t *bf0, *bf1;
1539 int32_t step[64];
1540
1541 // stage 0;
1542 range_check(stage, input, input, size, stage_range[stage]);
1543
1544 // stage 1;
1545 stage++;
1546 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1547 bf1 = output;
1548 bf1[0] = input[0];
1549 bf1[1] = input[32];
1550 bf1[2] = input[16];
1551 bf1[3] = input[48];
1552 bf1[4] = input[8];
1553 bf1[5] = input[40];
1554 bf1[6] = input[24];
1555 bf1[7] = input[56];
1556 bf1[8] = input[4];
1557 bf1[9] = input[36];
1558 bf1[10] = input[20];
1559 bf1[11] = input[52];
1560 bf1[12] = input[12];
1561 bf1[13] = input[44];
1562 bf1[14] = input[28];
1563 bf1[15] = input[60];
1564 bf1[16] = input[2];
1565 bf1[17] = input[34];
1566 bf1[18] = input[18];
1567 bf1[19] = input[50];
1568 bf1[20] = input[10];
1569 bf1[21] = input[42];
1570 bf1[22] = input[26];
1571 bf1[23] = input[58];
1572 bf1[24] = input[6];
1573 bf1[25] = input[38];
1574 bf1[26] = input[22];
1575 bf1[27] = input[54];
1576 bf1[28] = input[14];
1577 bf1[29] = input[46];
1578 bf1[30] = input[30];
1579 bf1[31] = input[62];
1580 bf1[32] = input[1];
1581 bf1[33] = input[33];
1582 bf1[34] = input[17];
1583 bf1[35] = input[49];
1584 bf1[36] = input[9];
1585 bf1[37] = input[41];
1586 bf1[38] = input[25];
1587 bf1[39] = input[57];
1588 bf1[40] = input[5];
1589 bf1[41] = input[37];
1590 bf1[42] = input[21];
1591 bf1[43] = input[53];
1592 bf1[44] = input[13];
1593 bf1[45] = input[45];
1594 bf1[46] = input[29];
1595 bf1[47] = input[61];
1596 bf1[48] = input[3];
1597 bf1[49] = input[35];
1598 bf1[50] = input[19];
1599 bf1[51] = input[51];
1600 bf1[52] = input[11];
1601 bf1[53] = input[43];
1602 bf1[54] = input[27];
1603 bf1[55] = input[59];
1604 bf1[56] = input[7];
1605 bf1[57] = input[39];
1606 bf1[58] = input[23];
1607 bf1[59] = input[55];
1608 bf1[60] = input[15];
1609 bf1[61] = input[47];
1610 bf1[62] = input[31];
1611 bf1[63] = input[63];
1612 range_check(stage, input, bf1, size, stage_range[stage]);
1613
1614 // stage 2
1615 stage++;
1616 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1617 bf0 = output;
1618 bf1 = step;
1619 bf1[0] = bf0[0];
1620 bf1[1] = bf0[1];
1621 bf1[2] = bf0[2];
1622 bf1[3] = bf0[3];
1623 bf1[4] = bf0[4];
1624 bf1[5] = bf0[5];
1625 bf1[6] = bf0[6];
1626 bf1[7] = bf0[7];
1627 bf1[8] = bf0[8];
1628 bf1[9] = bf0[9];
1629 bf1[10] = bf0[10];
1630 bf1[11] = bf0[11];
1631 bf1[12] = bf0[12];
1632 bf1[13] = bf0[13];
1633 bf1[14] = bf0[14];
1634 bf1[15] = bf0[15];
1635 bf1[16] = bf0[16];
1636 bf1[17] = bf0[17];
1637 bf1[18] = bf0[18];
1638 bf1[19] = bf0[19];
1639 bf1[20] = bf0[20];
1640 bf1[21] = bf0[21];
1641 bf1[22] = bf0[22];
1642 bf1[23] = bf0[23];
1643 bf1[24] = bf0[24];
1644 bf1[25] = bf0[25];
1645 bf1[26] = bf0[26];
1646 bf1[27] = bf0[27];
1647 bf1[28] = bf0[28];
1648 bf1[29] = bf0[29];
1649 bf1[30] = bf0[30];
1650 bf1[31] = bf0[31];
1651 bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit[stage]);
1652 bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit[stage]);
1653 bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit[stage]);
1654 bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit[stage]);
1655 bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit[stage]);
1656 bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit[stage]);
1657 bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit[stage]);
1658 bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit[stage]);
1659 bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit[stage]);
1660 bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit[stage]);
1661 bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit[stage]);
1662 bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit[stage]);
1663 bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit[stage]);
1664 bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit[stage]);
1665 bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit[stage]);
1666 bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit[stage]);
1667 bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit[stage]);
1668 bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit[stage]);
1669 bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit[stage]);
1670 bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit[stage]);
1671 bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit[stage]);
1672 bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit[stage]);
1673 bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit[stage]);
1674 bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit[stage]);
1675 bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit[stage]);
1676 bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit[stage]);
1677 bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit[stage]);
1678 bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit[stage]);
1679 bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit[stage]);
1680 bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit[stage]);
1681 bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit[stage]);
1682 bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit[stage]);
1683 range_check(stage, input, bf1, size, stage_range[stage]);
1684
1685 // stage 3
1686 stage++;
1687 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1688 bf0 = step;
1689 bf1 = output;
1690 bf1[0] = bf0[0];
1691 bf1[1] = bf0[1];
1692 bf1[2] = bf0[2];
1693 bf1[3] = bf0[3];
1694 bf1[4] = bf0[4];
1695 bf1[5] = bf0[5];
1696 bf1[6] = bf0[6];
1697 bf1[7] = bf0[7];
1698 bf1[8] = bf0[8];
1699 bf1[9] = bf0[9];
1700 bf1[10] = bf0[10];
1701 bf1[11] = bf0[11];
1702 bf1[12] = bf0[12];
1703 bf1[13] = bf0[13];
1704 bf1[14] = bf0[14];
1705 bf1[15] = bf0[15];
1706 bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit[stage]);
1707 bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit[stage]);
1708 bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit[stage]);
1709 bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit[stage]);
1710 bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit[stage]);
1711 bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit[stage]);
1712 bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit[stage]);
1713 bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit[stage]);
1714 bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit[stage]);
1715 bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit[stage]);
1716 bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit[stage]);
1717 bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit[stage]);
1718 bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit[stage]);
1719 bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit[stage]);
1720 bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit[stage]);
1721 bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit[stage]);
1722 bf1[32] = bf0[32] + bf0[33];
1723 bf1[33] = bf0[32] - bf0[33];
1724 bf1[34] = -bf0[34] + bf0[35];
1725 bf1[35] = bf0[34] + bf0[35];
1726 bf1[36] = bf0[36] + bf0[37];
1727 bf1[37] = bf0[36] - bf0[37];
1728 bf1[38] = -bf0[38] + bf0[39];
1729 bf1[39] = bf0[38] + bf0[39];
1730 bf1[40] = bf0[40] + bf0[41];
1731 bf1[41] = bf0[40] - bf0[41];
1732 bf1[42] = -bf0[42] + bf0[43];
1733 bf1[43] = bf0[42] + bf0[43];
1734 bf1[44] = bf0[44] + bf0[45];
1735 bf1[45] = bf0[44] - bf0[45];
1736 bf1[46] = -bf0[46] + bf0[47];
1737 bf1[47] = bf0[46] + bf0[47];
1738 bf1[48] = bf0[48] + bf0[49];
1739 bf1[49] = bf0[48] - bf0[49];
1740 bf1[50] = -bf0[50] + bf0[51];
1741 bf1[51] = bf0[50] + bf0[51];
1742 bf1[52] = bf0[52] + bf0[53];
1743 bf1[53] = bf0[52] - bf0[53];
1744 bf1[54] = -bf0[54] + bf0[55];
1745 bf1[55] = bf0[54] + bf0[55];
1746 bf1[56] = bf0[56] + bf0[57];
1747 bf1[57] = bf0[56] - bf0[57];
1748 bf1[58] = -bf0[58] + bf0[59];
1749 bf1[59] = bf0[58] + bf0[59];
1750 bf1[60] = bf0[60] + bf0[61];
1751 bf1[61] = bf0[60] - bf0[61];
1752 bf1[62] = -bf0[62] + bf0[63];
1753 bf1[63] = bf0[62] + bf0[63];
1754 range_check(stage, input, bf1, size, stage_range[stage]);
1755
1756 // stage 4
1757 stage++;
1758 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1759 bf0 = output;
1760 bf1 = step;
1761 bf1[0] = bf0[0];
1762 bf1[1] = bf0[1];
1763 bf1[2] = bf0[2];
1764 bf1[3] = bf0[3];
1765 bf1[4] = bf0[4];
1766 bf1[5] = bf0[5];
1767 bf1[6] = bf0[6];
1768 bf1[7] = bf0[7];
1769 bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
1770 bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
1771 bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
1772 bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
1773 bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
1774 bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
1775 bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
1776 bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
1777 bf1[16] = bf0[16] + bf0[17];
1778 bf1[17] = bf0[16] - bf0[17];
1779 bf1[18] = -bf0[18] + bf0[19];
1780 bf1[19] = bf0[18] + bf0[19];
1781 bf1[20] = bf0[20] + bf0[21];
1782 bf1[21] = bf0[20] - bf0[21];
1783 bf1[22] = -bf0[22] + bf0[23];
1784 bf1[23] = bf0[22] + bf0[23];
1785 bf1[24] = bf0[24] + bf0[25];
1786 bf1[25] = bf0[24] - bf0[25];
1787 bf1[26] = -bf0[26] + bf0[27];
1788 bf1[27] = bf0[26] + bf0[27];
1789 bf1[28] = bf0[28] + bf0[29];
1790 bf1[29] = bf0[28] - bf0[29];
1791 bf1[30] = -bf0[30] + bf0[31];
1792 bf1[31] = bf0[30] + bf0[31];
1793 bf1[32] = bf0[32];
1794 bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]);
1795 bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]);
1796 bf1[35] = bf0[35];
1797 bf1[36] = bf0[36];
1798 bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]);
1799 bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]);
1800 bf1[39] = bf0[39];
1801 bf1[40] = bf0[40];
1802 bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]);
1803 bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]);
1804 bf1[43] = bf0[43];
1805 bf1[44] = bf0[44];
1806 bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]);
1807 bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]);
1808 bf1[47] = bf0[47];
1809 bf1[48] = bf0[48];
1810 bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit[stage]);
1811 bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit[stage]);
1812 bf1[51] = bf0[51];
1813 bf1[52] = bf0[52];
1814 bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit[stage]);
1815 bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit[stage]);
1816 bf1[55] = bf0[55];
1817 bf1[56] = bf0[56];
1818 bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit[stage]);
1819 bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit[stage]);
1820 bf1[59] = bf0[59];
1821 bf1[60] = bf0[60];
1822 bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit[stage]);
1823 bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit[stage]);
1824 bf1[63] = bf0[63];
1825 range_check(stage, input, bf1, size, stage_range[stage]);
1826
1827 // stage 5
1828 stage++;
1829 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1830 bf0 = step;
1831 bf1 = output;
1832 bf1[0] = bf0[0];
1833 bf1[1] = bf0[1];
1834 bf1[2] = bf0[2];
1835 bf1[3] = bf0[3];
1836 bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
1837 bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
1838 bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
1839 bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
1840 bf1[8] = bf0[8] + bf0[9];
1841 bf1[9] = bf0[8] - bf0[9];
1842 bf1[10] = -bf0[10] + bf0[11];
1843 bf1[11] = bf0[10] + bf0[11];
1844 bf1[12] = bf0[12] + bf0[13];
1845 bf1[13] = bf0[12] - bf0[13];
1846 bf1[14] = -bf0[14] + bf0[15];
1847 bf1[15] = bf0[14] + bf0[15];
1848 bf1[16] = bf0[16];
1849 bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
1850 bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
1851 bf1[19] = bf0[19];
1852 bf1[20] = bf0[20];
1853 bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
1854 bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
1855 bf1[23] = bf0[23];
1856 bf1[24] = bf0[24];
1857 bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit[stage]);
1858 bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit[stage]);
1859 bf1[27] = bf0[27];
1860 bf1[28] = bf0[28];
1861 bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit[stage]);
1862 bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit[stage]);
1863 bf1[31] = bf0[31];
1864 bf1[32] = bf0[32] + bf0[35];
1865 bf1[33] = bf0[33] + bf0[34];
1866 bf1[34] = bf0[33] - bf0[34];
1867 bf1[35] = bf0[32] - bf0[35];
1868 bf1[36] = -bf0[36] + bf0[39];
1869 bf1[37] = -bf0[37] + bf0[38];
1870 bf1[38] = bf0[37] + bf0[38];
1871 bf1[39] = bf0[36] + bf0[39];
1872 bf1[40] = bf0[40] + bf0[43];
1873 bf1[41] = bf0[41] + bf0[42];
1874 bf1[42] = bf0[41] - bf0[42];
1875 bf1[43] = bf0[40] - bf0[43];
1876 bf1[44] = -bf0[44] + bf0[47];
1877 bf1[45] = -bf0[45] + bf0[46];
1878 bf1[46] = bf0[45] + bf0[46];
1879 bf1[47] = bf0[44] + bf0[47];
1880 bf1[48] = bf0[48] + bf0[51];
1881 bf1[49] = bf0[49] + bf0[50];
1882 bf1[50] = bf0[49] - bf0[50];
1883 bf1[51] = bf0[48] - bf0[51];
1884 bf1[52] = -bf0[52] + bf0[55];
1885 bf1[53] = -bf0[53] + bf0[54];
1886 bf1[54] = bf0[53] + bf0[54];
1887 bf1[55] = bf0[52] + bf0[55];
1888 bf1[56] = bf0[56] + bf0[59];
1889 bf1[57] = bf0[57] + bf0[58];
1890 bf1[58] = bf0[57] - bf0[58];
1891 bf1[59] = bf0[56] - bf0[59];
1892 bf1[60] = -bf0[60] + bf0[63];
1893 bf1[61] = -bf0[61] + bf0[62];
1894 bf1[62] = bf0[61] + bf0[62];
1895 bf1[63] = bf0[60] + bf0[63];
1896 range_check(stage, input, bf1, size, stage_range[stage]);
1897
1898 // stage 6
1899 stage++;
1900 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1901 bf0 = output;
1902 bf1 = step;
1903 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
1904 bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
1905 bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
1906 bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
1907 bf1[4] = bf0[4] + bf0[5];
1908 bf1[5] = bf0[4] - bf0[5];
1909 bf1[6] = -bf0[6] + bf0[7];
1910 bf1[7] = bf0[6] + bf0[7];
1911 bf1[8] = bf0[8];
1912 bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
1913 bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
1914 bf1[11] = bf0[11];
1915 bf1[12] = bf0[12];
1916 bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
1917 bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
1918 bf1[15] = bf0[15];
1919 bf1[16] = bf0[16] + bf0[19];
1920 bf1[17] = bf0[17] + bf0[18];
1921 bf1[18] = bf0[17] - bf0[18];
1922 bf1[19] = bf0[16] - bf0[19];
1923 bf1[20] = -bf0[20] + bf0[23];
1924 bf1[21] = -bf0[21] + bf0[22];
1925 bf1[22] = bf0[21] + bf0[22];
1926 bf1[23] = bf0[20] + bf0[23];
1927 bf1[24] = bf0[24] + bf0[27];
1928 bf1[25] = bf0[25] + bf0[26];
1929 bf1[26] = bf0[25] - bf0[26];
1930 bf1[27] = bf0[24] - bf0[27];
1931 bf1[28] = -bf0[28] + bf0[31];
1932 bf1[29] = -bf0[29] + bf0[30];
1933 bf1[30] = bf0[29] + bf0[30];
1934 bf1[31] = bf0[28] + bf0[31];
1935 bf1[32] = bf0[32];
1936 bf1[33] = bf0[33];
1937 bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]);
1938 bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]);
1939 bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]);
1940 bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]);
1941 bf1[38] = bf0[38];
1942 bf1[39] = bf0[39];
1943 bf1[40] = bf0[40];
1944 bf1[41] = bf0[41];
1945 bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]);
1946 bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]);
1947 bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]);
1948 bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]);
1949 bf1[46] = bf0[46];
1950 bf1[47] = bf0[47];
1951 bf1[48] = bf0[48];
1952 bf1[49] = bf0[49];
1953 bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit[stage]);
1954 bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit[stage]);
1955 bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit[stage]);
1956 bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit[stage]);
1957 bf1[54] = bf0[54];
1958 bf1[55] = bf0[55];
1959 bf1[56] = bf0[56];
1960 bf1[57] = bf0[57];
1961 bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit[stage]);
1962 bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit[stage]);
1963 bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit[stage]);
1964 bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit[stage]);
1965 bf1[62] = bf0[62];
1966 bf1[63] = bf0[63];
1967 range_check(stage, input, bf1, size, stage_range[stage]);
1968
1969 // stage 7
1970 stage++;
1971 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1972 bf0 = step;
1973 bf1 = output;
1974 bf1[0] = bf0[0] + bf0[3];
1975 bf1[1] = bf0[1] + bf0[2];
1976 bf1[2] = bf0[1] - bf0[2];
1977 bf1[3] = bf0[0] - bf0[3];
1978 bf1[4] = bf0[4];
1979 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
1980 bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
1981 bf1[7] = bf0[7];
1982 bf1[8] = bf0[8] + bf0[11];
1983 bf1[9] = bf0[9] + bf0[10];
1984 bf1[10] = bf0[9] - bf0[10];
1985 bf1[11] = bf0[8] - bf0[11];
1986 bf1[12] = -bf0[12] + bf0[15];
1987 bf1[13] = -bf0[13] + bf0[14];
1988 bf1[14] = bf0[13] + bf0[14];
1989 bf1[15] = bf0[12] + bf0[15];
1990 bf1[16] = bf0[16];
1991 bf1[17] = bf0[17];
1992 bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
1993 bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
1994 bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
1995 bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
1996 bf1[22] = bf0[22];
1997 bf1[23] = bf0[23];
1998 bf1[24] = bf0[24];
1999 bf1[25] = bf0[25];
2000 bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit[stage]);
2001 bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit[stage]);
2002 bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit[stage]);
2003 bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit[stage]);
2004 bf1[30] = bf0[30];
2005 bf1[31] = bf0[31];
2006 bf1[32] = bf0[32] + bf0[39];
2007 bf1[33] = bf0[33] + bf0[38];
2008 bf1[34] = bf0[34] + bf0[37];
2009 bf1[35] = bf0[35] + bf0[36];
2010 bf1[36] = bf0[35] - bf0[36];
2011 bf1[37] = bf0[34] - bf0[37];
2012 bf1[38] = bf0[33] - bf0[38];
2013 bf1[39] = bf0[32] - bf0[39];
2014 bf1[40] = -bf0[40] + bf0[47];
2015 bf1[41] = -bf0[41] + bf0[46];
2016 bf1[42] = -bf0[42] + bf0[45];
2017 bf1[43] = -bf0[43] + bf0[44];
2018 bf1[44] = bf0[43] + bf0[44];
2019 bf1[45] = bf0[42] + bf0[45];
2020 bf1[46] = bf0[41] + bf0[46];
2021 bf1[47] = bf0[40] + bf0[47];
2022 bf1[48] = bf0[48] + bf0[55];
2023 bf1[49] = bf0[49] + bf0[54];
2024 bf1[50] = bf0[50] + bf0[53];
2025 bf1[51] = bf0[51] + bf0[52];
2026 bf1[52] = bf0[51] - bf0[52];
2027 bf1[53] = bf0[50] - bf0[53];
2028 bf1[54] = bf0[49] - bf0[54];
2029 bf1[55] = bf0[48] - bf0[55];
2030 bf1[56] = -bf0[56] + bf0[63];
2031 bf1[57] = -bf0[57] + bf0[62];
2032 bf1[58] = -bf0[58] + bf0[61];
2033 bf1[59] = -bf0[59] + bf0[60];
2034 bf1[60] = bf0[59] + bf0[60];
2035 bf1[61] = bf0[58] + bf0[61];
2036 bf1[62] = bf0[57] + bf0[62];
2037 bf1[63] = bf0[56] + bf0[63];
2038 range_check(stage, input, bf1, size, stage_range[stage]);
2039
2040 // stage 8
2041 stage++;
2042 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
2043 bf0 = output;
2044 bf1 = step;
2045 bf1[0] = bf0[0] + bf0[7];
2046 bf1[1] = bf0[1] + bf0[6];
2047 bf1[2] = bf0[2] + bf0[5];
2048 bf1[3] = bf0[3] + bf0[4];
2049 bf1[4] = bf0[3] - bf0[4];
2050 bf1[5] = bf0[2] - bf0[5];
2051 bf1[6] = bf0[1] - bf0[6];
2052 bf1[7] = bf0[0] - bf0[7];
2053 bf1[8] = bf0[8];
2054 bf1[9] = bf0[9];
2055 bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
2056 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
2057 bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
2058 bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
2059 bf1[14] = bf0[14];
2060 bf1[15] = bf0[15];
2061 bf1[16] = bf0[16] + bf0[23];
2062 bf1[17] = bf0[17] + bf0[22];
2063 bf1[18] = bf0[18] + bf0[21];
2064 bf1[19] = bf0[19] + bf0[20];
2065 bf1[20] = bf0[19] - bf0[20];
2066 bf1[21] = bf0[18] - bf0[21];
2067 bf1[22] = bf0[17] - bf0[22];
2068 bf1[23] = bf0[16] - bf0[23];
2069 bf1[24] = -bf0[24] + bf0[31];
2070 bf1[25] = -bf0[25] + bf0[30];
2071 bf1[26] = -bf0[26] + bf0[29];
2072 bf1[27] = -bf0[27] + bf0[28];
2073 bf1[28] = bf0[27] + bf0[28];
2074 bf1[29] = bf0[26] + bf0[29];
2075 bf1[30] = bf0[25] + bf0[30];
2076 bf1[31] = bf0[24] + bf0[31];
2077 bf1[32] = bf0[32];
2078 bf1[33] = bf0[33];
2079 bf1[34] = bf0[34];
2080 bf1[35] = bf0[35];
2081 bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]);
2082 bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]);
2083 bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]);
2084 bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]);
2085 bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]);
2086 bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]);
2087 bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]);
2088 bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]);
2089 bf1[44] = bf0[44];
2090 bf1[45] = bf0[45];
2091 bf1[46] = bf0[46];
2092 bf1[47] = bf0[47];
2093 bf1[48] = bf0[48];
2094 bf1[49] = bf0[49];
2095 bf1[50] = bf0[50];
2096 bf1[51] = bf0[51];
2097 bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit[stage]);
2098 bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit[stage]);
2099 bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit[stage]);
2100 bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit[stage]);
2101 bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit[stage]);
2102 bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit[stage]);
2103 bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit[stage]);
2104 bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit[stage]);
2105 bf1[60] = bf0[60];
2106 bf1[61] = bf0[61];
2107 bf1[62] = bf0[62];
2108 bf1[63] = bf0[63];
2109 range_check(stage, input, bf1, size, stage_range[stage]);
2110
2111 // stage 9
2112 stage++;
2113 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
2114 bf0 = step;
2115 bf1 = output;
2116 bf1[0] = bf0[0] + bf0[15];
2117 bf1[1] = bf0[1] + bf0[14];
2118 bf1[2] = bf0[2] + bf0[13];
2119 bf1[3] = bf0[3] + bf0[12];
2120 bf1[4] = bf0[4] + bf0[11];
2121 bf1[5] = bf0[5] + bf0[10];
2122 bf1[6] = bf0[6] + bf0[9];
2123 bf1[7] = bf0[7] + bf0[8];
2124 bf1[8] = bf0[7] - bf0[8];
2125 bf1[9] = bf0[6] - bf0[9];
2126 bf1[10] = bf0[5] - bf0[10];
2127 bf1[11] = bf0[4] - bf0[11];
2128 bf1[12] = bf0[3] - bf0[12];
2129 bf1[13] = bf0[2] - bf0[13];
2130 bf1[14] = bf0[1] - bf0[14];
2131 bf1[15] = bf0[0] - bf0[15];
2132 bf1[16] = bf0[16];
2133 bf1[17] = bf0[17];
2134 bf1[18] = bf0[18];
2135 bf1[19] = bf0[19];
2136 bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
2137 bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
2138 bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
2139 bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
2140 bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
2141 bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
2142 bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
2143 bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
2144 bf1[28] = bf0[28];
2145 bf1[29] = bf0[29];
2146 bf1[30] = bf0[30];
2147 bf1[31] = bf0[31];
2148 bf1[32] = bf0[32] + bf0[47];
2149 bf1[33] = bf0[33] + bf0[46];
2150 bf1[34] = bf0[34] + bf0[45];
2151 bf1[35] = bf0[35] + bf0[44];
2152 bf1[36] = bf0[36] + bf0[43];
2153 bf1[37] = bf0[37] + bf0[42];
2154 bf1[38] = bf0[38] + bf0[41];
2155 bf1[39] = bf0[39] + bf0[40];
2156 bf1[40] = bf0[39] - bf0[40];
2157 bf1[41] = bf0[38] - bf0[41];
2158 bf1[42] = bf0[37] - bf0[42];
2159 bf1[43] = bf0[36] - bf0[43];
2160 bf1[44] = bf0[35] - bf0[44];
2161 bf1[45] = bf0[34] - bf0[45];
2162 bf1[46] = bf0[33] - bf0[46];
2163 bf1[47] = bf0[32] - bf0[47];
2164 bf1[48] = -bf0[48] + bf0[63];
2165 bf1[49] = -bf0[49] + bf0[62];
2166 bf1[50] = -bf0[50] + bf0[61];
2167 bf1[51] = -bf0[51] + bf0[60];
2168 bf1[52] = -bf0[52] + bf0[59];
2169 bf1[53] = -bf0[53] + bf0[58];
2170 bf1[54] = -bf0[54] + bf0[57];
2171 bf1[55] = -bf0[55] + bf0[56];
2172 bf1[56] = bf0[55] + bf0[56];
2173 bf1[57] = bf0[54] + bf0[57];
2174 bf1[58] = bf0[53] + bf0[58];
2175 bf1[59] = bf0[52] + bf0[59];
2176 bf1[60] = bf0[51] + bf0[60];
2177 bf1[61] = bf0[50] + bf0[61];
2178 bf1[62] = bf0[49] + bf0[62];
2179 bf1[63] = bf0[48] + bf0[63];
2180 range_check(stage, input, bf1, size, stage_range[stage]);
2181
2182 // stage 10
2183 stage++;
2184 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
2185 bf0 = output;
2186 bf1 = step;
2187 bf1[0] = bf0[0] + bf0[31];
2188 bf1[1] = bf0[1] + bf0[30];
2189 bf1[2] = bf0[2] + bf0[29];
2190 bf1[3] = bf0[3] + bf0[28];
2191 bf1[4] = bf0[4] + bf0[27];
2192 bf1[5] = bf0[5] + bf0[26];
2193 bf1[6] = bf0[6] + bf0[25];
2194 bf1[7] = bf0[7] + bf0[24];
2195 bf1[8] = bf0[8] + bf0[23];
2196 bf1[9] = bf0[9] + bf0[22];
2197 bf1[10] = bf0[10] + bf0[21];
2198 bf1[11] = bf0[11] + bf0[20];
2199 bf1[12] = bf0[12] + bf0[19];
2200 bf1[13] = bf0[13] + bf0[18];
2201 bf1[14] = bf0[14] + bf0[17];
2202 bf1[15] = bf0[15] + bf0[16];
2203 bf1[16] = bf0[15] - bf0[16];
2204 bf1[17] = bf0[14] - bf0[17];
2205 bf1[18] = bf0[13] - bf0[18];
2206 bf1[19] = bf0[12] - bf0[19];
2207 bf1[20] = bf0[11] - bf0[20];
2208 bf1[21] = bf0[10] - bf0[21];
2209 bf1[22] = bf0[9] - bf0[22];
2210 bf1[23] = bf0[8] - bf0[23];
2211 bf1[24] = bf0[7] - bf0[24];
2212 bf1[25] = bf0[6] - bf0[25];
2213 bf1[26] = bf0[5] - bf0[26];
2214 bf1[27] = bf0[4] - bf0[27];
2215 bf1[28] = bf0[3] - bf0[28];
2216 bf1[29] = bf0[2] - bf0[29];
2217 bf1[30] = bf0[1] - bf0[30];
2218 bf1[31] = bf0[0] - bf0[31];
2219 bf1[32] = bf0[32];
2220 bf1[33] = bf0[33];
2221 bf1[34] = bf0[34];
2222 bf1[35] = bf0[35];
2223 bf1[36] = bf0[36];
2224 bf1[37] = bf0[37];
2225 bf1[38] = bf0[38];
2226 bf1[39] = bf0[39];
2227 bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
2228 bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
2229 bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
2230 bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
2231 bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
2232 bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
2233 bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
2234 bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
2235 bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
2236 bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
2237 bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
2238 bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
2239 bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
2240 bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
2241 bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
2242 bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
2243 bf1[56] = bf0[56];
2244 bf1[57] = bf0[57];
2245 bf1[58] = bf0[58];
2246 bf1[59] = bf0[59];
2247 bf1[60] = bf0[60];
2248 bf1[61] = bf0[61];
2249 bf1[62] = bf0[62];
2250 bf1[63] = bf0[63];
2251 range_check(stage, input, bf1, size, stage_range[stage]);
2252
2253 // stage 11
2254 stage++;
2255 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
2256 bf0 = step;
2257 bf1 = output;
2258 bf1[0] = bf0[0] + bf0[63];
2259 bf1[1] = bf0[1] + bf0[62];
2260 bf1[2] = bf0[2] + bf0[61];
2261 bf1[3] = bf0[3] + bf0[60];
2262 bf1[4] = bf0[4] + bf0[59];
2263 bf1[5] = bf0[5] + bf0[58];
2264 bf1[6] = bf0[6] + bf0[57];
2265 bf1[7] = bf0[7] + bf0[56];
2266 bf1[8] = bf0[8] + bf0[55];
2267 bf1[9] = bf0[9] + bf0[54];
2268 bf1[10] = bf0[10] + bf0[53];
2269 bf1[11] = bf0[11] + bf0[52];
2270 bf1[12] = bf0[12] + bf0[51];
2271 bf1[13] = bf0[13] + bf0[50];
2272 bf1[14] = bf0[14] + bf0[49];
2273 bf1[15] = bf0[15] + bf0[48];
2274 bf1[16] = bf0[16] + bf0[47];
2275 bf1[17] = bf0[17] + bf0[46];
2276 bf1[18] = bf0[18] + bf0[45];
2277 bf1[19] = bf0[19] + bf0[44];
2278 bf1[20] = bf0[20] + bf0[43];
2279 bf1[21] = bf0[21] + bf0[42];
2280 bf1[22] = bf0[22] + bf0[41];
2281 bf1[23] = bf0[23] + bf0[40];
2282 bf1[24] = bf0[24] + bf0[39];
2283 bf1[25] = bf0[25] + bf0[38];
2284 bf1[26] = bf0[26] + bf0[37];
2285 bf1[27] = bf0[27] + bf0[36];
2286 bf1[28] = bf0[28] + bf0[35];
2287 bf1[29] = bf0[29] + bf0[34];
2288 bf1[30] = bf0[30] + bf0[33];
2289 bf1[31] = bf0[31] + bf0[32];
2290 bf1[32] = bf0[31] - bf0[32];
2291 bf1[33] = bf0[30] - bf0[33];
2292 bf1[34] = bf0[29] - bf0[34];
2293 bf1[35] = bf0[28] - bf0[35];
2294 bf1[36] = bf0[27] - bf0[36];
2295 bf1[37] = bf0[26] - bf0[37];
2296 bf1[38] = bf0[25] - bf0[38];
2297 bf1[39] = bf0[24] - bf0[39];
2298 bf1[40] = bf0[23] - bf0[40];
2299 bf1[41] = bf0[22] - bf0[41];
2300 bf1[42] = bf0[21] - bf0[42];
2301 bf1[43] = bf0[20] - bf0[43];
2302 bf1[44] = bf0[19] - bf0[44];
2303 bf1[45] = bf0[18] - bf0[45];
2304 bf1[46] = bf0[17] - bf0[46];
2305 bf1[47] = bf0[16] - bf0[47];
2306 bf1[48] = bf0[15] - bf0[48];
2307 bf1[49] = bf0[14] - bf0[49];
2308 bf1[50] = bf0[13] - bf0[50];
2309 bf1[51] = bf0[12] - bf0[51];
2310 bf1[52] = bf0[11] - bf0[52];
2311 bf1[53] = bf0[10] - bf0[53];
2312 bf1[54] = bf0[9] - bf0[54];
2313 bf1[55] = bf0[8] - bf0[55];
2314 bf1[56] = bf0[7] - bf0[56];
2315 bf1[57] = bf0[6] - bf0[57];
2316 bf1[58] = bf0[5] - bf0[58];
2317 bf1[59] = bf0[4] - bf0[59];
2318 bf1[60] = bf0[3] - bf0[60];
2319 bf1[61] = bf0[2] - bf0[61];
2320 bf1[62] = bf0[1] - bf0[62];
2321 bf1[63] = bf0[0] - bf0[63];
2322 range_check(stage, input, bf1, size, stage_range[stage]);
2323}
2324#endif // CONFIG_TX64X64