blob: 726445f2a9bf5e8b7ccde73c78791e0e14f19e7d [file] [log] [blame]
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
12#include <stdlib.h>
Yaowu Xuf883b422016-08-30 14:01:10 -070013#include "av1/common/av1_fwd_txfm1d.h"
// range_check(stage, input, buf, size, bit)
//
// When CONFIG_COEFFICIENT_RANGE_CHECKING is non-zero: walks the first `size`
// entries of `buf` and, for each one, compares
// get_max_bit(abs(buf[i])) + 1 against `bit`. On the first entry that exceeds
// `bit` it prints the file/line, the stage, the offending node and value, and
// the first `size` entries of `input`, then triggers assert(0).
//
// Otherwise: expands to a statement that only marks its arguments as used.
//
// Both variants are wrapped in do { ... } while (0) so that the macro behaves
// as a single statement (safe inside an unbraced if/else), and every argument
// is parenthesized. Arguments may be evaluated more than once in the checking
// variant, so they must be free of side effects.
#if CONFIG_COEFFICIENT_RANGE_CHECKING
#define range_check(stage, input, buf, size, bit)                           \
  do {                                                                      \
    int rc_i_, rc_j_;                                                       \
    for (rc_i_ = 0; rc_i_ < (size); ++rc_i_) {                              \
      int rc_buf_bit_ = get_max_bit(abs((buf)[rc_i_])) + 1;                 \
      if (rc_buf_bit_ > (bit)) {                                            \
        printf("======== %s %d overflow ========\n", __FILE__, __LINE__);   \
        printf("stage: %d node: %d\n", (stage), rc_i_);                     \
        printf("bit: %d buf_bit: %d buf[i]: %d\n", (bit), rc_buf_bit_,      \
               (buf)[rc_i_]);                                               \
        printf("input:\n");                                                 \
        for (rc_j_ = 0; rc_j_ < (size); rc_j_++) {                          \
          printf("%d,", (input)[rc_j_]);                                    \
        }                                                                   \
        printf("\n");                                                       \
        assert(0);                                                          \
      }                                                                     \
    }                                                                       \
  } while (0)
#else
#define range_check(stage, input, buf, size, bit) \
  do {                                            \
    (void)(stage);                                \
    (void)(input);                                \
    (void)(buf);                                  \
    (void)(size);                                 \
    (void)(bit);                                  \
  } while (0)
#endif
43
Angie Chiang792519b2016-10-18 12:24:20 -070044// TODO(angiebird): Make 1-d txfm functions static
Yaowu Xuf883b422016-08-30 14:01:10 -070045void av1_fdct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
46 const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -070047 const int32_t size = 4;
48 const int32_t *cospi;
49
50 int32_t stage = 0;
51 int32_t *bf0, *bf1;
52 int32_t step[4];
53
54 // stage 0;
55 range_check(stage, input, input, size, stage_range[stage]);
56
57 // stage 1;
58 stage++;
59 bf1 = output;
60 bf1[0] = input[0] + input[3];
61 bf1[1] = input[1] + input[2];
62 bf1[2] = -input[2] + input[1];
63 bf1[3] = -input[3] + input[0];
64 range_check(stage, input, bf1, size, stage_range[stage]);
65
66 // stage 2
67 stage++;
68 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
69 bf0 = output;
70 bf1 = step;
71 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
72 bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
73 bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
74 bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
75 range_check(stage, input, bf1, size, stage_range[stage]);
76
77 // stage 3
78 stage++;
79 bf0 = step;
80 bf1 = output;
81 bf1[0] = bf0[0];
82 bf1[1] = bf0[2];
83 bf1[2] = bf0[1];
84 bf1[3] = bf0[3];
85 range_check(stage, input, bf1, size, stage_range[stage]);
86}
87
Yaowu Xuf883b422016-08-30 14:01:10 -070088void av1_fdct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
89 const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -070090 const int32_t size = 8;
91 const int32_t *cospi;
92
93 int32_t stage = 0;
94 int32_t *bf0, *bf1;
95 int32_t step[8];
96
97 // stage 0;
98 range_check(stage, input, input, size, stage_range[stage]);
99
100 // stage 1;
101 stage++;
102 bf1 = output;
103 bf1[0] = input[0] + input[7];
104 bf1[1] = input[1] + input[6];
105 bf1[2] = input[2] + input[5];
106 bf1[3] = input[3] + input[4];
107 bf1[4] = -input[4] + input[3];
108 bf1[5] = -input[5] + input[2];
109 bf1[6] = -input[6] + input[1];
110 bf1[7] = -input[7] + input[0];
111 range_check(stage, input, bf1, size, stage_range[stage]);
112
113 // stage 2
114 stage++;
115 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
116 bf0 = output;
117 bf1 = step;
118 bf1[0] = bf0[0] + bf0[3];
119 bf1[1] = bf0[1] + bf0[2];
120 bf1[2] = -bf0[2] + bf0[1];
121 bf1[3] = -bf0[3] + bf0[0];
122 bf1[4] = bf0[4];
123 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
124 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
125 bf1[7] = bf0[7];
126 range_check(stage, input, bf1, size, stage_range[stage]);
127
128 // stage 3
129 stage++;
130 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
131 bf0 = step;
132 bf1 = output;
133 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
134 bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
135 bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
136 bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
137 bf1[4] = bf0[4] + bf0[5];
138 bf1[5] = -bf0[5] + bf0[4];
139 bf1[6] = -bf0[6] + bf0[7];
140 bf1[7] = bf0[7] + bf0[6];
141 range_check(stage, input, bf1, size, stage_range[stage]);
142
143 // stage 4
144 stage++;
145 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
146 bf0 = output;
147 bf1 = step;
148 bf1[0] = bf0[0];
149 bf1[1] = bf0[1];
150 bf1[2] = bf0[2];
151 bf1[3] = bf0[3];
152 bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
153 bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
154 bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
155 bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
156 range_check(stage, input, bf1, size, stage_range[stage]);
157
158 // stage 5
159 stage++;
160 bf0 = step;
161 bf1 = output;
162 bf1[0] = bf0[0];
163 bf1[1] = bf0[4];
164 bf1[2] = bf0[2];
165 bf1[3] = bf0[6];
166 bf1[4] = bf0[1];
167 bf1[5] = bf0[5];
168 bf1[6] = bf0[3];
169 bf1[7] = bf0[7];
170 range_check(stage, input, bf1, size, stage_range[stage]);
171}
172
Yaowu Xuf883b422016-08-30 14:01:10 -0700173void av1_fdct16_new(const int32_t *input, int32_t *output,
174 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700175 const int32_t size = 16;
176 const int32_t *cospi;
177
178 int32_t stage = 0;
179 int32_t *bf0, *bf1;
180 int32_t step[16];
181
182 // stage 0;
183 range_check(stage, input, input, size, stage_range[stage]);
184
185 // stage 1;
186 stage++;
187 bf1 = output;
188 bf1[0] = input[0] + input[15];
189 bf1[1] = input[1] + input[14];
190 bf1[2] = input[2] + input[13];
191 bf1[3] = input[3] + input[12];
192 bf1[4] = input[4] + input[11];
193 bf1[5] = input[5] + input[10];
194 bf1[6] = input[6] + input[9];
195 bf1[7] = input[7] + input[8];
196 bf1[8] = -input[8] + input[7];
197 bf1[9] = -input[9] + input[6];
198 bf1[10] = -input[10] + input[5];
199 bf1[11] = -input[11] + input[4];
200 bf1[12] = -input[12] + input[3];
201 bf1[13] = -input[13] + input[2];
202 bf1[14] = -input[14] + input[1];
203 bf1[15] = -input[15] + input[0];
204 range_check(stage, input, bf1, size, stage_range[stage]);
205
206 // stage 2
207 stage++;
208 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
209 bf0 = output;
210 bf1 = step;
211 bf1[0] = bf0[0] + bf0[7];
212 bf1[1] = bf0[1] + bf0[6];
213 bf1[2] = bf0[2] + bf0[5];
214 bf1[3] = bf0[3] + bf0[4];
215 bf1[4] = -bf0[4] + bf0[3];
216 bf1[5] = -bf0[5] + bf0[2];
217 bf1[6] = -bf0[6] + bf0[1];
218 bf1[7] = -bf0[7] + bf0[0];
219 bf1[8] = bf0[8];
220 bf1[9] = bf0[9];
221 bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
222 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
223 bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
224 bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
225 bf1[14] = bf0[14];
226 bf1[15] = bf0[15];
227 range_check(stage, input, bf1, size, stage_range[stage]);
228
229 // stage 3
230 stage++;
231 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
232 bf0 = step;
233 bf1 = output;
234 bf1[0] = bf0[0] + bf0[3];
235 bf1[1] = bf0[1] + bf0[2];
236 bf1[2] = -bf0[2] + bf0[1];
237 bf1[3] = -bf0[3] + bf0[0];
238 bf1[4] = bf0[4];
239 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
240 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
241 bf1[7] = bf0[7];
242 bf1[8] = bf0[8] + bf0[11];
243 bf1[9] = bf0[9] + bf0[10];
244 bf1[10] = -bf0[10] + bf0[9];
245 bf1[11] = -bf0[11] + bf0[8];
246 bf1[12] = -bf0[12] + bf0[15];
247 bf1[13] = -bf0[13] + bf0[14];
248 bf1[14] = bf0[14] + bf0[13];
249 bf1[15] = bf0[15] + bf0[12];
250 range_check(stage, input, bf1, size, stage_range[stage]);
251
252 // stage 4
253 stage++;
254 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
255 bf0 = output;
256 bf1 = step;
257 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
258 bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
259 bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
260 bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
261 bf1[4] = bf0[4] + bf0[5];
262 bf1[5] = -bf0[5] + bf0[4];
263 bf1[6] = -bf0[6] + bf0[7];
264 bf1[7] = bf0[7] + bf0[6];
265 bf1[8] = bf0[8];
266 bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
267 bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
268 bf1[11] = bf0[11];
269 bf1[12] = bf0[12];
270 bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
271 bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
272 bf1[15] = bf0[15];
273 range_check(stage, input, bf1, size, stage_range[stage]);
274
275 // stage 5
276 stage++;
277 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
278 bf0 = step;
279 bf1 = output;
280 bf1[0] = bf0[0];
281 bf1[1] = bf0[1];
282 bf1[2] = bf0[2];
283 bf1[3] = bf0[3];
284 bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
285 bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
286 bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
287 bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
288 bf1[8] = bf0[8] + bf0[9];
289 bf1[9] = -bf0[9] + bf0[8];
290 bf1[10] = -bf0[10] + bf0[11];
291 bf1[11] = bf0[11] + bf0[10];
292 bf1[12] = bf0[12] + bf0[13];
293 bf1[13] = -bf0[13] + bf0[12];
294 bf1[14] = -bf0[14] + bf0[15];
295 bf1[15] = bf0[15] + bf0[14];
296 range_check(stage, input, bf1, size, stage_range[stage]);
297
298 // stage 6
299 stage++;
300 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
301 bf0 = output;
302 bf1 = step;
303 bf1[0] = bf0[0];
304 bf1[1] = bf0[1];
305 bf1[2] = bf0[2];
306 bf1[3] = bf0[3];
307 bf1[4] = bf0[4];
308 bf1[5] = bf0[5];
309 bf1[6] = bf0[6];
310 bf1[7] = bf0[7];
311 bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
312 bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
313 bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
314 bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
315 bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
316 bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
317 bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
318 bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
319 range_check(stage, input, bf1, size, stage_range[stage]);
320
321 // stage 7
322 stage++;
323 bf0 = step;
324 bf1 = output;
325 bf1[0] = bf0[0];
326 bf1[1] = bf0[8];
327 bf1[2] = bf0[4];
328 bf1[3] = bf0[12];
329 bf1[4] = bf0[2];
330 bf1[5] = bf0[10];
331 bf1[6] = bf0[6];
332 bf1[7] = bf0[14];
333 bf1[8] = bf0[1];
334 bf1[9] = bf0[9];
335 bf1[10] = bf0[5];
336 bf1[11] = bf0[13];
337 bf1[12] = bf0[3];
338 bf1[13] = bf0[11];
339 bf1[14] = bf0[7];
340 bf1[15] = bf0[15];
341 range_check(stage, input, bf1, size, stage_range[stage]);
342}
343
Yaowu Xuf883b422016-08-30 14:01:10 -0700344void av1_fdct32_new(const int32_t *input, int32_t *output,
345 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700346 const int32_t size = 32;
347 const int32_t *cospi;
348
349 int32_t stage = 0;
350 int32_t *bf0, *bf1;
351 int32_t step[32];
352
353 // stage 0;
354 range_check(stage, input, input, size, stage_range[stage]);
355
356 // stage 1;
357 stage++;
358 bf1 = output;
359 bf1[0] = input[0] + input[31];
360 bf1[1] = input[1] + input[30];
361 bf1[2] = input[2] + input[29];
362 bf1[3] = input[3] + input[28];
363 bf1[4] = input[4] + input[27];
364 bf1[5] = input[5] + input[26];
365 bf1[6] = input[6] + input[25];
366 bf1[7] = input[7] + input[24];
367 bf1[8] = input[8] + input[23];
368 bf1[9] = input[9] + input[22];
369 bf1[10] = input[10] + input[21];
370 bf1[11] = input[11] + input[20];
371 bf1[12] = input[12] + input[19];
372 bf1[13] = input[13] + input[18];
373 bf1[14] = input[14] + input[17];
374 bf1[15] = input[15] + input[16];
375 bf1[16] = -input[16] + input[15];
376 bf1[17] = -input[17] + input[14];
377 bf1[18] = -input[18] + input[13];
378 bf1[19] = -input[19] + input[12];
379 bf1[20] = -input[20] + input[11];
380 bf1[21] = -input[21] + input[10];
381 bf1[22] = -input[22] + input[9];
382 bf1[23] = -input[23] + input[8];
383 bf1[24] = -input[24] + input[7];
384 bf1[25] = -input[25] + input[6];
385 bf1[26] = -input[26] + input[5];
386 bf1[27] = -input[27] + input[4];
387 bf1[28] = -input[28] + input[3];
388 bf1[29] = -input[29] + input[2];
389 bf1[30] = -input[30] + input[1];
390 bf1[31] = -input[31] + input[0];
391 range_check(stage, input, bf1, size, stage_range[stage]);
392
393 // stage 2
394 stage++;
395 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
396 bf0 = output;
397 bf1 = step;
398 bf1[0] = bf0[0] + bf0[15];
399 bf1[1] = bf0[1] + bf0[14];
400 bf1[2] = bf0[2] + bf0[13];
401 bf1[3] = bf0[3] + bf0[12];
402 bf1[4] = bf0[4] + bf0[11];
403 bf1[5] = bf0[5] + bf0[10];
404 bf1[6] = bf0[6] + bf0[9];
405 bf1[7] = bf0[7] + bf0[8];
406 bf1[8] = -bf0[8] + bf0[7];
407 bf1[9] = -bf0[9] + bf0[6];
408 bf1[10] = -bf0[10] + bf0[5];
409 bf1[11] = -bf0[11] + bf0[4];
410 bf1[12] = -bf0[12] + bf0[3];
411 bf1[13] = -bf0[13] + bf0[2];
412 bf1[14] = -bf0[14] + bf0[1];
413 bf1[15] = -bf0[15] + bf0[0];
414 bf1[16] = bf0[16];
415 bf1[17] = bf0[17];
416 bf1[18] = bf0[18];
417 bf1[19] = bf0[19];
418 bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
419 bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
420 bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
421 bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
422 bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]);
423 bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]);
424 bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]);
425 bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]);
426 bf1[28] = bf0[28];
427 bf1[29] = bf0[29];
428 bf1[30] = bf0[30];
429 bf1[31] = bf0[31];
430 range_check(stage, input, bf1, size, stage_range[stage]);
431
432 // stage 3
433 stage++;
434 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
435 bf0 = step;
436 bf1 = output;
437 bf1[0] = bf0[0] + bf0[7];
438 bf1[1] = bf0[1] + bf0[6];
439 bf1[2] = bf0[2] + bf0[5];
440 bf1[3] = bf0[3] + bf0[4];
441 bf1[4] = -bf0[4] + bf0[3];
442 bf1[5] = -bf0[5] + bf0[2];
443 bf1[6] = -bf0[6] + bf0[1];
444 bf1[7] = -bf0[7] + bf0[0];
445 bf1[8] = bf0[8];
446 bf1[9] = bf0[9];
447 bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
448 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
449 bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
450 bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
451 bf1[14] = bf0[14];
452 bf1[15] = bf0[15];
453 bf1[16] = bf0[16] + bf0[23];
454 bf1[17] = bf0[17] + bf0[22];
455 bf1[18] = bf0[18] + bf0[21];
456 bf1[19] = bf0[19] + bf0[20];
457 bf1[20] = -bf0[20] + bf0[19];
458 bf1[21] = -bf0[21] + bf0[18];
459 bf1[22] = -bf0[22] + bf0[17];
460 bf1[23] = -bf0[23] + bf0[16];
461 bf1[24] = -bf0[24] + bf0[31];
462 bf1[25] = -bf0[25] + bf0[30];
463 bf1[26] = -bf0[26] + bf0[29];
464 bf1[27] = -bf0[27] + bf0[28];
465 bf1[28] = bf0[28] + bf0[27];
466 bf1[29] = bf0[29] + bf0[26];
467 bf1[30] = bf0[30] + bf0[25];
468 bf1[31] = bf0[31] + bf0[24];
469 range_check(stage, input, bf1, size, stage_range[stage]);
470
471 // stage 4
472 stage++;
473 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
474 bf0 = output;
475 bf1 = step;
476 bf1[0] = bf0[0] + bf0[3];
477 bf1[1] = bf0[1] + bf0[2];
478 bf1[2] = -bf0[2] + bf0[1];
479 bf1[3] = -bf0[3] + bf0[0];
480 bf1[4] = bf0[4];
481 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
482 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
483 bf1[7] = bf0[7];
484 bf1[8] = bf0[8] + bf0[11];
485 bf1[9] = bf0[9] + bf0[10];
486 bf1[10] = -bf0[10] + bf0[9];
487 bf1[11] = -bf0[11] + bf0[8];
488 bf1[12] = -bf0[12] + bf0[15];
489 bf1[13] = -bf0[13] + bf0[14];
490 bf1[14] = bf0[14] + bf0[13];
491 bf1[15] = bf0[15] + bf0[12];
492 bf1[16] = bf0[16];
493 bf1[17] = bf0[17];
494 bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
495 bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
496 bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
497 bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
498 bf1[22] = bf0[22];
499 bf1[23] = bf0[23];
500 bf1[24] = bf0[24];
501 bf1[25] = bf0[25];
502 bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]);
503 bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]);
504 bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]);
505 bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]);
506 bf1[30] = bf0[30];
507 bf1[31] = bf0[31];
508 range_check(stage, input, bf1, size, stage_range[stage]);
509
510 // stage 5
511 stage++;
512 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
513 bf0 = step;
514 bf1 = output;
515 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
516 bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
517 bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
518 bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
519 bf1[4] = bf0[4] + bf0[5];
520 bf1[5] = -bf0[5] + bf0[4];
521 bf1[6] = -bf0[6] + bf0[7];
522 bf1[7] = bf0[7] + bf0[6];
523 bf1[8] = bf0[8];
524 bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
525 bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
526 bf1[11] = bf0[11];
527 bf1[12] = bf0[12];
528 bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
529 bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
530 bf1[15] = bf0[15];
531 bf1[16] = bf0[16] + bf0[19];
532 bf1[17] = bf0[17] + bf0[18];
533 bf1[18] = -bf0[18] + bf0[17];
534 bf1[19] = -bf0[19] + bf0[16];
535 bf1[20] = -bf0[20] + bf0[23];
536 bf1[21] = -bf0[21] + bf0[22];
537 bf1[22] = bf0[22] + bf0[21];
538 bf1[23] = bf0[23] + bf0[20];
539 bf1[24] = bf0[24] + bf0[27];
540 bf1[25] = bf0[25] + bf0[26];
541 bf1[26] = -bf0[26] + bf0[25];
542 bf1[27] = -bf0[27] + bf0[24];
543 bf1[28] = -bf0[28] + bf0[31];
544 bf1[29] = -bf0[29] + bf0[30];
545 bf1[30] = bf0[30] + bf0[29];
546 bf1[31] = bf0[31] + bf0[28];
547 range_check(stage, input, bf1, size, stage_range[stage]);
548
549 // stage 6
550 stage++;
551 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
552 bf0 = output;
553 bf1 = step;
554 bf1[0] = bf0[0];
555 bf1[1] = bf0[1];
556 bf1[2] = bf0[2];
557 bf1[3] = bf0[3];
558 bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
559 bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
560 bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
561 bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
562 bf1[8] = bf0[8] + bf0[9];
563 bf1[9] = -bf0[9] + bf0[8];
564 bf1[10] = -bf0[10] + bf0[11];
565 bf1[11] = bf0[11] + bf0[10];
566 bf1[12] = bf0[12] + bf0[13];
567 bf1[13] = -bf0[13] + bf0[12];
568 bf1[14] = -bf0[14] + bf0[15];
569 bf1[15] = bf0[15] + bf0[14];
570 bf1[16] = bf0[16];
571 bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
572 bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
573 bf1[19] = bf0[19];
574 bf1[20] = bf0[20];
575 bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
576 bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
577 bf1[23] = bf0[23];
578 bf1[24] = bf0[24];
579 bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]);
580 bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]);
581 bf1[27] = bf0[27];
582 bf1[28] = bf0[28];
583 bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]);
584 bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]);
585 bf1[31] = bf0[31];
586 range_check(stage, input, bf1, size, stage_range[stage]);
587
588 // stage 7
589 stage++;
590 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
591 bf0 = step;
592 bf1 = output;
593 bf1[0] = bf0[0];
594 bf1[1] = bf0[1];
595 bf1[2] = bf0[2];
596 bf1[3] = bf0[3];
597 bf1[4] = bf0[4];
598 bf1[5] = bf0[5];
599 bf1[6] = bf0[6];
600 bf1[7] = bf0[7];
601 bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
602 bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
603 bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
604 bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
605 bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
606 bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
607 bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
608 bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
609 bf1[16] = bf0[16] + bf0[17];
610 bf1[17] = -bf0[17] + bf0[16];
611 bf1[18] = -bf0[18] + bf0[19];
612 bf1[19] = bf0[19] + bf0[18];
613 bf1[20] = bf0[20] + bf0[21];
614 bf1[21] = -bf0[21] + bf0[20];
615 bf1[22] = -bf0[22] + bf0[23];
616 bf1[23] = bf0[23] + bf0[22];
617 bf1[24] = bf0[24] + bf0[25];
618 bf1[25] = -bf0[25] + bf0[24];
619 bf1[26] = -bf0[26] + bf0[27];
620 bf1[27] = bf0[27] + bf0[26];
621 bf1[28] = bf0[28] + bf0[29];
622 bf1[29] = -bf0[29] + bf0[28];
623 bf1[30] = -bf0[30] + bf0[31];
624 bf1[31] = bf0[31] + bf0[30];
625 range_check(stage, input, bf1, size, stage_range[stage]);
626
627 // stage 8
628 stage++;
629 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
630 bf0 = output;
631 bf1 = step;
632 bf1[0] = bf0[0];
633 bf1[1] = bf0[1];
634 bf1[2] = bf0[2];
635 bf1[3] = bf0[3];
636 bf1[4] = bf0[4];
637 bf1[5] = bf0[5];
638 bf1[6] = bf0[6];
639 bf1[7] = bf0[7];
640 bf1[8] = bf0[8];
641 bf1[9] = bf0[9];
642 bf1[10] = bf0[10];
643 bf1[11] = bf0[11];
644 bf1[12] = bf0[12];
645 bf1[13] = bf0[13];
646 bf1[14] = bf0[14];
647 bf1[15] = bf0[15];
648 bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]);
649 bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]);
650 bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]);
651 bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]);
652 bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]);
653 bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]);
654 bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]);
655 bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]);
656 bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]);
657 bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]);
658 bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]);
659 bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]);
660 bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]);
661 bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]);
662 bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]);
663 bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]);
664 range_check(stage, input, bf1, size, stage_range[stage]);
665
666 // stage 9
667 stage++;
668 bf0 = step;
669 bf1 = output;
670 bf1[0] = bf0[0];
671 bf1[1] = bf0[16];
672 bf1[2] = bf0[8];
673 bf1[3] = bf0[24];
674 bf1[4] = bf0[4];
675 bf1[5] = bf0[20];
676 bf1[6] = bf0[12];
677 bf1[7] = bf0[28];
678 bf1[8] = bf0[2];
679 bf1[9] = bf0[18];
680 bf1[10] = bf0[10];
681 bf1[11] = bf0[26];
682 bf1[12] = bf0[6];
683 bf1[13] = bf0[22];
684 bf1[14] = bf0[14];
685 bf1[15] = bf0[30];
686 bf1[16] = bf0[1];
687 bf1[17] = bf0[17];
688 bf1[18] = bf0[9];
689 bf1[19] = bf0[25];
690 bf1[20] = bf0[5];
691 bf1[21] = bf0[21];
692 bf1[22] = bf0[13];
693 bf1[23] = bf0[29];
694 bf1[24] = bf0[3];
695 bf1[25] = bf0[19];
696 bf1[26] = bf0[11];
697 bf1[27] = bf0[27];
698 bf1[28] = bf0[7];
699 bf1[29] = bf0[23];
700 bf1[30] = bf0[15];
701 bf1[31] = bf0[31];
702 range_check(stage, input, bf1, size, stage_range[stage]);
703}
704
Yaowu Xuf883b422016-08-30 14:01:10 -0700705void av1_fadst4_new(const int32_t *input, int32_t *output,
706 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700707 const int32_t size = 4;
708 const int32_t *cospi;
709
710 int32_t stage = 0;
711 int32_t *bf0, *bf1;
712 int32_t step[4];
713
714 // stage 0;
715 range_check(stage, input, input, size, stage_range[stage]);
716
717 // stage 1;
718 stage++;
719 bf1 = output;
720 bf1[0] = input[3];
721 bf1[1] = input[0];
722 bf1[2] = input[1];
723 bf1[3] = input[2];
724 range_check(stage, input, bf1, size, stage_range[stage]);
725
726 // stage 2
727 stage++;
728 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
729 bf0 = output;
730 bf1 = step;
731 bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]);
732 bf1[1] = half_btf(-cospi[8], bf0[1], cospi[56], bf0[0], cos_bit[stage]);
733 bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]);
734 bf1[3] = half_btf(-cospi[40], bf0[3], cospi[24], bf0[2], cos_bit[stage]);
735 range_check(stage, input, bf1, size, stage_range[stage]);
736
737 // stage 3
738 stage++;
Yaowu Xuc27fc142016-08-22 16:08:15 -0700739 bf0 = step;
740 bf1 = output;
741 bf1[0] = bf0[0] + bf0[2];
742 bf1[1] = bf0[1] + bf0[3];
743 bf1[2] = -bf0[2] + bf0[0];
744 bf1[3] = -bf0[3] + bf0[1];
745 range_check(stage, input, bf1, size, stage_range[stage]);
746
747 // stage 4
748 stage++;
749 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
750 bf0 = output;
751 bf1 = step;
752 bf1[0] = bf0[0];
753 bf1[1] = bf0[1];
754 bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
755 bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
756 range_check(stage, input, bf1, size, stage_range[stage]);
757
758 // stage 5
759 stage++;
760 bf0 = step;
761 bf1 = output;
762 bf1[0] = bf0[0];
763 bf1[1] = -bf0[2];
764 bf1[2] = bf0[3];
765 bf1[3] = -bf0[1];
766 range_check(stage, input, bf1, size, stage_range[stage]);
767}
768
Yaowu Xuf883b422016-08-30 14:01:10 -0700769void av1_fadst8_new(const int32_t *input, int32_t *output,
770 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700771 const int32_t size = 8;
772 const int32_t *cospi;
773
774 int32_t stage = 0;
775 int32_t *bf0, *bf1;
776 int32_t step[8];
777
778 // stage 0;
779 range_check(stage, input, input, size, stage_range[stage]);
780
781 // stage 1;
782 stage++;
783 bf1 = output;
784 bf1[0] = input[7];
785 bf1[1] = input[0];
786 bf1[2] = input[5];
787 bf1[3] = input[2];
788 bf1[4] = input[3];
789 bf1[5] = input[4];
790 bf1[6] = input[1];
791 bf1[7] = input[6];
792 range_check(stage, input, bf1, size, stage_range[stage]);
793
794 // stage 2
795 stage++;
796 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
797 bf0 = output;
798 bf1 = step;
799 bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit[stage]);
800 bf1[1] = half_btf(-cospi[4], bf0[1], cospi[60], bf0[0], cos_bit[stage]);
801 bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit[stage]);
802 bf1[3] = half_btf(-cospi[20], bf0[3], cospi[44], bf0[2], cos_bit[stage]);
803 bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit[stage]);
804 bf1[5] = half_btf(-cospi[36], bf0[5], cospi[28], bf0[4], cos_bit[stage]);
805 bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit[stage]);
806 bf1[7] = half_btf(-cospi[52], bf0[7], cospi[12], bf0[6], cos_bit[stage]);
807 range_check(stage, input, bf1, size, stage_range[stage]);
808
809 // stage 3
810 stage++;
Yaowu Xuc27fc142016-08-22 16:08:15 -0700811 bf0 = step;
812 bf1 = output;
813 bf1[0] = bf0[0] + bf0[4];
814 bf1[1] = bf0[1] + bf0[5];
815 bf1[2] = bf0[2] + bf0[6];
816 bf1[3] = bf0[3] + bf0[7];
817 bf1[4] = -bf0[4] + bf0[0];
818 bf1[5] = -bf0[5] + bf0[1];
819 bf1[6] = -bf0[6] + bf0[2];
820 bf1[7] = -bf0[7] + bf0[3];
821 range_check(stage, input, bf1, size, stage_range[stage]);
822
823 // stage 4
824 stage++;
825 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
826 bf0 = output;
827 bf1 = step;
828 bf1[0] = bf0[0];
829 bf1[1] = bf0[1];
830 bf1[2] = bf0[2];
831 bf1[3] = bf0[3];
832 bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
833 bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
834 bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
835 bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
836 range_check(stage, input, bf1, size, stage_range[stage]);
837
838 // stage 5
839 stage++;
Yaowu Xuc27fc142016-08-22 16:08:15 -0700840 bf0 = step;
841 bf1 = output;
842 bf1[0] = bf0[0] + bf0[2];
843 bf1[1] = bf0[1] + bf0[3];
844 bf1[2] = -bf0[2] + bf0[0];
845 bf1[3] = -bf0[3] + bf0[1];
846 bf1[4] = bf0[4] + bf0[6];
847 bf1[5] = bf0[5] + bf0[7];
848 bf1[6] = -bf0[6] + bf0[4];
849 bf1[7] = -bf0[7] + bf0[5];
850 range_check(stage, input, bf1, size, stage_range[stage]);
851
852 // stage 6
853 stage++;
854 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
855 bf0 = output;
856 bf1 = step;
857 bf1[0] = bf0[0];
858 bf1[1] = bf0[1];
859 bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
860 bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
861 bf1[4] = bf0[4];
862 bf1[5] = bf0[5];
863 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
864 bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
865 range_check(stage, input, bf1, size, stage_range[stage]);
866
867 // stage 7
868 stage++;
869 bf0 = step;
870 bf1 = output;
871 bf1[0] = bf0[0];
872 bf1[1] = -bf0[4];
873 bf1[2] = bf0[6];
874 bf1[3] = -bf0[2];
875 bf1[4] = bf0[3];
876 bf1[5] = -bf0[7];
877 bf1[6] = bf0[5];
878 bf1[7] = -bf0[1];
879 range_check(stage, input, bf1, size, stage_range[stage]);
880}
881
Yaowu Xuf883b422016-08-30 14:01:10 -0700882void av1_fadst16_new(const int32_t *input, int32_t *output,
883 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700884 const int32_t size = 16;
885 const int32_t *cospi;
886
887 int32_t stage = 0;
888 int32_t *bf0, *bf1;
889 int32_t step[16];
890
891 // stage 0;
892 range_check(stage, input, input, size, stage_range[stage]);
893
894 // stage 1;
895 stage++;
896 bf1 = output;
897 bf1[0] = input[15];
898 bf1[1] = input[0];
899 bf1[2] = input[13];
900 bf1[3] = input[2];
901 bf1[4] = input[11];
902 bf1[5] = input[4];
903 bf1[6] = input[9];
904 bf1[7] = input[6];
905 bf1[8] = input[7];
906 bf1[9] = input[8];
907 bf1[10] = input[5];
908 bf1[11] = input[10];
909 bf1[12] = input[3];
910 bf1[13] = input[12];
911 bf1[14] = input[1];
912 bf1[15] = input[14];
913 range_check(stage, input, bf1, size, stage_range[stage]);
914
915 // stage 2
916 stage++;
917 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
918 bf0 = output;
919 bf1 = step;
920 bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit[stage]);
921 bf1[1] = half_btf(-cospi[2], bf0[1], cospi[62], bf0[0], cos_bit[stage]);
922 bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit[stage]);
923 bf1[3] = half_btf(-cospi[10], bf0[3], cospi[54], bf0[2], cos_bit[stage]);
924 bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit[stage]);
925 bf1[5] = half_btf(-cospi[18], bf0[5], cospi[46], bf0[4], cos_bit[stage]);
926 bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit[stage]);
927 bf1[7] = half_btf(-cospi[26], bf0[7], cospi[38], bf0[6], cos_bit[stage]);
928 bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit[stage]);
929 bf1[9] = half_btf(-cospi[34], bf0[9], cospi[30], bf0[8], cos_bit[stage]);
930 bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit[stage]);
931 bf1[11] = half_btf(-cospi[42], bf0[11], cospi[22], bf0[10], cos_bit[stage]);
932 bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit[stage]);
933 bf1[13] = half_btf(-cospi[50], bf0[13], cospi[14], bf0[12], cos_bit[stage]);
934 bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit[stage]);
935 bf1[15] = half_btf(-cospi[58], bf0[15], cospi[6], bf0[14], cos_bit[stage]);
936 range_check(stage, input, bf1, size, stage_range[stage]);
937
938 // stage 3
939 stage++;
940 bf0 = step;
941 bf1 = output;
942 bf1[0] = bf0[0] + bf0[8];
943 bf1[1] = bf0[1] + bf0[9];
944 bf1[2] = bf0[2] + bf0[10];
945 bf1[3] = bf0[3] + bf0[11];
946 bf1[4] = bf0[4] + bf0[12];
947 bf1[5] = bf0[5] + bf0[13];
948 bf1[6] = bf0[6] + bf0[14];
949 bf1[7] = bf0[7] + bf0[15];
950 bf1[8] = -bf0[8] + bf0[0];
951 bf1[9] = -bf0[9] + bf0[1];
952 bf1[10] = -bf0[10] + bf0[2];
953 bf1[11] = -bf0[11] + bf0[3];
954 bf1[12] = -bf0[12] + bf0[4];
955 bf1[13] = -bf0[13] + bf0[5];
956 bf1[14] = -bf0[14] + bf0[6];
957 bf1[15] = -bf0[15] + bf0[7];
958 range_check(stage, input, bf1, size, stage_range[stage]);
959
960 // stage 4
961 stage++;
962 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
963 bf0 = output;
964 bf1 = step;
965 bf1[0] = bf0[0];
966 bf1[1] = bf0[1];
967 bf1[2] = bf0[2];
968 bf1[3] = bf0[3];
969 bf1[4] = bf0[4];
970 bf1[5] = bf0[5];
971 bf1[6] = bf0[6];
972 bf1[7] = bf0[7];
973 bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
974 bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]);
975 bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
976 bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]);
977 bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
978 bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]);
979 bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
980 bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]);
981 range_check(stage, input, bf1, size, stage_range[stage]);
982
983 // stage 5
984 stage++;
985 bf0 = step;
986 bf1 = output;
987 bf1[0] = bf0[0] + bf0[4];
988 bf1[1] = bf0[1] + bf0[5];
989 bf1[2] = bf0[2] + bf0[6];
990 bf1[3] = bf0[3] + bf0[7];
991 bf1[4] = -bf0[4] + bf0[0];
992 bf1[5] = -bf0[5] + bf0[1];
993 bf1[6] = -bf0[6] + bf0[2];
994 bf1[7] = -bf0[7] + bf0[3];
995 bf1[8] = bf0[8] + bf0[12];
996 bf1[9] = bf0[9] + bf0[13];
997 bf1[10] = bf0[10] + bf0[14];
998 bf1[11] = bf0[11] + bf0[15];
999 bf1[12] = -bf0[12] + bf0[8];
1000 bf1[13] = -bf0[13] + bf0[9];
1001 bf1[14] = -bf0[14] + bf0[10];
1002 bf1[15] = -bf0[15] + bf0[11];
1003 range_check(stage, input, bf1, size, stage_range[stage]);
1004
1005 // stage 6
1006 stage++;
1007 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1008 bf0 = output;
1009 bf1 = step;
1010 bf1[0] = bf0[0];
1011 bf1[1] = bf0[1];
1012 bf1[2] = bf0[2];
1013 bf1[3] = bf0[3];
1014 bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
1015 bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
1016 bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
1017 bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
1018 bf1[8] = bf0[8];
1019 bf1[9] = bf0[9];
1020 bf1[10] = bf0[10];
1021 bf1[11] = bf0[11];
1022 bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
1023 bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]);
1024 bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
1025 bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]);
1026 range_check(stage, input, bf1, size, stage_range[stage]);
1027
1028 // stage 7
1029 stage++;
1030 bf0 = step;
1031 bf1 = output;
1032 bf1[0] = bf0[0] + bf0[2];
1033 bf1[1] = bf0[1] + bf0[3];
1034 bf1[2] = -bf0[2] + bf0[0];
1035 bf1[3] = -bf0[3] + bf0[1];
1036 bf1[4] = bf0[4] + bf0[6];
1037 bf1[5] = bf0[5] + bf0[7];
1038 bf1[6] = -bf0[6] + bf0[4];
1039 bf1[7] = -bf0[7] + bf0[5];
1040 bf1[8] = bf0[8] + bf0[10];
1041 bf1[9] = bf0[9] + bf0[11];
1042 bf1[10] = -bf0[10] + bf0[8];
1043 bf1[11] = -bf0[11] + bf0[9];
1044 bf1[12] = bf0[12] + bf0[14];
1045 bf1[13] = bf0[13] + bf0[15];
1046 bf1[14] = -bf0[14] + bf0[12];
1047 bf1[15] = -bf0[15] + bf0[13];
1048 range_check(stage, input, bf1, size, stage_range[stage]);
1049
1050 // stage 8
1051 stage++;
1052 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1053 bf0 = output;
1054 bf1 = step;
1055 bf1[0] = bf0[0];
1056 bf1[1] = bf0[1];
1057 bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
1058 bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
1059 bf1[4] = bf0[4];
1060 bf1[5] = bf0[5];
1061 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
1062 bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
1063 bf1[8] = bf0[8];
1064 bf1[9] = bf0[9];
1065 bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
1066 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]);
1067 bf1[12] = bf0[12];
1068 bf1[13] = bf0[13];
1069 bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
1070 bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]);
1071 range_check(stage, input, bf1, size, stage_range[stage]);
1072
1073 // stage 9
1074 stage++;
1075 bf0 = step;
1076 bf1 = output;
1077 bf1[0] = bf0[0];
1078 bf1[1] = -bf0[8];
1079 bf1[2] = bf0[12];
1080 bf1[3] = -bf0[4];
1081 bf1[4] = bf0[6];
1082 bf1[5] = -bf0[14];
1083 bf1[6] = bf0[10];
1084 bf1[7] = -bf0[2];
1085 bf1[8] = bf0[3];
1086 bf1[9] = -bf0[11];
1087 bf1[10] = bf0[15];
1088 bf1[11] = -bf0[7];
1089 bf1[12] = bf0[5];
1090 bf1[13] = -bf0[13];
1091 bf1[14] = bf0[9];
1092 bf1[15] = -bf0[1];
1093 range_check(stage, input, bf1, size, stage_range[stage]);
1094}
1095
Yaowu Xuf883b422016-08-30 14:01:10 -07001096void av1_fadst32_new(const int32_t *input, int32_t *output,
1097 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -07001098 const int32_t size = 32;
1099 const int32_t *cospi;
1100
1101 int32_t stage = 0;
1102 int32_t *bf0, *bf1;
1103 int32_t step[32];
1104
1105 // stage 0;
1106 range_check(stage, input, input, size, stage_range[stage]);
1107
1108 // stage 1;
1109 stage++;
1110 bf1 = output;
1111 bf1[0] = input[31];
1112 bf1[1] = input[0];
1113 bf1[2] = input[29];
1114 bf1[3] = input[2];
1115 bf1[4] = input[27];
1116 bf1[5] = input[4];
1117 bf1[6] = input[25];
1118 bf1[7] = input[6];
1119 bf1[8] = input[23];
1120 bf1[9] = input[8];
1121 bf1[10] = input[21];
1122 bf1[11] = input[10];
1123 bf1[12] = input[19];
1124 bf1[13] = input[12];
1125 bf1[14] = input[17];
1126 bf1[15] = input[14];
1127 bf1[16] = input[15];
1128 bf1[17] = input[16];
1129 bf1[18] = input[13];
1130 bf1[19] = input[18];
1131 bf1[20] = input[11];
1132 bf1[21] = input[20];
1133 bf1[22] = input[9];
1134 bf1[23] = input[22];
1135 bf1[24] = input[7];
1136 bf1[25] = input[24];
1137 bf1[26] = input[5];
1138 bf1[27] = input[26];
1139 bf1[28] = input[3];
1140 bf1[29] = input[28];
1141 bf1[30] = input[1];
1142 bf1[31] = input[30];
1143 range_check(stage, input, bf1, size, stage_range[stage]);
1144
1145 // stage 2
1146 stage++;
1147 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1148 bf0 = output;
1149 bf1 = step;
1150 bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit[stage]);
1151 bf1[1] = half_btf(-cospi[1], bf0[1], cospi[63], bf0[0], cos_bit[stage]);
1152 bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit[stage]);
1153 bf1[3] = half_btf(-cospi[5], bf0[3], cospi[59], bf0[2], cos_bit[stage]);
1154 bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit[stage]);
1155 bf1[5] = half_btf(-cospi[9], bf0[5], cospi[55], bf0[4], cos_bit[stage]);
1156 bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit[stage]);
1157 bf1[7] = half_btf(-cospi[13], bf0[7], cospi[51], bf0[6], cos_bit[stage]);
1158 bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit[stage]);
1159 bf1[9] = half_btf(-cospi[17], bf0[9], cospi[47], bf0[8], cos_bit[stage]);
1160 bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit[stage]);
1161 bf1[11] = half_btf(-cospi[21], bf0[11], cospi[43], bf0[10], cos_bit[stage]);
1162 bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit[stage]);
1163 bf1[13] = half_btf(-cospi[25], bf0[13], cospi[39], bf0[12], cos_bit[stage]);
1164 bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit[stage]);
1165 bf1[15] = half_btf(-cospi[29], bf0[15], cospi[35], bf0[14], cos_bit[stage]);
1166 bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit[stage]);
1167 bf1[17] = half_btf(-cospi[33], bf0[17], cospi[31], bf0[16], cos_bit[stage]);
1168 bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit[stage]);
1169 bf1[19] = half_btf(-cospi[37], bf0[19], cospi[27], bf0[18], cos_bit[stage]);
1170 bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit[stage]);
1171 bf1[21] = half_btf(-cospi[41], bf0[21], cospi[23], bf0[20], cos_bit[stage]);
1172 bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit[stage]);
1173 bf1[23] = half_btf(-cospi[45], bf0[23], cospi[19], bf0[22], cos_bit[stage]);
1174 bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit[stage]);
1175 bf1[25] = half_btf(-cospi[49], bf0[25], cospi[15], bf0[24], cos_bit[stage]);
1176 bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit[stage]);
1177 bf1[27] = half_btf(-cospi[53], bf0[27], cospi[11], bf0[26], cos_bit[stage]);
1178 bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit[stage]);
1179 bf1[29] = half_btf(-cospi[57], bf0[29], cospi[7], bf0[28], cos_bit[stage]);
1180 bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit[stage]);
1181 bf1[31] = half_btf(-cospi[61], bf0[31], cospi[3], bf0[30], cos_bit[stage]);
1182 range_check(stage, input, bf1, size, stage_range[stage]);
1183
1184 // stage 3
1185 stage++;
1186 bf0 = step;
1187 bf1 = output;
1188 bf1[0] = bf0[0] + bf0[16];
1189 bf1[1] = bf0[1] + bf0[17];
1190 bf1[2] = bf0[2] + bf0[18];
1191 bf1[3] = bf0[3] + bf0[19];
1192 bf1[4] = bf0[4] + bf0[20];
1193 bf1[5] = bf0[5] + bf0[21];
1194 bf1[6] = bf0[6] + bf0[22];
1195 bf1[7] = bf0[7] + bf0[23];
1196 bf1[8] = bf0[8] + bf0[24];
1197 bf1[9] = bf0[9] + bf0[25];
1198 bf1[10] = bf0[10] + bf0[26];
1199 bf1[11] = bf0[11] + bf0[27];
1200 bf1[12] = bf0[12] + bf0[28];
1201 bf1[13] = bf0[13] + bf0[29];
1202 bf1[14] = bf0[14] + bf0[30];
1203 bf1[15] = bf0[15] + bf0[31];
1204 bf1[16] = -bf0[16] + bf0[0];
1205 bf1[17] = -bf0[17] + bf0[1];
1206 bf1[18] = -bf0[18] + bf0[2];
1207 bf1[19] = -bf0[19] + bf0[3];
1208 bf1[20] = -bf0[20] + bf0[4];
1209 bf1[21] = -bf0[21] + bf0[5];
1210 bf1[22] = -bf0[22] + bf0[6];
1211 bf1[23] = -bf0[23] + bf0[7];
1212 bf1[24] = -bf0[24] + bf0[8];
1213 bf1[25] = -bf0[25] + bf0[9];
1214 bf1[26] = -bf0[26] + bf0[10];
1215 bf1[27] = -bf0[27] + bf0[11];
1216 bf1[28] = -bf0[28] + bf0[12];
1217 bf1[29] = -bf0[29] + bf0[13];
1218 bf1[30] = -bf0[30] + bf0[14];
1219 bf1[31] = -bf0[31] + bf0[15];
1220 range_check(stage, input, bf1, size, stage_range[stage]);
1221
1222 // stage 4
1223 stage++;
1224 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1225 bf0 = output;
1226 bf1 = step;
1227 bf1[0] = bf0[0];
1228 bf1[1] = bf0[1];
1229 bf1[2] = bf0[2];
1230 bf1[3] = bf0[3];
1231 bf1[4] = bf0[4];
1232 bf1[5] = bf0[5];
1233 bf1[6] = bf0[6];
1234 bf1[7] = bf0[7];
1235 bf1[8] = bf0[8];
1236 bf1[9] = bf0[9];
1237 bf1[10] = bf0[10];
1238 bf1[11] = bf0[11];
1239 bf1[12] = bf0[12];
1240 bf1[13] = bf0[13];
1241 bf1[14] = bf0[14];
1242 bf1[15] = bf0[15];
1243 bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit[stage]);
1244 bf1[17] = half_btf(-cospi[4], bf0[17], cospi[60], bf0[16], cos_bit[stage]);
1245 bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit[stage]);
1246 bf1[19] = half_btf(-cospi[20], bf0[19], cospi[44], bf0[18], cos_bit[stage]);
1247 bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit[stage]);
1248 bf1[21] = half_btf(-cospi[36], bf0[21], cospi[28], bf0[20], cos_bit[stage]);
1249 bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit[stage]);
1250 bf1[23] = half_btf(-cospi[52], bf0[23], cospi[12], bf0[22], cos_bit[stage]);
1251 bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit[stage]);
1252 bf1[25] = half_btf(cospi[60], bf0[25], cospi[4], bf0[24], cos_bit[stage]);
1253 bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit[stage]);
1254 bf1[27] = half_btf(cospi[44], bf0[27], cospi[20], bf0[26], cos_bit[stage]);
1255 bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit[stage]);
1256 bf1[29] = half_btf(cospi[28], bf0[29], cospi[36], bf0[28], cos_bit[stage]);
1257 bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit[stage]);
1258 bf1[31] = half_btf(cospi[12], bf0[31], cospi[52], bf0[30], cos_bit[stage]);
1259 range_check(stage, input, bf1, size, stage_range[stage]);
1260
1261 // stage 5
1262 stage++;
1263 bf0 = step;
1264 bf1 = output;
1265 bf1[0] = bf0[0] + bf0[8];
1266 bf1[1] = bf0[1] + bf0[9];
1267 bf1[2] = bf0[2] + bf0[10];
1268 bf1[3] = bf0[3] + bf0[11];
1269 bf1[4] = bf0[4] + bf0[12];
1270 bf1[5] = bf0[5] + bf0[13];
1271 bf1[6] = bf0[6] + bf0[14];
1272 bf1[7] = bf0[7] + bf0[15];
1273 bf1[8] = -bf0[8] + bf0[0];
1274 bf1[9] = -bf0[9] + bf0[1];
1275 bf1[10] = -bf0[10] + bf0[2];
1276 bf1[11] = -bf0[11] + bf0[3];
1277 bf1[12] = -bf0[12] + bf0[4];
1278 bf1[13] = -bf0[13] + bf0[5];
1279 bf1[14] = -bf0[14] + bf0[6];
1280 bf1[15] = -bf0[15] + bf0[7];
1281 bf1[16] = bf0[16] + bf0[24];
1282 bf1[17] = bf0[17] + bf0[25];
1283 bf1[18] = bf0[18] + bf0[26];
1284 bf1[19] = bf0[19] + bf0[27];
1285 bf1[20] = bf0[20] + bf0[28];
1286 bf1[21] = bf0[21] + bf0[29];
1287 bf1[22] = bf0[22] + bf0[30];
1288 bf1[23] = bf0[23] + bf0[31];
1289 bf1[24] = -bf0[24] + bf0[16];
1290 bf1[25] = -bf0[25] + bf0[17];
1291 bf1[26] = -bf0[26] + bf0[18];
1292 bf1[27] = -bf0[27] + bf0[19];
1293 bf1[28] = -bf0[28] + bf0[20];
1294 bf1[29] = -bf0[29] + bf0[21];
1295 bf1[30] = -bf0[30] + bf0[22];
1296 bf1[31] = -bf0[31] + bf0[23];
1297 range_check(stage, input, bf1, size, stage_range[stage]);
1298
1299 // stage 6
1300 stage++;
1301 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1302 bf0 = output;
1303 bf1 = step;
1304 bf1[0] = bf0[0];
1305 bf1[1] = bf0[1];
1306 bf1[2] = bf0[2];
1307 bf1[3] = bf0[3];
1308 bf1[4] = bf0[4];
1309 bf1[5] = bf0[5];
1310 bf1[6] = bf0[6];
1311 bf1[7] = bf0[7];
1312 bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
1313 bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]);
1314 bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
1315 bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]);
1316 bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
1317 bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]);
1318 bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
1319 bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]);
1320 bf1[16] = bf0[16];
1321 bf1[17] = bf0[17];
1322 bf1[18] = bf0[18];
1323 bf1[19] = bf0[19];
1324 bf1[20] = bf0[20];
1325 bf1[21] = bf0[21];
1326 bf1[22] = bf0[22];
1327 bf1[23] = bf0[23];
1328 bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit[stage]);
1329 bf1[25] = half_btf(-cospi[8], bf0[25], cospi[56], bf0[24], cos_bit[stage]);
1330 bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit[stage]);
1331 bf1[27] = half_btf(-cospi[40], bf0[27], cospi[24], bf0[26], cos_bit[stage]);
1332 bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit[stage]);
1333 bf1[29] = half_btf(cospi[56], bf0[29], cospi[8], bf0[28], cos_bit[stage]);
1334 bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit[stage]);
1335 bf1[31] = half_btf(cospi[24], bf0[31], cospi[40], bf0[30], cos_bit[stage]);
1336 range_check(stage, input, bf1, size, stage_range[stage]);
1337
1338 // stage 7
1339 stage++;
1340 bf0 = step;
1341 bf1 = output;
1342 bf1[0] = bf0[0] + bf0[4];
1343 bf1[1] = bf0[1] + bf0[5];
1344 bf1[2] = bf0[2] + bf0[6];
1345 bf1[3] = bf0[3] + bf0[7];
1346 bf1[4] = -bf0[4] + bf0[0];
1347 bf1[5] = -bf0[5] + bf0[1];
1348 bf1[6] = -bf0[6] + bf0[2];
1349 bf1[7] = -bf0[7] + bf0[3];
1350 bf1[8] = bf0[8] + bf0[12];
1351 bf1[9] = bf0[9] + bf0[13];
1352 bf1[10] = bf0[10] + bf0[14];
1353 bf1[11] = bf0[11] + bf0[15];
1354 bf1[12] = -bf0[12] + bf0[8];
1355 bf1[13] = -bf0[13] + bf0[9];
1356 bf1[14] = -bf0[14] + bf0[10];
1357 bf1[15] = -bf0[15] + bf0[11];
1358 bf1[16] = bf0[16] + bf0[20];
1359 bf1[17] = bf0[17] + bf0[21];
1360 bf1[18] = bf0[18] + bf0[22];
1361 bf1[19] = bf0[19] + bf0[23];
1362 bf1[20] = -bf0[20] + bf0[16];
1363 bf1[21] = -bf0[21] + bf0[17];
1364 bf1[22] = -bf0[22] + bf0[18];
1365 bf1[23] = -bf0[23] + bf0[19];
1366 bf1[24] = bf0[24] + bf0[28];
1367 bf1[25] = bf0[25] + bf0[29];
1368 bf1[26] = bf0[26] + bf0[30];
1369 bf1[27] = bf0[27] + bf0[31];
1370 bf1[28] = -bf0[28] + bf0[24];
1371 bf1[29] = -bf0[29] + bf0[25];
1372 bf1[30] = -bf0[30] + bf0[26];
1373 bf1[31] = -bf0[31] + bf0[27];
1374 range_check(stage, input, bf1, size, stage_range[stage]);
1375
1376 // stage 8
1377 stage++;
1378 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1379 bf0 = output;
1380 bf1 = step;
1381 bf1[0] = bf0[0];
1382 bf1[1] = bf0[1];
1383 bf1[2] = bf0[2];
1384 bf1[3] = bf0[3];
1385 bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
1386 bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
1387 bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
1388 bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
1389 bf1[8] = bf0[8];
1390 bf1[9] = bf0[9];
1391 bf1[10] = bf0[10];
1392 bf1[11] = bf0[11];
1393 bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
1394 bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]);
1395 bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
1396 bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]);
1397 bf1[16] = bf0[16];
1398 bf1[17] = bf0[17];
1399 bf1[18] = bf0[18];
1400 bf1[19] = bf0[19];
1401 bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit[stage]);
1402 bf1[21] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[20], cos_bit[stage]);
1403 bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit[stage]);
1404 bf1[23] = half_btf(cospi[48], bf0[23], cospi[16], bf0[22], cos_bit[stage]);
1405 bf1[24] = bf0[24];
1406 bf1[25] = bf0[25];
1407 bf1[26] = bf0[26];
1408 bf1[27] = bf0[27];
1409 bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit[stage]);
1410 bf1[29] = half_btf(-cospi[16], bf0[29], cospi[48], bf0[28], cos_bit[stage]);
1411 bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit[stage]);
1412 bf1[31] = half_btf(cospi[48], bf0[31], cospi[16], bf0[30], cos_bit[stage]);
1413 range_check(stage, input, bf1, size, stage_range[stage]);
1414
1415 // stage 9
1416 stage++;
1417 bf0 = step;
1418 bf1 = output;
1419 bf1[0] = bf0[0] + bf0[2];
1420 bf1[1] = bf0[1] + bf0[3];
1421 bf1[2] = -bf0[2] + bf0[0];
1422 bf1[3] = -bf0[3] + bf0[1];
1423 bf1[4] = bf0[4] + bf0[6];
1424 bf1[5] = bf0[5] + bf0[7];
1425 bf1[6] = -bf0[6] + bf0[4];
1426 bf1[7] = -bf0[7] + bf0[5];
1427 bf1[8] = bf0[8] + bf0[10];
1428 bf1[9] = bf0[9] + bf0[11];
1429 bf1[10] = -bf0[10] + bf0[8];
1430 bf1[11] = -bf0[11] + bf0[9];
1431 bf1[12] = bf0[12] + bf0[14];
1432 bf1[13] = bf0[13] + bf0[15];
1433 bf1[14] = -bf0[14] + bf0[12];
1434 bf1[15] = -bf0[15] + bf0[13];
1435 bf1[16] = bf0[16] + bf0[18];
1436 bf1[17] = bf0[17] + bf0[19];
1437 bf1[18] = -bf0[18] + bf0[16];
1438 bf1[19] = -bf0[19] + bf0[17];
1439 bf1[20] = bf0[20] + bf0[22];
1440 bf1[21] = bf0[21] + bf0[23];
1441 bf1[22] = -bf0[22] + bf0[20];
1442 bf1[23] = -bf0[23] + bf0[21];
1443 bf1[24] = bf0[24] + bf0[26];
1444 bf1[25] = bf0[25] + bf0[27];
1445 bf1[26] = -bf0[26] + bf0[24];
1446 bf1[27] = -bf0[27] + bf0[25];
1447 bf1[28] = bf0[28] + bf0[30];
1448 bf1[29] = bf0[29] + bf0[31];
1449 bf1[30] = -bf0[30] + bf0[28];
1450 bf1[31] = -bf0[31] + bf0[29];
1451 range_check(stage, input, bf1, size, stage_range[stage]);
1452
1453 // stage 10
1454 stage++;
1455 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1456 bf0 = output;
1457 bf1 = step;
1458 bf1[0] = bf0[0];
1459 bf1[1] = bf0[1];
1460 bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
1461 bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
1462 bf1[4] = bf0[4];
1463 bf1[5] = bf0[5];
1464 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
1465 bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
1466 bf1[8] = bf0[8];
1467 bf1[9] = bf0[9];
1468 bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
1469 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]);
1470 bf1[12] = bf0[12];
1471 bf1[13] = bf0[13];
1472 bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
1473 bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]);
1474 bf1[16] = bf0[16];
1475 bf1[17] = bf0[17];
1476 bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit[stage]);
1477 bf1[19] = half_btf(-cospi[32], bf0[19], cospi[32], bf0[18], cos_bit[stage]);
1478 bf1[20] = bf0[20];
1479 bf1[21] = bf0[21];
1480 bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit[stage]);
1481 bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[22], cos_bit[stage]);
1482 bf1[24] = bf0[24];
1483 bf1[25] = bf0[25];
1484 bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit[stage]);
1485 bf1[27] = half_btf(-cospi[32], bf0[27], cospi[32], bf0[26], cos_bit[stage]);
1486 bf1[28] = bf0[28];
1487 bf1[29] = bf0[29];
1488 bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit[stage]);
1489 bf1[31] = half_btf(-cospi[32], bf0[31], cospi[32], bf0[30], cos_bit[stage]);
1490 range_check(stage, input, bf1, size, stage_range[stage]);
1491
1492 // stage 11
1493 stage++;
1494 bf0 = step;
1495 bf1 = output;
1496 bf1[0] = bf0[0];
1497 bf1[1] = -bf0[16];
1498 bf1[2] = bf0[24];
1499 bf1[3] = -bf0[8];
1500 bf1[4] = bf0[12];
1501 bf1[5] = -bf0[28];
1502 bf1[6] = bf0[20];
1503 bf1[7] = -bf0[4];
1504 bf1[8] = bf0[6];
1505 bf1[9] = -bf0[22];
1506 bf1[10] = bf0[30];
1507 bf1[11] = -bf0[14];
1508 bf1[12] = bf0[10];
1509 bf1[13] = -bf0[26];
1510 bf1[14] = bf0[18];
1511 bf1[15] = -bf0[2];
1512 bf1[16] = bf0[3];
1513 bf1[17] = -bf0[19];
1514 bf1[18] = bf0[27];
1515 bf1[19] = -bf0[11];
1516 bf1[20] = bf0[15];
1517 bf1[21] = -bf0[31];
1518 bf1[22] = bf0[23];
1519 bf1[23] = -bf0[7];
1520 bf1[24] = bf0[5];
1521 bf1[25] = -bf0[21];
1522 bf1[26] = bf0[29];
1523 bf1[27] = -bf0[13];
1524 bf1[28] = bf0[9];
1525 bf1[29] = -bf0[25];
1526 bf1[30] = bf0[17];
1527 bf1[31] = -bf0[1];
1528 range_check(stage, input, bf1, size, stage_range[stage]);
1529}
Angie Chiang792519b2016-10-18 12:24:20 -07001530
1531#if CONFIG_TX64X64
1532void av1_fdct64_new(const int32_t *input, int32_t *output,
1533 const int8_t *cos_bit, const int8_t *stage_range) {
1534 const int32_t size = 64;
1535 const int32_t *cospi;
1536
1537 int32_t stage = 0;
1538 int32_t *bf0, *bf1;
1539 int32_t step[64];
1540
1541 // stage 0;
1542 range_check(stage, input, input, size, stage_range[stage]);
1543
1544 // stage 1;
1545 stage++;
1546 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1547 bf1 = output;
1548 bf1[0] = input[0] + input[63];
1549 bf1[1] = input[1] + input[62];
1550 bf1[2] = input[2] + input[61];
1551 bf1[3] = input[3] + input[60];
1552 bf1[4] = input[4] + input[59];
1553 bf1[5] = input[5] + input[58];
1554 bf1[6] = input[6] + input[57];
1555 bf1[7] = input[7] + input[56];
1556 bf1[8] = input[8] + input[55];
1557 bf1[9] = input[9] + input[54];
1558 bf1[10] = input[10] + input[53];
1559 bf1[11] = input[11] + input[52];
1560 bf1[12] = input[12] + input[51];
1561 bf1[13] = input[13] + input[50];
1562 bf1[14] = input[14] + input[49];
1563 bf1[15] = input[15] + input[48];
1564 bf1[16] = input[16] + input[47];
1565 bf1[17] = input[17] + input[46];
1566 bf1[18] = input[18] + input[45];
1567 bf1[19] = input[19] + input[44];
1568 bf1[20] = input[20] + input[43];
1569 bf1[21] = input[21] + input[42];
1570 bf1[22] = input[22] + input[41];
1571 bf1[23] = input[23] + input[40];
1572 bf1[24] = input[24] + input[39];
1573 bf1[25] = input[25] + input[38];
1574 bf1[26] = input[26] + input[37];
1575 bf1[27] = input[27] + input[36];
1576 bf1[28] = input[28] + input[35];
1577 bf1[29] = input[29] + input[34];
1578 bf1[30] = input[30] + input[33];
1579 bf1[31] = input[31] + input[32];
1580 bf1[32] = -input[32] + input[31];
1581 bf1[33] = -input[33] + input[30];
1582 bf1[34] = -input[34] + input[29];
1583 bf1[35] = -input[35] + input[28];
1584 bf1[36] = -input[36] + input[27];
1585 bf1[37] = -input[37] + input[26];
1586 bf1[38] = -input[38] + input[25];
1587 bf1[39] = -input[39] + input[24];
1588 bf1[40] = -input[40] + input[23];
1589 bf1[41] = -input[41] + input[22];
1590 bf1[42] = -input[42] + input[21];
1591 bf1[43] = -input[43] + input[20];
1592 bf1[44] = -input[44] + input[19];
1593 bf1[45] = -input[45] + input[18];
1594 bf1[46] = -input[46] + input[17];
1595 bf1[47] = -input[47] + input[16];
1596 bf1[48] = -input[48] + input[15];
1597 bf1[49] = -input[49] + input[14];
1598 bf1[50] = -input[50] + input[13];
1599 bf1[51] = -input[51] + input[12];
1600 bf1[52] = -input[52] + input[11];
1601 bf1[53] = -input[53] + input[10];
1602 bf1[54] = -input[54] + input[9];
1603 bf1[55] = -input[55] + input[8];
1604 bf1[56] = -input[56] + input[7];
1605 bf1[57] = -input[57] + input[6];
1606 bf1[58] = -input[58] + input[5];
1607 bf1[59] = -input[59] + input[4];
1608 bf1[60] = -input[60] + input[3];
1609 bf1[61] = -input[61] + input[2];
1610 bf1[62] = -input[62] + input[1];
1611 bf1[63] = -input[63] + input[0];
1612 range_check(stage, input, bf1, size, stage_range[stage]);
1613
1614 // stage 2
1615 stage++;
1616 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1617 bf0 = output;
1618 bf1 = step;
1619 bf1[0] = bf0[0] + bf0[31];
1620 bf1[1] = bf0[1] + bf0[30];
1621 bf1[2] = bf0[2] + bf0[29];
1622 bf1[3] = bf0[3] + bf0[28];
1623 bf1[4] = bf0[4] + bf0[27];
1624 bf1[5] = bf0[5] + bf0[26];
1625 bf1[6] = bf0[6] + bf0[25];
1626 bf1[7] = bf0[7] + bf0[24];
1627 bf1[8] = bf0[8] + bf0[23];
1628 bf1[9] = bf0[9] + bf0[22];
1629 bf1[10] = bf0[10] + bf0[21];
1630 bf1[11] = bf0[11] + bf0[20];
1631 bf1[12] = bf0[12] + bf0[19];
1632 bf1[13] = bf0[13] + bf0[18];
1633 bf1[14] = bf0[14] + bf0[17];
1634 bf1[15] = bf0[15] + bf0[16];
1635 bf1[16] = -bf0[16] + bf0[15];
1636 bf1[17] = -bf0[17] + bf0[14];
1637 bf1[18] = -bf0[18] + bf0[13];
1638 bf1[19] = -bf0[19] + bf0[12];
1639 bf1[20] = -bf0[20] + bf0[11];
1640 bf1[21] = -bf0[21] + bf0[10];
1641 bf1[22] = -bf0[22] + bf0[9];
1642 bf1[23] = -bf0[23] + bf0[8];
1643 bf1[24] = -bf0[24] + bf0[7];
1644 bf1[25] = -bf0[25] + bf0[6];
1645 bf1[26] = -bf0[26] + bf0[5];
1646 bf1[27] = -bf0[27] + bf0[4];
1647 bf1[28] = -bf0[28] + bf0[3];
1648 bf1[29] = -bf0[29] + bf0[2];
1649 bf1[30] = -bf0[30] + bf0[1];
1650 bf1[31] = -bf0[31] + bf0[0];
1651 bf1[32] = bf0[32];
1652 bf1[33] = bf0[33];
1653 bf1[34] = bf0[34];
1654 bf1[35] = bf0[35];
1655 bf1[36] = bf0[36];
1656 bf1[37] = bf0[37];
1657 bf1[38] = bf0[38];
1658 bf1[39] = bf0[39];
1659 bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
1660 bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
1661 bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
1662 bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
1663 bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
1664 bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
1665 bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
1666 bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
1667 bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit[stage]);
1668 bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit[stage]);
1669 bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit[stage]);
1670 bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit[stage]);
1671 bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit[stage]);
1672 bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit[stage]);
1673 bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit[stage]);
1674 bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit[stage]);
1675 bf1[56] = bf0[56];
1676 bf1[57] = bf0[57];
1677 bf1[58] = bf0[58];
1678 bf1[59] = bf0[59];
1679 bf1[60] = bf0[60];
1680 bf1[61] = bf0[61];
1681 bf1[62] = bf0[62];
1682 bf1[63] = bf0[63];
1683 range_check(stage, input, bf1, size, stage_range[stage]);
1684
1685 // stage 3
1686 stage++;
1687 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1688 bf0 = step;
1689 bf1 = output;
1690 bf1[0] = bf0[0] + bf0[15];
1691 bf1[1] = bf0[1] + bf0[14];
1692 bf1[2] = bf0[2] + bf0[13];
1693 bf1[3] = bf0[3] + bf0[12];
1694 bf1[4] = bf0[4] + bf0[11];
1695 bf1[5] = bf0[5] + bf0[10];
1696 bf1[6] = bf0[6] + bf0[9];
1697 bf1[7] = bf0[7] + bf0[8];
1698 bf1[8] = -bf0[8] + bf0[7];
1699 bf1[9] = -bf0[9] + bf0[6];
1700 bf1[10] = -bf0[10] + bf0[5];
1701 bf1[11] = -bf0[11] + bf0[4];
1702 bf1[12] = -bf0[12] + bf0[3];
1703 bf1[13] = -bf0[13] + bf0[2];
1704 bf1[14] = -bf0[14] + bf0[1];
1705 bf1[15] = -bf0[15] + bf0[0];
1706 bf1[16] = bf0[16];
1707 bf1[17] = bf0[17];
1708 bf1[18] = bf0[18];
1709 bf1[19] = bf0[19];
1710 bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
1711 bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
1712 bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
1713 bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
1714 bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]);
1715 bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]);
1716 bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]);
1717 bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]);
1718 bf1[28] = bf0[28];
1719 bf1[29] = bf0[29];
1720 bf1[30] = bf0[30];
1721 bf1[31] = bf0[31];
1722 bf1[32] = bf0[32] + bf0[47];
1723 bf1[33] = bf0[33] + bf0[46];
1724 bf1[34] = bf0[34] + bf0[45];
1725 bf1[35] = bf0[35] + bf0[44];
1726 bf1[36] = bf0[36] + bf0[43];
1727 bf1[37] = bf0[37] + bf0[42];
1728 bf1[38] = bf0[38] + bf0[41];
1729 bf1[39] = bf0[39] + bf0[40];
1730 bf1[40] = -bf0[40] + bf0[39];
1731 bf1[41] = -bf0[41] + bf0[38];
1732 bf1[42] = -bf0[42] + bf0[37];
1733 bf1[43] = -bf0[43] + bf0[36];
1734 bf1[44] = -bf0[44] + bf0[35];
1735 bf1[45] = -bf0[45] + bf0[34];
1736 bf1[46] = -bf0[46] + bf0[33];
1737 bf1[47] = -bf0[47] + bf0[32];
1738 bf1[48] = -bf0[48] + bf0[63];
1739 bf1[49] = -bf0[49] + bf0[62];
1740 bf1[50] = -bf0[50] + bf0[61];
1741 bf1[51] = -bf0[51] + bf0[60];
1742 bf1[52] = -bf0[52] + bf0[59];
1743 bf1[53] = -bf0[53] + bf0[58];
1744 bf1[54] = -bf0[54] + bf0[57];
1745 bf1[55] = -bf0[55] + bf0[56];
1746 bf1[56] = bf0[56] + bf0[55];
1747 bf1[57] = bf0[57] + bf0[54];
1748 bf1[58] = bf0[58] + bf0[53];
1749 bf1[59] = bf0[59] + bf0[52];
1750 bf1[60] = bf0[60] + bf0[51];
1751 bf1[61] = bf0[61] + bf0[50];
1752 bf1[62] = bf0[62] + bf0[49];
1753 bf1[63] = bf0[63] + bf0[48];
1754 range_check(stage, input, bf1, size, stage_range[stage]);
1755
1756 // stage 4
1757 stage++;
1758 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1759 bf0 = output;
1760 bf1 = step;
1761 bf1[0] = bf0[0] + bf0[7];
1762 bf1[1] = bf0[1] + bf0[6];
1763 bf1[2] = bf0[2] + bf0[5];
1764 bf1[3] = bf0[3] + bf0[4];
1765 bf1[4] = -bf0[4] + bf0[3];
1766 bf1[5] = -bf0[5] + bf0[2];
1767 bf1[6] = -bf0[6] + bf0[1];
1768 bf1[7] = -bf0[7] + bf0[0];
1769 bf1[8] = bf0[8];
1770 bf1[9] = bf0[9];
1771 bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
1772 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
1773 bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
1774 bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
1775 bf1[14] = bf0[14];
1776 bf1[15] = bf0[15];
1777 bf1[16] = bf0[16] + bf0[23];
1778 bf1[17] = bf0[17] + bf0[22];
1779 bf1[18] = bf0[18] + bf0[21];
1780 bf1[19] = bf0[19] + bf0[20];
1781 bf1[20] = -bf0[20] + bf0[19];
1782 bf1[21] = -bf0[21] + bf0[18];
1783 bf1[22] = -bf0[22] + bf0[17];
1784 bf1[23] = -bf0[23] + bf0[16];
1785 bf1[24] = -bf0[24] + bf0[31];
1786 bf1[25] = -bf0[25] + bf0[30];
1787 bf1[26] = -bf0[26] + bf0[29];
1788 bf1[27] = -bf0[27] + bf0[28];
1789 bf1[28] = bf0[28] + bf0[27];
1790 bf1[29] = bf0[29] + bf0[26];
1791 bf1[30] = bf0[30] + bf0[25];
1792 bf1[31] = bf0[31] + bf0[24];
1793 bf1[32] = bf0[32];
1794 bf1[33] = bf0[33];
1795 bf1[34] = bf0[34];
1796 bf1[35] = bf0[35];
1797 bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]);
1798 bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]);
1799 bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]);
1800 bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]);
1801 bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]);
1802 bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]);
1803 bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]);
1804 bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]);
1805 bf1[44] = bf0[44];
1806 bf1[45] = bf0[45];
1807 bf1[46] = bf0[46];
1808 bf1[47] = bf0[47];
1809 bf1[48] = bf0[48];
1810 bf1[49] = bf0[49];
1811 bf1[50] = bf0[50];
1812 bf1[51] = bf0[51];
1813 bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit[stage]);
1814 bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit[stage]);
1815 bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit[stage]);
1816 bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit[stage]);
1817 bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit[stage]);
1818 bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit[stage]);
1819 bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit[stage]);
1820 bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit[stage]);
1821 bf1[60] = bf0[60];
1822 bf1[61] = bf0[61];
1823 bf1[62] = bf0[62];
1824 bf1[63] = bf0[63];
1825 range_check(stage, input, bf1, size, stage_range[stage]);
1826
1827 // stage 5
1828 stage++;
1829 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1830 bf0 = step;
1831 bf1 = output;
1832 bf1[0] = bf0[0] + bf0[3];
1833 bf1[1] = bf0[1] + bf0[2];
1834 bf1[2] = -bf0[2] + bf0[1];
1835 bf1[3] = -bf0[3] + bf0[0];
1836 bf1[4] = bf0[4];
1837 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
1838 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
1839 bf1[7] = bf0[7];
1840 bf1[8] = bf0[8] + bf0[11];
1841 bf1[9] = bf0[9] + bf0[10];
1842 bf1[10] = -bf0[10] + bf0[9];
1843 bf1[11] = -bf0[11] + bf0[8];
1844 bf1[12] = -bf0[12] + bf0[15];
1845 bf1[13] = -bf0[13] + bf0[14];
1846 bf1[14] = bf0[14] + bf0[13];
1847 bf1[15] = bf0[15] + bf0[12];
1848 bf1[16] = bf0[16];
1849 bf1[17] = bf0[17];
1850 bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
1851 bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
1852 bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
1853 bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
1854 bf1[22] = bf0[22];
1855 bf1[23] = bf0[23];
1856 bf1[24] = bf0[24];
1857 bf1[25] = bf0[25];
1858 bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]);
1859 bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]);
1860 bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]);
1861 bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]);
1862 bf1[30] = bf0[30];
1863 bf1[31] = bf0[31];
1864 bf1[32] = bf0[32] + bf0[39];
1865 bf1[33] = bf0[33] + bf0[38];
1866 bf1[34] = bf0[34] + bf0[37];
1867 bf1[35] = bf0[35] + bf0[36];
1868 bf1[36] = -bf0[36] + bf0[35];
1869 bf1[37] = -bf0[37] + bf0[34];
1870 bf1[38] = -bf0[38] + bf0[33];
1871 bf1[39] = -bf0[39] + bf0[32];
1872 bf1[40] = -bf0[40] + bf0[47];
1873 bf1[41] = -bf0[41] + bf0[46];
1874 bf1[42] = -bf0[42] + bf0[45];
1875 bf1[43] = -bf0[43] + bf0[44];
1876 bf1[44] = bf0[44] + bf0[43];
1877 bf1[45] = bf0[45] + bf0[42];
1878 bf1[46] = bf0[46] + bf0[41];
1879 bf1[47] = bf0[47] + bf0[40];
1880 bf1[48] = bf0[48] + bf0[55];
1881 bf1[49] = bf0[49] + bf0[54];
1882 bf1[50] = bf0[50] + bf0[53];
1883 bf1[51] = bf0[51] + bf0[52];
1884 bf1[52] = -bf0[52] + bf0[51];
1885 bf1[53] = -bf0[53] + bf0[50];
1886 bf1[54] = -bf0[54] + bf0[49];
1887 bf1[55] = -bf0[55] + bf0[48];
1888 bf1[56] = -bf0[56] + bf0[63];
1889 bf1[57] = -bf0[57] + bf0[62];
1890 bf1[58] = -bf0[58] + bf0[61];
1891 bf1[59] = -bf0[59] + bf0[60];
1892 bf1[60] = bf0[60] + bf0[59];
1893 bf1[61] = bf0[61] + bf0[58];
1894 bf1[62] = bf0[62] + bf0[57];
1895 bf1[63] = bf0[63] + bf0[56];
1896 range_check(stage, input, bf1, size, stage_range[stage]);
1897
1898 // stage 6
1899 stage++;
1900 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1901 bf0 = output;
1902 bf1 = step;
1903 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
1904 bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
1905 bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
1906 bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
1907 bf1[4] = bf0[4] + bf0[5];
1908 bf1[5] = -bf0[5] + bf0[4];
1909 bf1[6] = -bf0[6] + bf0[7];
1910 bf1[7] = bf0[7] + bf0[6];
1911 bf1[8] = bf0[8];
1912 bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
1913 bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
1914 bf1[11] = bf0[11];
1915 bf1[12] = bf0[12];
1916 bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
1917 bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
1918 bf1[15] = bf0[15];
1919 bf1[16] = bf0[16] + bf0[19];
1920 bf1[17] = bf0[17] + bf0[18];
1921 bf1[18] = -bf0[18] + bf0[17];
1922 bf1[19] = -bf0[19] + bf0[16];
1923 bf1[20] = -bf0[20] + bf0[23];
1924 bf1[21] = -bf0[21] + bf0[22];
1925 bf1[22] = bf0[22] + bf0[21];
1926 bf1[23] = bf0[23] + bf0[20];
1927 bf1[24] = bf0[24] + bf0[27];
1928 bf1[25] = bf0[25] + bf0[26];
1929 bf1[26] = -bf0[26] + bf0[25];
1930 bf1[27] = -bf0[27] + bf0[24];
1931 bf1[28] = -bf0[28] + bf0[31];
1932 bf1[29] = -bf0[29] + bf0[30];
1933 bf1[30] = bf0[30] + bf0[29];
1934 bf1[31] = bf0[31] + bf0[28];
1935 bf1[32] = bf0[32];
1936 bf1[33] = bf0[33];
1937 bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]);
1938 bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]);
1939 bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]);
1940 bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]);
1941 bf1[38] = bf0[38];
1942 bf1[39] = bf0[39];
1943 bf1[40] = bf0[40];
1944 bf1[41] = bf0[41];
1945 bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]);
1946 bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]);
1947 bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]);
1948 bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]);
1949 bf1[46] = bf0[46];
1950 bf1[47] = bf0[47];
1951 bf1[48] = bf0[48];
1952 bf1[49] = bf0[49];
1953 bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit[stage]);
1954 bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit[stage]);
1955 bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit[stage]);
1956 bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit[stage]);
1957 bf1[54] = bf0[54];
1958 bf1[55] = bf0[55];
1959 bf1[56] = bf0[56];
1960 bf1[57] = bf0[57];
1961 bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit[stage]);
1962 bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit[stage]);
1963 bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit[stage]);
1964 bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit[stage]);
1965 bf1[62] = bf0[62];
1966 bf1[63] = bf0[63];
1967 range_check(stage, input, bf1, size, stage_range[stage]);
1968
1969 // stage 7
1970 stage++;
1971 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1972 bf0 = step;
1973 bf1 = output;
1974 bf1[0] = bf0[0];
1975 bf1[1] = bf0[1];
1976 bf1[2] = bf0[2];
1977 bf1[3] = bf0[3];
1978 bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
1979 bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
1980 bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
1981 bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
1982 bf1[8] = bf0[8] + bf0[9];
1983 bf1[9] = -bf0[9] + bf0[8];
1984 bf1[10] = -bf0[10] + bf0[11];
1985 bf1[11] = bf0[11] + bf0[10];
1986 bf1[12] = bf0[12] + bf0[13];
1987 bf1[13] = -bf0[13] + bf0[12];
1988 bf1[14] = -bf0[14] + bf0[15];
1989 bf1[15] = bf0[15] + bf0[14];
1990 bf1[16] = bf0[16];
1991 bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
1992 bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
1993 bf1[19] = bf0[19];
1994 bf1[20] = bf0[20];
1995 bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
1996 bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
1997 bf1[23] = bf0[23];
1998 bf1[24] = bf0[24];
1999 bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]);
2000 bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]);
2001 bf1[27] = bf0[27];
2002 bf1[28] = bf0[28];
2003 bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]);
2004 bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]);
2005 bf1[31] = bf0[31];
2006 bf1[32] = bf0[32] + bf0[35];
2007 bf1[33] = bf0[33] + bf0[34];
2008 bf1[34] = -bf0[34] + bf0[33];
2009 bf1[35] = -bf0[35] + bf0[32];
2010 bf1[36] = -bf0[36] + bf0[39];
2011 bf1[37] = -bf0[37] + bf0[38];
2012 bf1[38] = bf0[38] + bf0[37];
2013 bf1[39] = bf0[39] + bf0[36];
2014 bf1[40] = bf0[40] + bf0[43];
2015 bf1[41] = bf0[41] + bf0[42];
2016 bf1[42] = -bf0[42] + bf0[41];
2017 bf1[43] = -bf0[43] + bf0[40];
2018 bf1[44] = -bf0[44] + bf0[47];
2019 bf1[45] = -bf0[45] + bf0[46];
2020 bf1[46] = bf0[46] + bf0[45];
2021 bf1[47] = bf0[47] + bf0[44];
2022 bf1[48] = bf0[48] + bf0[51];
2023 bf1[49] = bf0[49] + bf0[50];
2024 bf1[50] = -bf0[50] + bf0[49];
2025 bf1[51] = -bf0[51] + bf0[48];
2026 bf1[52] = -bf0[52] + bf0[55];
2027 bf1[53] = -bf0[53] + bf0[54];
2028 bf1[54] = bf0[54] + bf0[53];
2029 bf1[55] = bf0[55] + bf0[52];
2030 bf1[56] = bf0[56] + bf0[59];
2031 bf1[57] = bf0[57] + bf0[58];
2032 bf1[58] = -bf0[58] + bf0[57];
2033 bf1[59] = -bf0[59] + bf0[56];
2034 bf1[60] = -bf0[60] + bf0[63];
2035 bf1[61] = -bf0[61] + bf0[62];
2036 bf1[62] = bf0[62] + bf0[61];
2037 bf1[63] = bf0[63] + bf0[60];
2038 range_check(stage, input, bf1, size, stage_range[stage]);
2039
2040 // stage 8
2041 stage++;
2042 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
2043 bf0 = output;
2044 bf1 = step;
2045 bf1[0] = bf0[0];
2046 bf1[1] = bf0[1];
2047 bf1[2] = bf0[2];
2048 bf1[3] = bf0[3];
2049 bf1[4] = bf0[4];
2050 bf1[5] = bf0[5];
2051 bf1[6] = bf0[6];
2052 bf1[7] = bf0[7];
2053 bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
2054 bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
2055 bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
2056 bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
2057 bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
2058 bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
2059 bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
2060 bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
2061 bf1[16] = bf0[16] + bf0[17];
2062 bf1[17] = -bf0[17] + bf0[16];
2063 bf1[18] = -bf0[18] + bf0[19];
2064 bf1[19] = bf0[19] + bf0[18];
2065 bf1[20] = bf0[20] + bf0[21];
2066 bf1[21] = -bf0[21] + bf0[20];
2067 bf1[22] = -bf0[22] + bf0[23];
2068 bf1[23] = bf0[23] + bf0[22];
2069 bf1[24] = bf0[24] + bf0[25];
2070 bf1[25] = -bf0[25] + bf0[24];
2071 bf1[26] = -bf0[26] + bf0[27];
2072 bf1[27] = bf0[27] + bf0[26];
2073 bf1[28] = bf0[28] + bf0[29];
2074 bf1[29] = -bf0[29] + bf0[28];
2075 bf1[30] = -bf0[30] + bf0[31];
2076 bf1[31] = bf0[31] + bf0[30];
2077 bf1[32] = bf0[32];
2078 bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]);
2079 bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]);
2080 bf1[35] = bf0[35];
2081 bf1[36] = bf0[36];
2082 bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]);
2083 bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]);
2084 bf1[39] = bf0[39];
2085 bf1[40] = bf0[40];
2086 bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]);
2087 bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]);
2088 bf1[43] = bf0[43];
2089 bf1[44] = bf0[44];
2090 bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]);
2091 bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]);
2092 bf1[47] = bf0[47];
2093 bf1[48] = bf0[48];
2094 bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit[stage]);
2095 bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit[stage]);
2096 bf1[51] = bf0[51];
2097 bf1[52] = bf0[52];
2098 bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit[stage]);
2099 bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit[stage]);
2100 bf1[55] = bf0[55];
2101 bf1[56] = bf0[56];
2102 bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit[stage]);
2103 bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit[stage]);
2104 bf1[59] = bf0[59];
2105 bf1[60] = bf0[60];
2106 bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit[stage]);
2107 bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit[stage]);
2108 bf1[63] = bf0[63];
2109 range_check(stage, input, bf1, size, stage_range[stage]);
2110
2111 // stage 9
2112 stage++;
2113 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
2114 bf0 = step;
2115 bf1 = output;
2116 bf1[0] = bf0[0];
2117 bf1[1] = bf0[1];
2118 bf1[2] = bf0[2];
2119 bf1[3] = bf0[3];
2120 bf1[4] = bf0[4];
2121 bf1[5] = bf0[5];
2122 bf1[6] = bf0[6];
2123 bf1[7] = bf0[7];
2124 bf1[8] = bf0[8];
2125 bf1[9] = bf0[9];
2126 bf1[10] = bf0[10];
2127 bf1[11] = bf0[11];
2128 bf1[12] = bf0[12];
2129 bf1[13] = bf0[13];
2130 bf1[14] = bf0[14];
2131 bf1[15] = bf0[15];
2132 bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]);
2133 bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]);
2134 bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]);
2135 bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]);
2136 bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]);
2137 bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]);
2138 bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]);
2139 bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]);
2140 bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]);
2141 bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]);
2142 bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]);
2143 bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]);
2144 bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]);
2145 bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]);
2146 bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]);
2147 bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]);
2148 bf1[32] = bf0[32] + bf0[33];
2149 bf1[33] = -bf0[33] + bf0[32];
2150 bf1[34] = -bf0[34] + bf0[35];
2151 bf1[35] = bf0[35] + bf0[34];
2152 bf1[36] = bf0[36] + bf0[37];
2153 bf1[37] = -bf0[37] + bf0[36];
2154 bf1[38] = -bf0[38] + bf0[39];
2155 bf1[39] = bf0[39] + bf0[38];
2156 bf1[40] = bf0[40] + bf0[41];
2157 bf1[41] = -bf0[41] + bf0[40];
2158 bf1[42] = -bf0[42] + bf0[43];
2159 bf1[43] = bf0[43] + bf0[42];
2160 bf1[44] = bf0[44] + bf0[45];
2161 bf1[45] = -bf0[45] + bf0[44];
2162 bf1[46] = -bf0[46] + bf0[47];
2163 bf1[47] = bf0[47] + bf0[46];
2164 bf1[48] = bf0[48] + bf0[49];
2165 bf1[49] = -bf0[49] + bf0[48];
2166 bf1[50] = -bf0[50] + bf0[51];
2167 bf1[51] = bf0[51] + bf0[50];
2168 bf1[52] = bf0[52] + bf0[53];
2169 bf1[53] = -bf0[53] + bf0[52];
2170 bf1[54] = -bf0[54] + bf0[55];
2171 bf1[55] = bf0[55] + bf0[54];
2172 bf1[56] = bf0[56] + bf0[57];
2173 bf1[57] = -bf0[57] + bf0[56];
2174 bf1[58] = -bf0[58] + bf0[59];
2175 bf1[59] = bf0[59] + bf0[58];
2176 bf1[60] = bf0[60] + bf0[61];
2177 bf1[61] = -bf0[61] + bf0[60];
2178 bf1[62] = -bf0[62] + bf0[63];
2179 bf1[63] = bf0[63] + bf0[62];
2180 range_check(stage, input, bf1, size, stage_range[stage]);
2181
2182 // stage 10
2183 stage++;
2184 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
2185 bf0 = output;
2186 bf1 = step;
2187 bf1[0] = bf0[0];
2188 bf1[1] = bf0[1];
2189 bf1[2] = bf0[2];
2190 bf1[3] = bf0[3];
2191 bf1[4] = bf0[4];
2192 bf1[5] = bf0[5];
2193 bf1[6] = bf0[6];
2194 bf1[7] = bf0[7];
2195 bf1[8] = bf0[8];
2196 bf1[9] = bf0[9];
2197 bf1[10] = bf0[10];
2198 bf1[11] = bf0[11];
2199 bf1[12] = bf0[12];
2200 bf1[13] = bf0[13];
2201 bf1[14] = bf0[14];
2202 bf1[15] = bf0[15];
2203 bf1[16] = bf0[16];
2204 bf1[17] = bf0[17];
2205 bf1[18] = bf0[18];
2206 bf1[19] = bf0[19];
2207 bf1[20] = bf0[20];
2208 bf1[21] = bf0[21];
2209 bf1[22] = bf0[22];
2210 bf1[23] = bf0[23];
2211 bf1[24] = bf0[24];
2212 bf1[25] = bf0[25];
2213 bf1[26] = bf0[26];
2214 bf1[27] = bf0[27];
2215 bf1[28] = bf0[28];
2216 bf1[29] = bf0[29];
2217 bf1[30] = bf0[30];
2218 bf1[31] = bf0[31];
2219 bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit[stage]);
2220 bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit[stage]);
2221 bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit[stage]);
2222 bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit[stage]);
2223 bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit[stage]);
2224 bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit[stage]);
2225 bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit[stage]);
2226 bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit[stage]);
2227 bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit[stage]);
2228 bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit[stage]);
2229 bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit[stage]);
2230 bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit[stage]);
2231 bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit[stage]);
2232 bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit[stage]);
2233 bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit[stage]);
2234 bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit[stage]);
2235 bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit[stage]);
2236 bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit[stage]);
2237 bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit[stage]);
2238 bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit[stage]);
2239 bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit[stage]);
2240 bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit[stage]);
2241 bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit[stage]);
2242 bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit[stage]);
2243 bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit[stage]);
2244 bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit[stage]);
2245 bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit[stage]);
2246 bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit[stage]);
2247 bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit[stage]);
2248 bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit[stage]);
2249 bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit[stage]);
2250 bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit[stage]);
2251 range_check(stage, input, bf1, size, stage_range[stage]);
2252
2253 // stage 11
2254 stage++;
2255 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
2256 bf0 = step;
2257 bf1 = output;
2258 bf1[0] = bf0[0];
2259 bf1[1] = bf0[32];
2260 bf1[2] = bf0[16];
2261 bf1[3] = bf0[48];
2262 bf1[4] = bf0[8];
2263 bf1[5] = bf0[40];
2264 bf1[6] = bf0[24];
2265 bf1[7] = bf0[56];
2266 bf1[8] = bf0[4];
2267 bf1[9] = bf0[36];
2268 bf1[10] = bf0[20];
2269 bf1[11] = bf0[52];
2270 bf1[12] = bf0[12];
2271 bf1[13] = bf0[44];
2272 bf1[14] = bf0[28];
2273 bf1[15] = bf0[60];
2274 bf1[16] = bf0[2];
2275 bf1[17] = bf0[34];
2276 bf1[18] = bf0[18];
2277 bf1[19] = bf0[50];
2278 bf1[20] = bf0[10];
2279 bf1[21] = bf0[42];
2280 bf1[22] = bf0[26];
2281 bf1[23] = bf0[58];
2282 bf1[24] = bf0[6];
2283 bf1[25] = bf0[38];
2284 bf1[26] = bf0[22];
2285 bf1[27] = bf0[54];
2286 bf1[28] = bf0[14];
2287 bf1[29] = bf0[46];
2288 bf1[30] = bf0[30];
2289 bf1[31] = bf0[62];
2290 bf1[32] = bf0[1];
2291 bf1[33] = bf0[33];
2292 bf1[34] = bf0[17];
2293 bf1[35] = bf0[49];
2294 bf1[36] = bf0[9];
2295 bf1[37] = bf0[41];
2296 bf1[38] = bf0[25];
2297 bf1[39] = bf0[57];
2298 bf1[40] = bf0[5];
2299 bf1[41] = bf0[37];
2300 bf1[42] = bf0[21];
2301 bf1[43] = bf0[53];
2302 bf1[44] = bf0[13];
2303 bf1[45] = bf0[45];
2304 bf1[46] = bf0[29];
2305 bf1[47] = bf0[61];
2306 bf1[48] = bf0[3];
2307 bf1[49] = bf0[35];
2308 bf1[50] = bf0[19];
2309 bf1[51] = bf0[51];
2310 bf1[52] = bf0[11];
2311 bf1[53] = bf0[43];
2312 bf1[54] = bf0[27];
2313 bf1[55] = bf0[59];
2314 bf1[56] = bf0[7];
2315 bf1[57] = bf0[39];
2316 bf1[58] = bf0[23];
2317 bf1[59] = bf0[55];
2318 bf1[60] = bf0[15];
2319 bf1[61] = bf0[47];
2320 bf1[62] = bf0[31];
2321 bf1[63] = bf0[63];
2322 range_check(stage, input, bf1, size, stage_range[stage]);
2323}
2324#endif // CONFIG_TX64X64