blob: 4c695ae7d17815b002fe3ef7631e5e484694573f [file] [log] [blame]
/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS.  All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include "av1/common/av1_fwd_txfm1d.h"
/*
 * range_check(stage, input, buf, size, bit)
 *
 * Debug aid used after every transform stage.
 *
 * When CONFIG_COEFFICIENT_RANGE_CHECKING is non-zero, each of the `size`
 * entries of `buf` is checked: if get_max_bit(abs(buf[i])) + 1 exceeds `bit`,
 * the stage number, the offending node, and the full `input` vector are
 * printed and assert(0) fires.
 *
 * Otherwise the macro only evaluates its arguments as (void) so that
 * variables used solely for checking do not trigger unused warnings.
 *
 * Both variants are wrapped in do { ... } while (0) so the macro behaves as a
 * single statement (safe in unbraced if/else).  Arguments are evaluated more
 * than once in the checking variant, so they must be free of side effects.
 * The checking variant declares locals named `i` and `j`; do not pass
 * expressions that refer to caller variables with those names.
 */
#if CONFIG_COEFFICIENT_RANGE_CHECKING
#define range_check(stage, input, buf, size, bit)                         \
  do {                                                                    \
    int i, j;                                                             \
    for (i = 0; i < (size); ++i) {                                        \
      int buf_bit = get_max_bit(abs((buf)[i])) + 1;                       \
      if (buf_bit > (bit)) {                                              \
        printf("======== %s %d overflow ========\n", __FILE__, __LINE__); \
        printf("stage: %d node: %d\n", (stage), i);                       \
        printf("bit: %d buf_bit: %d buf[i]: %d\n", (bit), buf_bit,        \
               (buf)[i]);                                                 \
        printf("input:\n");                                               \
        for (j = 0; j < (size); j++) {                                    \
          printf("%d,", (input)[j]);                                      \
        }                                                                 \
        printf("\n");                                                     \
        assert(0);                                                        \
      }                                                                   \
    }                                                                     \
  } while (0)
#else
#define range_check(stage, input, buf, size, bit) \
  do {                                            \
    (void)(stage);                                \
    (void)(input);                                \
    (void)(buf);                                  \
    (void)(size);                                 \
    (void)(bit);                                  \
  } while (0)
#endif
42
Yaowu Xuf883b422016-08-30 14:01:10 -070043void av1_fdct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
44 const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -070045 const int32_t size = 4;
46 const int32_t *cospi;
47
48 int32_t stage = 0;
49 int32_t *bf0, *bf1;
50 int32_t step[4];
51
52 // stage 0;
53 range_check(stage, input, input, size, stage_range[stage]);
54
55 // stage 1;
56 stage++;
57 bf1 = output;
58 bf1[0] = input[0] + input[3];
59 bf1[1] = input[1] + input[2];
60 bf1[2] = -input[2] + input[1];
61 bf1[3] = -input[3] + input[0];
62 range_check(stage, input, bf1, size, stage_range[stage]);
63
64 // stage 2
65 stage++;
66 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
67 bf0 = output;
68 bf1 = step;
69 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
70 bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
71 bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
72 bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
73 range_check(stage, input, bf1, size, stage_range[stage]);
74
75 // stage 3
76 stage++;
77 bf0 = step;
78 bf1 = output;
79 bf1[0] = bf0[0];
80 bf1[1] = bf0[2];
81 bf1[2] = bf0[1];
82 bf1[3] = bf0[3];
83 range_check(stage, input, bf1, size, stage_range[stage]);
84}
85
Yaowu Xuf883b422016-08-30 14:01:10 -070086void av1_fdct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
87 const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -070088 const int32_t size = 8;
89 const int32_t *cospi;
90
91 int32_t stage = 0;
92 int32_t *bf0, *bf1;
93 int32_t step[8];
94
95 // stage 0;
96 range_check(stage, input, input, size, stage_range[stage]);
97
98 // stage 1;
99 stage++;
100 bf1 = output;
101 bf1[0] = input[0] + input[7];
102 bf1[1] = input[1] + input[6];
103 bf1[2] = input[2] + input[5];
104 bf1[3] = input[3] + input[4];
105 bf1[4] = -input[4] + input[3];
106 bf1[5] = -input[5] + input[2];
107 bf1[6] = -input[6] + input[1];
108 bf1[7] = -input[7] + input[0];
109 range_check(stage, input, bf1, size, stage_range[stage]);
110
111 // stage 2
112 stage++;
113 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
114 bf0 = output;
115 bf1 = step;
116 bf1[0] = bf0[0] + bf0[3];
117 bf1[1] = bf0[1] + bf0[2];
118 bf1[2] = -bf0[2] + bf0[1];
119 bf1[3] = -bf0[3] + bf0[0];
120 bf1[4] = bf0[4];
121 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
122 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
123 bf1[7] = bf0[7];
124 range_check(stage, input, bf1, size, stage_range[stage]);
125
126 // stage 3
127 stage++;
128 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
129 bf0 = step;
130 bf1 = output;
131 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
132 bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
133 bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
134 bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
135 bf1[4] = bf0[4] + bf0[5];
136 bf1[5] = -bf0[5] + bf0[4];
137 bf1[6] = -bf0[6] + bf0[7];
138 bf1[7] = bf0[7] + bf0[6];
139 range_check(stage, input, bf1, size, stage_range[stage]);
140
141 // stage 4
142 stage++;
143 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
144 bf0 = output;
145 bf1 = step;
146 bf1[0] = bf0[0];
147 bf1[1] = bf0[1];
148 bf1[2] = bf0[2];
149 bf1[3] = bf0[3];
150 bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
151 bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
152 bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
153 bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
154 range_check(stage, input, bf1, size, stage_range[stage]);
155
156 // stage 5
157 stage++;
158 bf0 = step;
159 bf1 = output;
160 bf1[0] = bf0[0];
161 bf1[1] = bf0[4];
162 bf1[2] = bf0[2];
163 bf1[3] = bf0[6];
164 bf1[4] = bf0[1];
165 bf1[5] = bf0[5];
166 bf1[6] = bf0[3];
167 bf1[7] = bf0[7];
168 range_check(stage, input, bf1, size, stage_range[stage]);
169}
170
Yaowu Xuf883b422016-08-30 14:01:10 -0700171void av1_fdct16_new(const int32_t *input, int32_t *output,
172 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700173 const int32_t size = 16;
174 const int32_t *cospi;
175
176 int32_t stage = 0;
177 int32_t *bf0, *bf1;
178 int32_t step[16];
179
180 // stage 0;
181 range_check(stage, input, input, size, stage_range[stage]);
182
183 // stage 1;
184 stage++;
185 bf1 = output;
186 bf1[0] = input[0] + input[15];
187 bf1[1] = input[1] + input[14];
188 bf1[2] = input[2] + input[13];
189 bf1[3] = input[3] + input[12];
190 bf1[4] = input[4] + input[11];
191 bf1[5] = input[5] + input[10];
192 bf1[6] = input[6] + input[9];
193 bf1[7] = input[7] + input[8];
194 bf1[8] = -input[8] + input[7];
195 bf1[9] = -input[9] + input[6];
196 bf1[10] = -input[10] + input[5];
197 bf1[11] = -input[11] + input[4];
198 bf1[12] = -input[12] + input[3];
199 bf1[13] = -input[13] + input[2];
200 bf1[14] = -input[14] + input[1];
201 bf1[15] = -input[15] + input[0];
202 range_check(stage, input, bf1, size, stage_range[stage]);
203
204 // stage 2
205 stage++;
206 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
207 bf0 = output;
208 bf1 = step;
209 bf1[0] = bf0[0] + bf0[7];
210 bf1[1] = bf0[1] + bf0[6];
211 bf1[2] = bf0[2] + bf0[5];
212 bf1[3] = bf0[3] + bf0[4];
213 bf1[4] = -bf0[4] + bf0[3];
214 bf1[5] = -bf0[5] + bf0[2];
215 bf1[6] = -bf0[6] + bf0[1];
216 bf1[7] = -bf0[7] + bf0[0];
217 bf1[8] = bf0[8];
218 bf1[9] = bf0[9];
219 bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
220 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
221 bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
222 bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
223 bf1[14] = bf0[14];
224 bf1[15] = bf0[15];
225 range_check(stage, input, bf1, size, stage_range[stage]);
226
227 // stage 3
228 stage++;
229 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
230 bf0 = step;
231 bf1 = output;
232 bf1[0] = bf0[0] + bf0[3];
233 bf1[1] = bf0[1] + bf0[2];
234 bf1[2] = -bf0[2] + bf0[1];
235 bf1[3] = -bf0[3] + bf0[0];
236 bf1[4] = bf0[4];
237 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
238 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
239 bf1[7] = bf0[7];
240 bf1[8] = bf0[8] + bf0[11];
241 bf1[9] = bf0[9] + bf0[10];
242 bf1[10] = -bf0[10] + bf0[9];
243 bf1[11] = -bf0[11] + bf0[8];
244 bf1[12] = -bf0[12] + bf0[15];
245 bf1[13] = -bf0[13] + bf0[14];
246 bf1[14] = bf0[14] + bf0[13];
247 bf1[15] = bf0[15] + bf0[12];
248 range_check(stage, input, bf1, size, stage_range[stage]);
249
250 // stage 4
251 stage++;
252 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
253 bf0 = output;
254 bf1 = step;
255 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
256 bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
257 bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
258 bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
259 bf1[4] = bf0[4] + bf0[5];
260 bf1[5] = -bf0[5] + bf0[4];
261 bf1[6] = -bf0[6] + bf0[7];
262 bf1[7] = bf0[7] + bf0[6];
263 bf1[8] = bf0[8];
264 bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
265 bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
266 bf1[11] = bf0[11];
267 bf1[12] = bf0[12];
268 bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
269 bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
270 bf1[15] = bf0[15];
271 range_check(stage, input, bf1, size, stage_range[stage]);
272
273 // stage 5
274 stage++;
275 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
276 bf0 = step;
277 bf1 = output;
278 bf1[0] = bf0[0];
279 bf1[1] = bf0[1];
280 bf1[2] = bf0[2];
281 bf1[3] = bf0[3];
282 bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
283 bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
284 bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
285 bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
286 bf1[8] = bf0[8] + bf0[9];
287 bf1[9] = -bf0[9] + bf0[8];
288 bf1[10] = -bf0[10] + bf0[11];
289 bf1[11] = bf0[11] + bf0[10];
290 bf1[12] = bf0[12] + bf0[13];
291 bf1[13] = -bf0[13] + bf0[12];
292 bf1[14] = -bf0[14] + bf0[15];
293 bf1[15] = bf0[15] + bf0[14];
294 range_check(stage, input, bf1, size, stage_range[stage]);
295
296 // stage 6
297 stage++;
298 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
299 bf0 = output;
300 bf1 = step;
301 bf1[0] = bf0[0];
302 bf1[1] = bf0[1];
303 bf1[2] = bf0[2];
304 bf1[3] = bf0[3];
305 bf1[4] = bf0[4];
306 bf1[5] = bf0[5];
307 bf1[6] = bf0[6];
308 bf1[7] = bf0[7];
309 bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
310 bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
311 bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
312 bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
313 bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
314 bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
315 bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
316 bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
317 range_check(stage, input, bf1, size, stage_range[stage]);
318
319 // stage 7
320 stage++;
321 bf0 = step;
322 bf1 = output;
323 bf1[0] = bf0[0];
324 bf1[1] = bf0[8];
325 bf1[2] = bf0[4];
326 bf1[3] = bf0[12];
327 bf1[4] = bf0[2];
328 bf1[5] = bf0[10];
329 bf1[6] = bf0[6];
330 bf1[7] = bf0[14];
331 bf1[8] = bf0[1];
332 bf1[9] = bf0[9];
333 bf1[10] = bf0[5];
334 bf1[11] = bf0[13];
335 bf1[12] = bf0[3];
336 bf1[13] = bf0[11];
337 bf1[14] = bf0[7];
338 bf1[15] = bf0[15];
339 range_check(stage, input, bf1, size, stage_range[stage]);
340}
341
Yaowu Xuf883b422016-08-30 14:01:10 -0700342void av1_fdct32_new(const int32_t *input, int32_t *output,
343 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700344 const int32_t size = 32;
345 const int32_t *cospi;
346
347 int32_t stage = 0;
348 int32_t *bf0, *bf1;
349 int32_t step[32];
350
351 // stage 0;
352 range_check(stage, input, input, size, stage_range[stage]);
353
354 // stage 1;
355 stage++;
356 bf1 = output;
357 bf1[0] = input[0] + input[31];
358 bf1[1] = input[1] + input[30];
359 bf1[2] = input[2] + input[29];
360 bf1[3] = input[3] + input[28];
361 bf1[4] = input[4] + input[27];
362 bf1[5] = input[5] + input[26];
363 bf1[6] = input[6] + input[25];
364 bf1[7] = input[7] + input[24];
365 bf1[8] = input[8] + input[23];
366 bf1[9] = input[9] + input[22];
367 bf1[10] = input[10] + input[21];
368 bf1[11] = input[11] + input[20];
369 bf1[12] = input[12] + input[19];
370 bf1[13] = input[13] + input[18];
371 bf1[14] = input[14] + input[17];
372 bf1[15] = input[15] + input[16];
373 bf1[16] = -input[16] + input[15];
374 bf1[17] = -input[17] + input[14];
375 bf1[18] = -input[18] + input[13];
376 bf1[19] = -input[19] + input[12];
377 bf1[20] = -input[20] + input[11];
378 bf1[21] = -input[21] + input[10];
379 bf1[22] = -input[22] + input[9];
380 bf1[23] = -input[23] + input[8];
381 bf1[24] = -input[24] + input[7];
382 bf1[25] = -input[25] + input[6];
383 bf1[26] = -input[26] + input[5];
384 bf1[27] = -input[27] + input[4];
385 bf1[28] = -input[28] + input[3];
386 bf1[29] = -input[29] + input[2];
387 bf1[30] = -input[30] + input[1];
388 bf1[31] = -input[31] + input[0];
389 range_check(stage, input, bf1, size, stage_range[stage]);
390
391 // stage 2
392 stage++;
393 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
394 bf0 = output;
395 bf1 = step;
396 bf1[0] = bf0[0] + bf0[15];
397 bf1[1] = bf0[1] + bf0[14];
398 bf1[2] = bf0[2] + bf0[13];
399 bf1[3] = bf0[3] + bf0[12];
400 bf1[4] = bf0[4] + bf0[11];
401 bf1[5] = bf0[5] + bf0[10];
402 bf1[6] = bf0[6] + bf0[9];
403 bf1[7] = bf0[7] + bf0[8];
404 bf1[8] = -bf0[8] + bf0[7];
405 bf1[9] = -bf0[9] + bf0[6];
406 bf1[10] = -bf0[10] + bf0[5];
407 bf1[11] = -bf0[11] + bf0[4];
408 bf1[12] = -bf0[12] + bf0[3];
409 bf1[13] = -bf0[13] + bf0[2];
410 bf1[14] = -bf0[14] + bf0[1];
411 bf1[15] = -bf0[15] + bf0[0];
412 bf1[16] = bf0[16];
413 bf1[17] = bf0[17];
414 bf1[18] = bf0[18];
415 bf1[19] = bf0[19];
416 bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
417 bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
418 bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
419 bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
420 bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]);
421 bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]);
422 bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]);
423 bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]);
424 bf1[28] = bf0[28];
425 bf1[29] = bf0[29];
426 bf1[30] = bf0[30];
427 bf1[31] = bf0[31];
428 range_check(stage, input, bf1, size, stage_range[stage]);
429
430 // stage 3
431 stage++;
432 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
433 bf0 = step;
434 bf1 = output;
435 bf1[0] = bf0[0] + bf0[7];
436 bf1[1] = bf0[1] + bf0[6];
437 bf1[2] = bf0[2] + bf0[5];
438 bf1[3] = bf0[3] + bf0[4];
439 bf1[4] = -bf0[4] + bf0[3];
440 bf1[5] = -bf0[5] + bf0[2];
441 bf1[6] = -bf0[6] + bf0[1];
442 bf1[7] = -bf0[7] + bf0[0];
443 bf1[8] = bf0[8];
444 bf1[9] = bf0[9];
445 bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
446 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
447 bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
448 bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
449 bf1[14] = bf0[14];
450 bf1[15] = bf0[15];
451 bf1[16] = bf0[16] + bf0[23];
452 bf1[17] = bf0[17] + bf0[22];
453 bf1[18] = bf0[18] + bf0[21];
454 bf1[19] = bf0[19] + bf0[20];
455 bf1[20] = -bf0[20] + bf0[19];
456 bf1[21] = -bf0[21] + bf0[18];
457 bf1[22] = -bf0[22] + bf0[17];
458 bf1[23] = -bf0[23] + bf0[16];
459 bf1[24] = -bf0[24] + bf0[31];
460 bf1[25] = -bf0[25] + bf0[30];
461 bf1[26] = -bf0[26] + bf0[29];
462 bf1[27] = -bf0[27] + bf0[28];
463 bf1[28] = bf0[28] + bf0[27];
464 bf1[29] = bf0[29] + bf0[26];
465 bf1[30] = bf0[30] + bf0[25];
466 bf1[31] = bf0[31] + bf0[24];
467 range_check(stage, input, bf1, size, stage_range[stage]);
468
469 // stage 4
470 stage++;
471 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
472 bf0 = output;
473 bf1 = step;
474 bf1[0] = bf0[0] + bf0[3];
475 bf1[1] = bf0[1] + bf0[2];
476 bf1[2] = -bf0[2] + bf0[1];
477 bf1[3] = -bf0[3] + bf0[0];
478 bf1[4] = bf0[4];
479 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
480 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
481 bf1[7] = bf0[7];
482 bf1[8] = bf0[8] + bf0[11];
483 bf1[9] = bf0[9] + bf0[10];
484 bf1[10] = -bf0[10] + bf0[9];
485 bf1[11] = -bf0[11] + bf0[8];
486 bf1[12] = -bf0[12] + bf0[15];
487 bf1[13] = -bf0[13] + bf0[14];
488 bf1[14] = bf0[14] + bf0[13];
489 bf1[15] = bf0[15] + bf0[12];
490 bf1[16] = bf0[16];
491 bf1[17] = bf0[17];
492 bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
493 bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
494 bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
495 bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
496 bf1[22] = bf0[22];
497 bf1[23] = bf0[23];
498 bf1[24] = bf0[24];
499 bf1[25] = bf0[25];
500 bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]);
501 bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]);
502 bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]);
503 bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]);
504 bf1[30] = bf0[30];
505 bf1[31] = bf0[31];
506 range_check(stage, input, bf1, size, stage_range[stage]);
507
508 // stage 5
509 stage++;
510 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
511 bf0 = step;
512 bf1 = output;
513 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
514 bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
515 bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
516 bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
517 bf1[4] = bf0[4] + bf0[5];
518 bf1[5] = -bf0[5] + bf0[4];
519 bf1[6] = -bf0[6] + bf0[7];
520 bf1[7] = bf0[7] + bf0[6];
521 bf1[8] = bf0[8];
522 bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
523 bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
524 bf1[11] = bf0[11];
525 bf1[12] = bf0[12];
526 bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
527 bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
528 bf1[15] = bf0[15];
529 bf1[16] = bf0[16] + bf0[19];
530 bf1[17] = bf0[17] + bf0[18];
531 bf1[18] = -bf0[18] + bf0[17];
532 bf1[19] = -bf0[19] + bf0[16];
533 bf1[20] = -bf0[20] + bf0[23];
534 bf1[21] = -bf0[21] + bf0[22];
535 bf1[22] = bf0[22] + bf0[21];
536 bf1[23] = bf0[23] + bf0[20];
537 bf1[24] = bf0[24] + bf0[27];
538 bf1[25] = bf0[25] + bf0[26];
539 bf1[26] = -bf0[26] + bf0[25];
540 bf1[27] = -bf0[27] + bf0[24];
541 bf1[28] = -bf0[28] + bf0[31];
542 bf1[29] = -bf0[29] + bf0[30];
543 bf1[30] = bf0[30] + bf0[29];
544 bf1[31] = bf0[31] + bf0[28];
545 range_check(stage, input, bf1, size, stage_range[stage]);
546
547 // stage 6
548 stage++;
549 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
550 bf0 = output;
551 bf1 = step;
552 bf1[0] = bf0[0];
553 bf1[1] = bf0[1];
554 bf1[2] = bf0[2];
555 bf1[3] = bf0[3];
556 bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
557 bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
558 bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
559 bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
560 bf1[8] = bf0[8] + bf0[9];
561 bf1[9] = -bf0[9] + bf0[8];
562 bf1[10] = -bf0[10] + bf0[11];
563 bf1[11] = bf0[11] + bf0[10];
564 bf1[12] = bf0[12] + bf0[13];
565 bf1[13] = -bf0[13] + bf0[12];
566 bf1[14] = -bf0[14] + bf0[15];
567 bf1[15] = bf0[15] + bf0[14];
568 bf1[16] = bf0[16];
569 bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
570 bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
571 bf1[19] = bf0[19];
572 bf1[20] = bf0[20];
573 bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
574 bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
575 bf1[23] = bf0[23];
576 bf1[24] = bf0[24];
577 bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]);
578 bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]);
579 bf1[27] = bf0[27];
580 bf1[28] = bf0[28];
581 bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]);
582 bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]);
583 bf1[31] = bf0[31];
584 range_check(stage, input, bf1, size, stage_range[stage]);
585
586 // stage 7
587 stage++;
588 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
589 bf0 = step;
590 bf1 = output;
591 bf1[0] = bf0[0];
592 bf1[1] = bf0[1];
593 bf1[2] = bf0[2];
594 bf1[3] = bf0[3];
595 bf1[4] = bf0[4];
596 bf1[5] = bf0[5];
597 bf1[6] = bf0[6];
598 bf1[7] = bf0[7];
599 bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
600 bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
601 bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
602 bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
603 bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
604 bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
605 bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
606 bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
607 bf1[16] = bf0[16] + bf0[17];
608 bf1[17] = -bf0[17] + bf0[16];
609 bf1[18] = -bf0[18] + bf0[19];
610 bf1[19] = bf0[19] + bf0[18];
611 bf1[20] = bf0[20] + bf0[21];
612 bf1[21] = -bf0[21] + bf0[20];
613 bf1[22] = -bf0[22] + bf0[23];
614 bf1[23] = bf0[23] + bf0[22];
615 bf1[24] = bf0[24] + bf0[25];
616 bf1[25] = -bf0[25] + bf0[24];
617 bf1[26] = -bf0[26] + bf0[27];
618 bf1[27] = bf0[27] + bf0[26];
619 bf1[28] = bf0[28] + bf0[29];
620 bf1[29] = -bf0[29] + bf0[28];
621 bf1[30] = -bf0[30] + bf0[31];
622 bf1[31] = bf0[31] + bf0[30];
623 range_check(stage, input, bf1, size, stage_range[stage]);
624
625 // stage 8
626 stage++;
627 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
628 bf0 = output;
629 bf1 = step;
630 bf1[0] = bf0[0];
631 bf1[1] = bf0[1];
632 bf1[2] = bf0[2];
633 bf1[3] = bf0[3];
634 bf1[4] = bf0[4];
635 bf1[5] = bf0[5];
636 bf1[6] = bf0[6];
637 bf1[7] = bf0[7];
638 bf1[8] = bf0[8];
639 bf1[9] = bf0[9];
640 bf1[10] = bf0[10];
641 bf1[11] = bf0[11];
642 bf1[12] = bf0[12];
643 bf1[13] = bf0[13];
644 bf1[14] = bf0[14];
645 bf1[15] = bf0[15];
646 bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]);
647 bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]);
648 bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]);
649 bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]);
650 bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]);
651 bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]);
652 bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]);
653 bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]);
654 bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]);
655 bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]);
656 bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]);
657 bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]);
658 bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]);
659 bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]);
660 bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]);
661 bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]);
662 range_check(stage, input, bf1, size, stage_range[stage]);
663
664 // stage 9
665 stage++;
666 bf0 = step;
667 bf1 = output;
668 bf1[0] = bf0[0];
669 bf1[1] = bf0[16];
670 bf1[2] = bf0[8];
671 bf1[3] = bf0[24];
672 bf1[4] = bf0[4];
673 bf1[5] = bf0[20];
674 bf1[6] = bf0[12];
675 bf1[7] = bf0[28];
676 bf1[8] = bf0[2];
677 bf1[9] = bf0[18];
678 bf1[10] = bf0[10];
679 bf1[11] = bf0[26];
680 bf1[12] = bf0[6];
681 bf1[13] = bf0[22];
682 bf1[14] = bf0[14];
683 bf1[15] = bf0[30];
684 bf1[16] = bf0[1];
685 bf1[17] = bf0[17];
686 bf1[18] = bf0[9];
687 bf1[19] = bf0[25];
688 bf1[20] = bf0[5];
689 bf1[21] = bf0[21];
690 bf1[22] = bf0[13];
691 bf1[23] = bf0[29];
692 bf1[24] = bf0[3];
693 bf1[25] = bf0[19];
694 bf1[26] = bf0[11];
695 bf1[27] = bf0[27];
696 bf1[28] = bf0[7];
697 bf1[29] = bf0[23];
698 bf1[30] = bf0[15];
699 bf1[31] = bf0[31];
700 range_check(stage, input, bf1, size, stage_range[stage]);
701}
702
Yaowu Xuf883b422016-08-30 14:01:10 -0700703void av1_fadst4_new(const int32_t *input, int32_t *output,
704 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700705 const int32_t size = 4;
706 const int32_t *cospi;
707
708 int32_t stage = 0;
709 int32_t *bf0, *bf1;
710 int32_t step[4];
711
712 // stage 0;
713 range_check(stage, input, input, size, stage_range[stage]);
714
715 // stage 1;
716 stage++;
717 bf1 = output;
718 bf1[0] = input[3];
719 bf1[1] = input[0];
720 bf1[2] = input[1];
721 bf1[3] = input[2];
722 range_check(stage, input, bf1, size, stage_range[stage]);
723
724 // stage 2
725 stage++;
726 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
727 bf0 = output;
728 bf1 = step;
729 bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]);
730 bf1[1] = half_btf(-cospi[8], bf0[1], cospi[56], bf0[0], cos_bit[stage]);
731 bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]);
732 bf1[3] = half_btf(-cospi[40], bf0[3], cospi[24], bf0[2], cos_bit[stage]);
733 range_check(stage, input, bf1, size, stage_range[stage]);
734
735 // stage 3
736 stage++;
737 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
738 bf0 = step;
739 bf1 = output;
740 bf1[0] = bf0[0] + bf0[2];
741 bf1[1] = bf0[1] + bf0[3];
742 bf1[2] = -bf0[2] + bf0[0];
743 bf1[3] = -bf0[3] + bf0[1];
744 range_check(stage, input, bf1, size, stage_range[stage]);
745
746 // stage 4
747 stage++;
748 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
749 bf0 = output;
750 bf1 = step;
751 bf1[0] = bf0[0];
752 bf1[1] = bf0[1];
753 bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
754 bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
755 range_check(stage, input, bf1, size, stage_range[stage]);
756
757 // stage 5
758 stage++;
759 bf0 = step;
760 bf1 = output;
761 bf1[0] = bf0[0];
762 bf1[1] = -bf0[2];
763 bf1[2] = bf0[3];
764 bf1[3] = -bf0[1];
765 range_check(stage, input, bf1, size, stage_range[stage]);
766}
767
Yaowu Xuf883b422016-08-30 14:01:10 -0700768void av1_fadst8_new(const int32_t *input, int32_t *output,
769 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700770 const int32_t size = 8;
771 const int32_t *cospi;
772
773 int32_t stage = 0;
774 int32_t *bf0, *bf1;
775 int32_t step[8];
776
777 // stage 0;
778 range_check(stage, input, input, size, stage_range[stage]);
779
780 // stage 1;
781 stage++;
782 bf1 = output;
783 bf1[0] = input[7];
784 bf1[1] = input[0];
785 bf1[2] = input[5];
786 bf1[3] = input[2];
787 bf1[4] = input[3];
788 bf1[5] = input[4];
789 bf1[6] = input[1];
790 bf1[7] = input[6];
791 range_check(stage, input, bf1, size, stage_range[stage]);
792
793 // stage 2
794 stage++;
795 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
796 bf0 = output;
797 bf1 = step;
798 bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit[stage]);
799 bf1[1] = half_btf(-cospi[4], bf0[1], cospi[60], bf0[0], cos_bit[stage]);
800 bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit[stage]);
801 bf1[3] = half_btf(-cospi[20], bf0[3], cospi[44], bf0[2], cos_bit[stage]);
802 bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit[stage]);
803 bf1[5] = half_btf(-cospi[36], bf0[5], cospi[28], bf0[4], cos_bit[stage]);
804 bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit[stage]);
805 bf1[7] = half_btf(-cospi[52], bf0[7], cospi[12], bf0[6], cos_bit[stage]);
806 range_check(stage, input, bf1, size, stage_range[stage]);
807
808 // stage 3
809 stage++;
810 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
811 bf0 = step;
812 bf1 = output;
813 bf1[0] = bf0[0] + bf0[4];
814 bf1[1] = bf0[1] + bf0[5];
815 bf1[2] = bf0[2] + bf0[6];
816 bf1[3] = bf0[3] + bf0[7];
817 bf1[4] = -bf0[4] + bf0[0];
818 bf1[5] = -bf0[5] + bf0[1];
819 bf1[6] = -bf0[6] + bf0[2];
820 bf1[7] = -bf0[7] + bf0[3];
821 range_check(stage, input, bf1, size, stage_range[stage]);
822
823 // stage 4
824 stage++;
825 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
826 bf0 = output;
827 bf1 = step;
828 bf1[0] = bf0[0];
829 bf1[1] = bf0[1];
830 bf1[2] = bf0[2];
831 bf1[3] = bf0[3];
832 bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
833 bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
834 bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
835 bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
836 range_check(stage, input, bf1, size, stage_range[stage]);
837
838 // stage 5
839 stage++;
840 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
841 bf0 = step;
842 bf1 = output;
843 bf1[0] = bf0[0] + bf0[2];
844 bf1[1] = bf0[1] + bf0[3];
845 bf1[2] = -bf0[2] + bf0[0];
846 bf1[3] = -bf0[3] + bf0[1];
847 bf1[4] = bf0[4] + bf0[6];
848 bf1[5] = bf0[5] + bf0[7];
849 bf1[6] = -bf0[6] + bf0[4];
850 bf1[7] = -bf0[7] + bf0[5];
851 range_check(stage, input, bf1, size, stage_range[stage]);
852
853 // stage 6
854 stage++;
855 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
856 bf0 = output;
857 bf1 = step;
858 bf1[0] = bf0[0];
859 bf1[1] = bf0[1];
860 bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
861 bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
862 bf1[4] = bf0[4];
863 bf1[5] = bf0[5];
864 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
865 bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
866 range_check(stage, input, bf1, size, stage_range[stage]);
867
868 // stage 7
869 stage++;
870 bf0 = step;
871 bf1 = output;
872 bf1[0] = bf0[0];
873 bf1[1] = -bf0[4];
874 bf1[2] = bf0[6];
875 bf1[3] = -bf0[2];
876 bf1[4] = bf0[3];
877 bf1[5] = -bf0[7];
878 bf1[6] = bf0[5];
879 bf1[7] = -bf0[1];
880 range_check(stage, input, bf1, size, stage_range[stage]);
881}
882
Yaowu Xuf883b422016-08-30 14:01:10 -0700883void av1_fadst16_new(const int32_t *input, int32_t *output,
884 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700885 const int32_t size = 16;
886 const int32_t *cospi;
887
888 int32_t stage = 0;
889 int32_t *bf0, *bf1;
890 int32_t step[16];
891
892 // stage 0;
893 range_check(stage, input, input, size, stage_range[stage]);
894
895 // stage 1;
896 stage++;
897 bf1 = output;
898 bf1[0] = input[15];
899 bf1[1] = input[0];
900 bf1[2] = input[13];
901 bf1[3] = input[2];
902 bf1[4] = input[11];
903 bf1[5] = input[4];
904 bf1[6] = input[9];
905 bf1[7] = input[6];
906 bf1[8] = input[7];
907 bf1[9] = input[8];
908 bf1[10] = input[5];
909 bf1[11] = input[10];
910 bf1[12] = input[3];
911 bf1[13] = input[12];
912 bf1[14] = input[1];
913 bf1[15] = input[14];
914 range_check(stage, input, bf1, size, stage_range[stage]);
915
916 // stage 2
917 stage++;
918 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
919 bf0 = output;
920 bf1 = step;
921 bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit[stage]);
922 bf1[1] = half_btf(-cospi[2], bf0[1], cospi[62], bf0[0], cos_bit[stage]);
923 bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit[stage]);
924 bf1[3] = half_btf(-cospi[10], bf0[3], cospi[54], bf0[2], cos_bit[stage]);
925 bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit[stage]);
926 bf1[5] = half_btf(-cospi[18], bf0[5], cospi[46], bf0[4], cos_bit[stage]);
927 bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit[stage]);
928 bf1[7] = half_btf(-cospi[26], bf0[7], cospi[38], bf0[6], cos_bit[stage]);
929 bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit[stage]);
930 bf1[9] = half_btf(-cospi[34], bf0[9], cospi[30], bf0[8], cos_bit[stage]);
931 bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit[stage]);
932 bf1[11] = half_btf(-cospi[42], bf0[11], cospi[22], bf0[10], cos_bit[stage]);
933 bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit[stage]);
934 bf1[13] = half_btf(-cospi[50], bf0[13], cospi[14], bf0[12], cos_bit[stage]);
935 bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit[stage]);
936 bf1[15] = half_btf(-cospi[58], bf0[15], cospi[6], bf0[14], cos_bit[stage]);
937 range_check(stage, input, bf1, size, stage_range[stage]);
938
939 // stage 3
940 stage++;
941 bf0 = step;
942 bf1 = output;
943 bf1[0] = bf0[0] + bf0[8];
944 bf1[1] = bf0[1] + bf0[9];
945 bf1[2] = bf0[2] + bf0[10];
946 bf1[3] = bf0[3] + bf0[11];
947 bf1[4] = bf0[4] + bf0[12];
948 bf1[5] = bf0[5] + bf0[13];
949 bf1[6] = bf0[6] + bf0[14];
950 bf1[7] = bf0[7] + bf0[15];
951 bf1[8] = -bf0[8] + bf0[0];
952 bf1[9] = -bf0[9] + bf0[1];
953 bf1[10] = -bf0[10] + bf0[2];
954 bf1[11] = -bf0[11] + bf0[3];
955 bf1[12] = -bf0[12] + bf0[4];
956 bf1[13] = -bf0[13] + bf0[5];
957 bf1[14] = -bf0[14] + bf0[6];
958 bf1[15] = -bf0[15] + bf0[7];
959 range_check(stage, input, bf1, size, stage_range[stage]);
960
961 // stage 4
962 stage++;
963 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
964 bf0 = output;
965 bf1 = step;
966 bf1[0] = bf0[0];
967 bf1[1] = bf0[1];
968 bf1[2] = bf0[2];
969 bf1[3] = bf0[3];
970 bf1[4] = bf0[4];
971 bf1[5] = bf0[5];
972 bf1[6] = bf0[6];
973 bf1[7] = bf0[7];
974 bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
975 bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]);
976 bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
977 bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]);
978 bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
979 bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]);
980 bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
981 bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]);
982 range_check(stage, input, bf1, size, stage_range[stage]);
983
984 // stage 5
985 stage++;
986 bf0 = step;
987 bf1 = output;
988 bf1[0] = bf0[0] + bf0[4];
989 bf1[1] = bf0[1] + bf0[5];
990 bf1[2] = bf0[2] + bf0[6];
991 bf1[3] = bf0[3] + bf0[7];
992 bf1[4] = -bf0[4] + bf0[0];
993 bf1[5] = -bf0[5] + bf0[1];
994 bf1[6] = -bf0[6] + bf0[2];
995 bf1[7] = -bf0[7] + bf0[3];
996 bf1[8] = bf0[8] + bf0[12];
997 bf1[9] = bf0[9] + bf0[13];
998 bf1[10] = bf0[10] + bf0[14];
999 bf1[11] = bf0[11] + bf0[15];
1000 bf1[12] = -bf0[12] + bf0[8];
1001 bf1[13] = -bf0[13] + bf0[9];
1002 bf1[14] = -bf0[14] + bf0[10];
1003 bf1[15] = -bf0[15] + bf0[11];
1004 range_check(stage, input, bf1, size, stage_range[stage]);
1005
1006 // stage 6
1007 stage++;
1008 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1009 bf0 = output;
1010 bf1 = step;
1011 bf1[0] = bf0[0];
1012 bf1[1] = bf0[1];
1013 bf1[2] = bf0[2];
1014 bf1[3] = bf0[3];
1015 bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
1016 bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
1017 bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
1018 bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
1019 bf1[8] = bf0[8];
1020 bf1[9] = bf0[9];
1021 bf1[10] = bf0[10];
1022 bf1[11] = bf0[11];
1023 bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
1024 bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]);
1025 bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
1026 bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]);
1027 range_check(stage, input, bf1, size, stage_range[stage]);
1028
1029 // stage 7
1030 stage++;
1031 bf0 = step;
1032 bf1 = output;
1033 bf1[0] = bf0[0] + bf0[2];
1034 bf1[1] = bf0[1] + bf0[3];
1035 bf1[2] = -bf0[2] + bf0[0];
1036 bf1[3] = -bf0[3] + bf0[1];
1037 bf1[4] = bf0[4] + bf0[6];
1038 bf1[5] = bf0[5] + bf0[7];
1039 bf1[6] = -bf0[6] + bf0[4];
1040 bf1[7] = -bf0[7] + bf0[5];
1041 bf1[8] = bf0[8] + bf0[10];
1042 bf1[9] = bf0[9] + bf0[11];
1043 bf1[10] = -bf0[10] + bf0[8];
1044 bf1[11] = -bf0[11] + bf0[9];
1045 bf1[12] = bf0[12] + bf0[14];
1046 bf1[13] = bf0[13] + bf0[15];
1047 bf1[14] = -bf0[14] + bf0[12];
1048 bf1[15] = -bf0[15] + bf0[13];
1049 range_check(stage, input, bf1, size, stage_range[stage]);
1050
1051 // stage 8
1052 stage++;
1053 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1054 bf0 = output;
1055 bf1 = step;
1056 bf1[0] = bf0[0];
1057 bf1[1] = bf0[1];
1058 bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
1059 bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
1060 bf1[4] = bf0[4];
1061 bf1[5] = bf0[5];
1062 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
1063 bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
1064 bf1[8] = bf0[8];
1065 bf1[9] = bf0[9];
1066 bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
1067 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]);
1068 bf1[12] = bf0[12];
1069 bf1[13] = bf0[13];
1070 bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
1071 bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]);
1072 range_check(stage, input, bf1, size, stage_range[stage]);
1073
1074 // stage 9
1075 stage++;
1076 bf0 = step;
1077 bf1 = output;
1078 bf1[0] = bf0[0];
1079 bf1[1] = -bf0[8];
1080 bf1[2] = bf0[12];
1081 bf1[3] = -bf0[4];
1082 bf1[4] = bf0[6];
1083 bf1[5] = -bf0[14];
1084 bf1[6] = bf0[10];
1085 bf1[7] = -bf0[2];
1086 bf1[8] = bf0[3];
1087 bf1[9] = -bf0[11];
1088 bf1[10] = bf0[15];
1089 bf1[11] = -bf0[7];
1090 bf1[12] = bf0[5];
1091 bf1[13] = -bf0[13];
1092 bf1[14] = bf0[9];
1093 bf1[15] = -bf0[1];
1094 range_check(stage, input, bf1, size, stage_range[stage]);
1095}
1096
Yaowu Xuf883b422016-08-30 14:01:10 -07001097void av1_fadst32_new(const int32_t *input, int32_t *output,
1098 const int8_t *cos_bit, const int8_t *stage_range) {
Yaowu Xuc27fc142016-08-22 16:08:15 -07001099 const int32_t size = 32;
1100 const int32_t *cospi;
1101
1102 int32_t stage = 0;
1103 int32_t *bf0, *bf1;
1104 int32_t step[32];
1105
1106 // stage 0;
1107 range_check(stage, input, input, size, stage_range[stage]);
1108
1109 // stage 1;
1110 stage++;
1111 bf1 = output;
1112 bf1[0] = input[31];
1113 bf1[1] = input[0];
1114 bf1[2] = input[29];
1115 bf1[3] = input[2];
1116 bf1[4] = input[27];
1117 bf1[5] = input[4];
1118 bf1[6] = input[25];
1119 bf1[7] = input[6];
1120 bf1[8] = input[23];
1121 bf1[9] = input[8];
1122 bf1[10] = input[21];
1123 bf1[11] = input[10];
1124 bf1[12] = input[19];
1125 bf1[13] = input[12];
1126 bf1[14] = input[17];
1127 bf1[15] = input[14];
1128 bf1[16] = input[15];
1129 bf1[17] = input[16];
1130 bf1[18] = input[13];
1131 bf1[19] = input[18];
1132 bf1[20] = input[11];
1133 bf1[21] = input[20];
1134 bf1[22] = input[9];
1135 bf1[23] = input[22];
1136 bf1[24] = input[7];
1137 bf1[25] = input[24];
1138 bf1[26] = input[5];
1139 bf1[27] = input[26];
1140 bf1[28] = input[3];
1141 bf1[29] = input[28];
1142 bf1[30] = input[1];
1143 bf1[31] = input[30];
1144 range_check(stage, input, bf1, size, stage_range[stage]);
1145
1146 // stage 2
1147 stage++;
1148 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1149 bf0 = output;
1150 bf1 = step;
1151 bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit[stage]);
1152 bf1[1] = half_btf(-cospi[1], bf0[1], cospi[63], bf0[0], cos_bit[stage]);
1153 bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit[stage]);
1154 bf1[3] = half_btf(-cospi[5], bf0[3], cospi[59], bf0[2], cos_bit[stage]);
1155 bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit[stage]);
1156 bf1[5] = half_btf(-cospi[9], bf0[5], cospi[55], bf0[4], cos_bit[stage]);
1157 bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit[stage]);
1158 bf1[7] = half_btf(-cospi[13], bf0[7], cospi[51], bf0[6], cos_bit[stage]);
1159 bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit[stage]);
1160 bf1[9] = half_btf(-cospi[17], bf0[9], cospi[47], bf0[8], cos_bit[stage]);
1161 bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit[stage]);
1162 bf1[11] = half_btf(-cospi[21], bf0[11], cospi[43], bf0[10], cos_bit[stage]);
1163 bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit[stage]);
1164 bf1[13] = half_btf(-cospi[25], bf0[13], cospi[39], bf0[12], cos_bit[stage]);
1165 bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit[stage]);
1166 bf1[15] = half_btf(-cospi[29], bf0[15], cospi[35], bf0[14], cos_bit[stage]);
1167 bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit[stage]);
1168 bf1[17] = half_btf(-cospi[33], bf0[17], cospi[31], bf0[16], cos_bit[stage]);
1169 bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit[stage]);
1170 bf1[19] = half_btf(-cospi[37], bf0[19], cospi[27], bf0[18], cos_bit[stage]);
1171 bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit[stage]);
1172 bf1[21] = half_btf(-cospi[41], bf0[21], cospi[23], bf0[20], cos_bit[stage]);
1173 bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit[stage]);
1174 bf1[23] = half_btf(-cospi[45], bf0[23], cospi[19], bf0[22], cos_bit[stage]);
1175 bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit[stage]);
1176 bf1[25] = half_btf(-cospi[49], bf0[25], cospi[15], bf0[24], cos_bit[stage]);
1177 bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit[stage]);
1178 bf1[27] = half_btf(-cospi[53], bf0[27], cospi[11], bf0[26], cos_bit[stage]);
1179 bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit[stage]);
1180 bf1[29] = half_btf(-cospi[57], bf0[29], cospi[7], bf0[28], cos_bit[stage]);
1181 bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit[stage]);
1182 bf1[31] = half_btf(-cospi[61], bf0[31], cospi[3], bf0[30], cos_bit[stage]);
1183 range_check(stage, input, bf1, size, stage_range[stage]);
1184
1185 // stage 3
1186 stage++;
1187 bf0 = step;
1188 bf1 = output;
1189 bf1[0] = bf0[0] + bf0[16];
1190 bf1[1] = bf0[1] + bf0[17];
1191 bf1[2] = bf0[2] + bf0[18];
1192 bf1[3] = bf0[3] + bf0[19];
1193 bf1[4] = bf0[4] + bf0[20];
1194 bf1[5] = bf0[5] + bf0[21];
1195 bf1[6] = bf0[6] + bf0[22];
1196 bf1[7] = bf0[7] + bf0[23];
1197 bf1[8] = bf0[8] + bf0[24];
1198 bf1[9] = bf0[9] + bf0[25];
1199 bf1[10] = bf0[10] + bf0[26];
1200 bf1[11] = bf0[11] + bf0[27];
1201 bf1[12] = bf0[12] + bf0[28];
1202 bf1[13] = bf0[13] + bf0[29];
1203 bf1[14] = bf0[14] + bf0[30];
1204 bf1[15] = bf0[15] + bf0[31];
1205 bf1[16] = -bf0[16] + bf0[0];
1206 bf1[17] = -bf0[17] + bf0[1];
1207 bf1[18] = -bf0[18] + bf0[2];
1208 bf1[19] = -bf0[19] + bf0[3];
1209 bf1[20] = -bf0[20] + bf0[4];
1210 bf1[21] = -bf0[21] + bf0[5];
1211 bf1[22] = -bf0[22] + bf0[6];
1212 bf1[23] = -bf0[23] + bf0[7];
1213 bf1[24] = -bf0[24] + bf0[8];
1214 bf1[25] = -bf0[25] + bf0[9];
1215 bf1[26] = -bf0[26] + bf0[10];
1216 bf1[27] = -bf0[27] + bf0[11];
1217 bf1[28] = -bf0[28] + bf0[12];
1218 bf1[29] = -bf0[29] + bf0[13];
1219 bf1[30] = -bf0[30] + bf0[14];
1220 bf1[31] = -bf0[31] + bf0[15];
1221 range_check(stage, input, bf1, size, stage_range[stage]);
1222
1223 // stage 4
1224 stage++;
1225 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1226 bf0 = output;
1227 bf1 = step;
1228 bf1[0] = bf0[0];
1229 bf1[1] = bf0[1];
1230 bf1[2] = bf0[2];
1231 bf1[3] = bf0[3];
1232 bf1[4] = bf0[4];
1233 bf1[5] = bf0[5];
1234 bf1[6] = bf0[6];
1235 bf1[7] = bf0[7];
1236 bf1[8] = bf0[8];
1237 bf1[9] = bf0[9];
1238 bf1[10] = bf0[10];
1239 bf1[11] = bf0[11];
1240 bf1[12] = bf0[12];
1241 bf1[13] = bf0[13];
1242 bf1[14] = bf0[14];
1243 bf1[15] = bf0[15];
1244 bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit[stage]);
1245 bf1[17] = half_btf(-cospi[4], bf0[17], cospi[60], bf0[16], cos_bit[stage]);
1246 bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit[stage]);
1247 bf1[19] = half_btf(-cospi[20], bf0[19], cospi[44], bf0[18], cos_bit[stage]);
1248 bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit[stage]);
1249 bf1[21] = half_btf(-cospi[36], bf0[21], cospi[28], bf0[20], cos_bit[stage]);
1250 bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit[stage]);
1251 bf1[23] = half_btf(-cospi[52], bf0[23], cospi[12], bf0[22], cos_bit[stage]);
1252 bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit[stage]);
1253 bf1[25] = half_btf(cospi[60], bf0[25], cospi[4], bf0[24], cos_bit[stage]);
1254 bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit[stage]);
1255 bf1[27] = half_btf(cospi[44], bf0[27], cospi[20], bf0[26], cos_bit[stage]);
1256 bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit[stage]);
1257 bf1[29] = half_btf(cospi[28], bf0[29], cospi[36], bf0[28], cos_bit[stage]);
1258 bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit[stage]);
1259 bf1[31] = half_btf(cospi[12], bf0[31], cospi[52], bf0[30], cos_bit[stage]);
1260 range_check(stage, input, bf1, size, stage_range[stage]);
1261
1262 // stage 5
1263 stage++;
1264 bf0 = step;
1265 bf1 = output;
1266 bf1[0] = bf0[0] + bf0[8];
1267 bf1[1] = bf0[1] + bf0[9];
1268 bf1[2] = bf0[2] + bf0[10];
1269 bf1[3] = bf0[3] + bf0[11];
1270 bf1[4] = bf0[4] + bf0[12];
1271 bf1[5] = bf0[5] + bf0[13];
1272 bf1[6] = bf0[6] + bf0[14];
1273 bf1[7] = bf0[7] + bf0[15];
1274 bf1[8] = -bf0[8] + bf0[0];
1275 bf1[9] = -bf0[9] + bf0[1];
1276 bf1[10] = -bf0[10] + bf0[2];
1277 bf1[11] = -bf0[11] + bf0[3];
1278 bf1[12] = -bf0[12] + bf0[4];
1279 bf1[13] = -bf0[13] + bf0[5];
1280 bf1[14] = -bf0[14] + bf0[6];
1281 bf1[15] = -bf0[15] + bf0[7];
1282 bf1[16] = bf0[16] + bf0[24];
1283 bf1[17] = bf0[17] + bf0[25];
1284 bf1[18] = bf0[18] + bf0[26];
1285 bf1[19] = bf0[19] + bf0[27];
1286 bf1[20] = bf0[20] + bf0[28];
1287 bf1[21] = bf0[21] + bf0[29];
1288 bf1[22] = bf0[22] + bf0[30];
1289 bf1[23] = bf0[23] + bf0[31];
1290 bf1[24] = -bf0[24] + bf0[16];
1291 bf1[25] = -bf0[25] + bf0[17];
1292 bf1[26] = -bf0[26] + bf0[18];
1293 bf1[27] = -bf0[27] + bf0[19];
1294 bf1[28] = -bf0[28] + bf0[20];
1295 bf1[29] = -bf0[29] + bf0[21];
1296 bf1[30] = -bf0[30] + bf0[22];
1297 bf1[31] = -bf0[31] + bf0[23];
1298 range_check(stage, input, bf1, size, stage_range[stage]);
1299
1300 // stage 6
1301 stage++;
1302 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1303 bf0 = output;
1304 bf1 = step;
1305 bf1[0] = bf0[0];
1306 bf1[1] = bf0[1];
1307 bf1[2] = bf0[2];
1308 bf1[3] = bf0[3];
1309 bf1[4] = bf0[4];
1310 bf1[5] = bf0[5];
1311 bf1[6] = bf0[6];
1312 bf1[7] = bf0[7];
1313 bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
1314 bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]);
1315 bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
1316 bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]);
1317 bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
1318 bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]);
1319 bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
1320 bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]);
1321 bf1[16] = bf0[16];
1322 bf1[17] = bf0[17];
1323 bf1[18] = bf0[18];
1324 bf1[19] = bf0[19];
1325 bf1[20] = bf0[20];
1326 bf1[21] = bf0[21];
1327 bf1[22] = bf0[22];
1328 bf1[23] = bf0[23];
1329 bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit[stage]);
1330 bf1[25] = half_btf(-cospi[8], bf0[25], cospi[56], bf0[24], cos_bit[stage]);
1331 bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit[stage]);
1332 bf1[27] = half_btf(-cospi[40], bf0[27], cospi[24], bf0[26], cos_bit[stage]);
1333 bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit[stage]);
1334 bf1[29] = half_btf(cospi[56], bf0[29], cospi[8], bf0[28], cos_bit[stage]);
1335 bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit[stage]);
1336 bf1[31] = half_btf(cospi[24], bf0[31], cospi[40], bf0[30], cos_bit[stage]);
1337 range_check(stage, input, bf1, size, stage_range[stage]);
1338
1339 // stage 7
1340 stage++;
1341 bf0 = step;
1342 bf1 = output;
1343 bf1[0] = bf0[0] + bf0[4];
1344 bf1[1] = bf0[1] + bf0[5];
1345 bf1[2] = bf0[2] + bf0[6];
1346 bf1[3] = bf0[3] + bf0[7];
1347 bf1[4] = -bf0[4] + bf0[0];
1348 bf1[5] = -bf0[5] + bf0[1];
1349 bf1[6] = -bf0[6] + bf0[2];
1350 bf1[7] = -bf0[7] + bf0[3];
1351 bf1[8] = bf0[8] + bf0[12];
1352 bf1[9] = bf0[9] + bf0[13];
1353 bf1[10] = bf0[10] + bf0[14];
1354 bf1[11] = bf0[11] + bf0[15];
1355 bf1[12] = -bf0[12] + bf0[8];
1356 bf1[13] = -bf0[13] + bf0[9];
1357 bf1[14] = -bf0[14] + bf0[10];
1358 bf1[15] = -bf0[15] + bf0[11];
1359 bf1[16] = bf0[16] + bf0[20];
1360 bf1[17] = bf0[17] + bf0[21];
1361 bf1[18] = bf0[18] + bf0[22];
1362 bf1[19] = bf0[19] + bf0[23];
1363 bf1[20] = -bf0[20] + bf0[16];
1364 bf1[21] = -bf0[21] + bf0[17];
1365 bf1[22] = -bf0[22] + bf0[18];
1366 bf1[23] = -bf0[23] + bf0[19];
1367 bf1[24] = bf0[24] + bf0[28];
1368 bf1[25] = bf0[25] + bf0[29];
1369 bf1[26] = bf0[26] + bf0[30];
1370 bf1[27] = bf0[27] + bf0[31];
1371 bf1[28] = -bf0[28] + bf0[24];
1372 bf1[29] = -bf0[29] + bf0[25];
1373 bf1[30] = -bf0[30] + bf0[26];
1374 bf1[31] = -bf0[31] + bf0[27];
1375 range_check(stage, input, bf1, size, stage_range[stage]);
1376
1377 // stage 8
1378 stage++;
1379 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1380 bf0 = output;
1381 bf1 = step;
1382 bf1[0] = bf0[0];
1383 bf1[1] = bf0[1];
1384 bf1[2] = bf0[2];
1385 bf1[3] = bf0[3];
1386 bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
1387 bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
1388 bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
1389 bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
1390 bf1[8] = bf0[8];
1391 bf1[9] = bf0[9];
1392 bf1[10] = bf0[10];
1393 bf1[11] = bf0[11];
1394 bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
1395 bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]);
1396 bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
1397 bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]);
1398 bf1[16] = bf0[16];
1399 bf1[17] = bf0[17];
1400 bf1[18] = bf0[18];
1401 bf1[19] = bf0[19];
1402 bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit[stage]);
1403 bf1[21] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[20], cos_bit[stage]);
1404 bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit[stage]);
1405 bf1[23] = half_btf(cospi[48], bf0[23], cospi[16], bf0[22], cos_bit[stage]);
1406 bf1[24] = bf0[24];
1407 bf1[25] = bf0[25];
1408 bf1[26] = bf0[26];
1409 bf1[27] = bf0[27];
1410 bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit[stage]);
1411 bf1[29] = half_btf(-cospi[16], bf0[29], cospi[48], bf0[28], cos_bit[stage]);
1412 bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit[stage]);
1413 bf1[31] = half_btf(cospi[48], bf0[31], cospi[16], bf0[30], cos_bit[stage]);
1414 range_check(stage, input, bf1, size, stage_range[stage]);
1415
1416 // stage 9
1417 stage++;
1418 bf0 = step;
1419 bf1 = output;
1420 bf1[0] = bf0[0] + bf0[2];
1421 bf1[1] = bf0[1] + bf0[3];
1422 bf1[2] = -bf0[2] + bf0[0];
1423 bf1[3] = -bf0[3] + bf0[1];
1424 bf1[4] = bf0[4] + bf0[6];
1425 bf1[5] = bf0[5] + bf0[7];
1426 bf1[6] = -bf0[6] + bf0[4];
1427 bf1[7] = -bf0[7] + bf0[5];
1428 bf1[8] = bf0[8] + bf0[10];
1429 bf1[9] = bf0[9] + bf0[11];
1430 bf1[10] = -bf0[10] + bf0[8];
1431 bf1[11] = -bf0[11] + bf0[9];
1432 bf1[12] = bf0[12] + bf0[14];
1433 bf1[13] = bf0[13] + bf0[15];
1434 bf1[14] = -bf0[14] + bf0[12];
1435 bf1[15] = -bf0[15] + bf0[13];
1436 bf1[16] = bf0[16] + bf0[18];
1437 bf1[17] = bf0[17] + bf0[19];
1438 bf1[18] = -bf0[18] + bf0[16];
1439 bf1[19] = -bf0[19] + bf0[17];
1440 bf1[20] = bf0[20] + bf0[22];
1441 bf1[21] = bf0[21] + bf0[23];
1442 bf1[22] = -bf0[22] + bf0[20];
1443 bf1[23] = -bf0[23] + bf0[21];
1444 bf1[24] = bf0[24] + bf0[26];
1445 bf1[25] = bf0[25] + bf0[27];
1446 bf1[26] = -bf0[26] + bf0[24];
1447 bf1[27] = -bf0[27] + bf0[25];
1448 bf1[28] = bf0[28] + bf0[30];
1449 bf1[29] = bf0[29] + bf0[31];
1450 bf1[30] = -bf0[30] + bf0[28];
1451 bf1[31] = -bf0[31] + bf0[29];
1452 range_check(stage, input, bf1, size, stage_range[stage]);
1453
1454 // stage 10
1455 stage++;
1456 cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
1457 bf0 = output;
1458 bf1 = step;
1459 bf1[0] = bf0[0];
1460 bf1[1] = bf0[1];
1461 bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
1462 bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
1463 bf1[4] = bf0[4];
1464 bf1[5] = bf0[5];
1465 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
1466 bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
1467 bf1[8] = bf0[8];
1468 bf1[9] = bf0[9];
1469 bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
1470 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]);
1471 bf1[12] = bf0[12];
1472 bf1[13] = bf0[13];
1473 bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
1474 bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]);
1475 bf1[16] = bf0[16];
1476 bf1[17] = bf0[17];
1477 bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit[stage]);
1478 bf1[19] = half_btf(-cospi[32], bf0[19], cospi[32], bf0[18], cos_bit[stage]);
1479 bf1[20] = bf0[20];
1480 bf1[21] = bf0[21];
1481 bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit[stage]);
1482 bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[22], cos_bit[stage]);
1483 bf1[24] = bf0[24];
1484 bf1[25] = bf0[25];
1485 bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit[stage]);
1486 bf1[27] = half_btf(-cospi[32], bf0[27], cospi[32], bf0[26], cos_bit[stage]);
1487 bf1[28] = bf0[28];
1488 bf1[29] = bf0[29];
1489 bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit[stage]);
1490 bf1[31] = half_btf(-cospi[32], bf0[31], cospi[32], bf0[30], cos_bit[stage]);
1491 range_check(stage, input, bf1, size, stage_range[stage]);
1492
1493 // stage 11
1494 stage++;
1495 bf0 = step;
1496 bf1 = output;
1497 bf1[0] = bf0[0];
1498 bf1[1] = -bf0[16];
1499 bf1[2] = bf0[24];
1500 bf1[3] = -bf0[8];
1501 bf1[4] = bf0[12];
1502 bf1[5] = -bf0[28];
1503 bf1[6] = bf0[20];
1504 bf1[7] = -bf0[4];
1505 bf1[8] = bf0[6];
1506 bf1[9] = -bf0[22];
1507 bf1[10] = bf0[30];
1508 bf1[11] = -bf0[14];
1509 bf1[12] = bf0[10];
1510 bf1[13] = -bf0[26];
1511 bf1[14] = bf0[18];
1512 bf1[15] = -bf0[2];
1513 bf1[16] = bf0[3];
1514 bf1[17] = -bf0[19];
1515 bf1[18] = bf0[27];
1516 bf1[19] = -bf0[11];
1517 bf1[20] = bf0[15];
1518 bf1[21] = -bf0[31];
1519 bf1[22] = bf0[23];
1520 bf1[23] = -bf0[7];
1521 bf1[24] = bf0[5];
1522 bf1[25] = -bf0[21];
1523 bf1[26] = bf0[29];
1524 bf1[27] = -bf0[13];
1525 bf1[28] = bf0[9];
1526 bf1[29] = -bf0[25];
1527 bf1[30] = bf0[17];
1528 bf1[31] = -bf0[1];
1529 range_check(stage, input, bf1, size, stage_range[stage]);
1530}