;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

;TODO(cd): adjust these constants to be able to use vqdmulh for faster
; dct_const_round_shift(a * b) within butterfly calculations.
cospi_1_64  EQU 16364
cospi_2_64  EQU 16305
cospi_3_64  EQU 16207
cospi_4_64  EQU 16069
cospi_5_64  EQU 15893
cospi_6_64  EQU 15679
cospi_7_64  EQU 15426
cospi_8_64  EQU 15137
cospi_9_64  EQU 14811
cospi_10_64 EQU 14449
cospi_11_64 EQU 14053
cospi_12_64 EQU 13623
cospi_13_64 EQU 13160
cospi_14_64 EQU 12665
cospi_15_64 EQU 12140
cospi_16_64 EQU 11585
cospi_17_64 EQU 11003
cospi_18_64 EQU 10394
cospi_19_64 EQU  9760
cospi_20_64 EQU  9102
cospi_21_64 EQU  8423
cospi_22_64 EQU  7723
cospi_23_64 EQU  7005
cospi_24_64 EQU  6270
cospi_25_64 EQU  5520
cospi_26_64 EQU  4756
cospi_27_64 EQU  3981
cospi_28_64 EQU  3196
cospi_29_64 EQU  2404
cospi_30_64 EQU  1606
cospi_31_64 EQU   804
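; These appear to be the usual Q14 fixed-point DCT constants, i.e.
; cospi_N_64 ~= ROUND(cos(N * PI / 64) * 2^14); for example cospi_16_64 =
; 11585 ~= 16384 * cos(PI / 4). dct_const_round_shift() then corresponds to
; the rounding narrow by #14 done in DO_BUTTERFLY below.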


    EXPORT  |aom_idct32x32_1024_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

    AREA Block, CODE, READONLY

    ; --------------------------------------------------------------------------
    ; Load from transposed_buffer
    ;   q14 = transposed_buffer[first_offset]
    ;   q13 = transposed_buffer[second_offset]
    ;   for proper address calculation, the last offset used when manipulating
    ;   transposed_buffer must be passed in. Use 0 for the first use.
    MACRO
    LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
    ; address calculation with proper stride and loading
    add r0, #($first_offset - $prev_offset )*8*2
    vld1.s16 {q14}, [r0]
    add r0, #($second_offset - $first_offset)*8*2
    vld1.s16 {q13}, [r0]
    ; (used) two registers (q14, q13)
    MEND
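    ; Example, as used for the first butterfly below:
    ;   LOAD_FROM_TRANSPOSED 0, 1, 31
    ; steps r0 from row 0 to row 1 of transpose_buffer (rows are 8 int16_t,
    ; i.e. 16 bytes), loads that row into q14, then steps to row 31 and loads
    ; it into q13.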
    ; --------------------------------------------------------------------------
    ; Load from output (used as temporary storage)
    ;   reg1 = output[first_offset]
    ;   reg2 = output[second_offset]
    ;   for proper address calculation, the last offset used when manipulating
    ;   output (whether reading or storing) must be passed in. Use 0 for the
    ;   first use.
    MACRO
    LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
    ; address calculation with proper stride and loading
    add r1, #($first_offset - $prev_offset )*32*2
    vld1.s16 {$reg1}, [r1]
    add r1, #($second_offset - $first_offset)*32*2
    vld1.s16 {$reg2}, [r1]
    ; (used) two registers ($reg1, $reg2)
    MEND
    ; --------------------------------------------------------------------------
    ; Store into output (sometimes used as temporary storage)
    ;   output[first_offset] = reg1
    ;   output[second_offset] = reg2
    ;   for proper address calculation, the last offset used when manipulating
    ;   output (whether reading or storing) must be passed in. Use 0 for the
    ;   first use.
    MACRO
    STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
    ; address calculation with proper stride and storing
    add r1, #($first_offset - $prev_offset )*32*2
    vst1.16 {$reg1}, [r1]
    add r1, #($second_offset - $first_offset)*32*2
    vst1.16 {$reg2}, [r1]
    MEND
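    ; The offsets passed to LOAD_FROM_OUTPUT/STORE_IN_OUTPUT are row indices in
    ; a 32x32 int16_t buffer, hence the *32*2 byte scaling (LOAD_FROM_TRANSPOSED
    ; works on 8-wide rows, hence *8*2).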
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ;   q6-q9 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_CENTER_RESULTS
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d8}, [r10], r2
    vld1.s16 {d11}, [r9], r11
    vld1.s16 {d9}, [r10]
    vld1.s16 {d10}, [r9]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q7, q7, #6
    vrshr.s16 q8, q8, #6
    vrshr.s16 q9, q9, #6
    vrshr.s16 q6, q6, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q7, q7, d9
    vaddw.u8 q8, q8, d10
    vaddw.u8 q9, q9, d11
    vaddw.u8 q6, q6, d8
    ; clip pixel
    vqmovun.s16 d9, q7
    vqmovun.s16 d10, q8
    vqmovun.s16 d11, q9
    vqmovun.s16 d8, q6
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d9}, [r10], r11
    vst1.16 {d10}, [r9], r2
    vst1.16 {d8}, [r10]
    vst1.16 {d11}, [r9]
    ; update pointers (by dest_stride * 2)
    sub r9, r9, r2, lsl #1
    add r10, r10, r2, lsl #1
    MEND
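    ; Roughly, in C terms (a sketch, not lifted from the C reference):
    ;   dest[x] = clip_pixel(dest[x] + ROUND_POWER_OF_TWO(out[x], 6));
    ; vrshr.s16 #6 is the rounding shift, vaddw.u8 widens and adds the current
    ; pixels, and vqmovun.s16 narrows back to u8 with saturation (the clip).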
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ;   q6-q9 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_CENTER_RESULTS_LAST
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d8}, [r10], r2
    vld1.s16 {d11}, [r9], r11
    vld1.s16 {d9}, [r10]
    vld1.s16 {d10}, [r9]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q7, q7, #6
    vrshr.s16 q8, q8, #6
    vrshr.s16 q9, q9, #6
    vrshr.s16 q6, q6, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q7, q7, d9
    vaddw.u8 q8, q8, d10
    vaddw.u8 q9, q9, d11
    vaddw.u8 q6, q6, d8
    ; clip pixel
    vqmovun.s16 d9, q7
    vqmovun.s16 d10, q8
    vqmovun.s16 d11, q9
    vqmovun.s16 d8, q6
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d9}, [r10], r11
    vst1.16 {d10}, [r9], r2
    vst1.16 {d8}, [r10]!
    vst1.16 {d11}, [r9]!
    ; update pointers (by dest_stride * 2)
    sub r9, r9, r2, lsl #1
    add r10, r10, r2, lsl #1
    MEND
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ;   q4-q7 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_EXTREME_RESULTS
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d4}, [r7], r2
    vld1.s16 {d7}, [r6], r11
    vld1.s16 {d5}, [r7]
    vld1.s16 {d6}, [r6]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q5, q5, #6
    vrshr.s16 q6, q6, #6
    vrshr.s16 q7, q7, #6
    vrshr.s16 q4, q4, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q5, q5, d5
    vaddw.u8 q6, q6, d6
    vaddw.u8 q7, q7, d7
    vaddw.u8 q4, q4, d4
    ; clip pixel
    vqmovun.s16 d5, q5
    vqmovun.s16 d6, q6
    vqmovun.s16 d7, q7
    vqmovun.s16 d4, q4
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d5}, [r7], r11
    vst1.16 {d6}, [r6], r2
    vst1.16 {d7}, [r6]
    vst1.16 {d4}, [r7]
    ; update pointers (by dest_stride * 2)
    sub r6, r6, r2, lsl #1
    add r7, r7, r2, lsl #1
    MEND
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ;   q4-q7 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_EXTREME_RESULTS_LAST
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d4}, [r7], r2
    vld1.s16 {d7}, [r6], r11
    vld1.s16 {d5}, [r7]
    vld1.s16 {d6}, [r6]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q5, q5, #6
    vrshr.s16 q6, q6, #6
    vrshr.s16 q7, q7, #6
    vrshr.s16 q4, q4, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q5, q5, d5
    vaddw.u8 q6, q6, d6
    vaddw.u8 q7, q7, d7
    vaddw.u8 q4, q4, d4
    ; clip pixel
    vqmovun.s16 d5, q5
    vqmovun.s16 d6, q6
    vqmovun.s16 d7, q7
    vqmovun.s16 d4, q4
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d5}, [r7], r11
    vst1.16 {d6}, [r6], r2
    vst1.16 {d7}, [r6]!
    vst1.16 {d4}, [r7]!
    ; update pointers (by dest_stride * 2)
    sub r6, r6, r2, lsl #1
    add r7, r7, r2, lsl #1
    MEND
    ; --------------------------------------------------------------------------
    ; Touches q8-q12, q15 (q13-q14 are preserved)
    ; valid output registers are anything but q8-q11
    MACRO
    DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    ; TODO(cd): have special case to re-use constants when they are similar for
    ;           consecutive butterflies
    ; TODO(cd): have special case when both constants are the same, do the
    ;           additions/subtractions before the multiplies.
    ; generate the constants
    ; generate scalar constants
    mov r8, #$first_constant & 0xFF00
    mov r12, #$second_constant & 0xFF00
    add r8, #$first_constant & 0x00FF
    add r12, #$second_constant & 0x00FF
    ; generate vector constants
    vdup.16 d30, r8
    vdup.16 d31, r12
    ; (used) two for inputs (regA-regD), one for constants (q15)
    ; do some multiplications (ordered for maximum latency hiding)
    vmull.s16 q8, $regC, d30
    vmull.s16 q10, $regA, d31
    vmull.s16 q9, $regD, d30
    vmull.s16 q11, $regB, d31
    vmull.s16 q12, $regC, d31
    ; (used) five for intermediate (q8-q12), one for constants (q15)
    ; do some additions/subtractions (to get back to two registers)
    vsub.s32 q8, q8, q10
    vsub.s32 q9, q9, q11
    ; do more multiplications (ordered for maximum latency hiding)
    vmull.s16 q10, $regD, d31
    vmull.s16 q11, $regA, d30
    vmull.s16 q15, $regB, d30
    ; (used) six for intermediate (q8-q12, q15)
    ; do more additions/subtractions
    vadd.s32 q11, q12, q11
    vadd.s32 q10, q10, q15
    ; (used) four for intermediate (q8-q11)
    ; dct_const_round_shift
    vqrshrn.s32 $reg1, q8, #14
    vqrshrn.s32 $reg2, q9, #14
    vqrshrn.s32 $reg3, q11, #14
    vqrshrn.s32 $reg4, q10, #14
    ; (used) two q registers for results (i.e. four d registers)
    MEND
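    ; In the C reference's terms, DO_BUTTERFLY computes (sketch):
    ;   reg1:reg2 = dct_const_round_shift(regC:regD * first_constant
    ;                                     - regA:regB * second_constant)
    ;   reg3:reg4 = dct_const_round_shift(regC:regD * second_constant
    ;                                     + regA:regB * first_constant)
    ; i.e. a rotation by the two cospi constants, with the >> 14 rounding done
    ; by the vqrshrn.s32 instructions above.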
    ; --------------------------------------------------------------------------
    ; Touches q8-q12, q15 (q13-q14 are preserved)
    ; valid output registers are anything but q8-q11
    MACRO
    DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    MEND
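    ; DO_BUTTERFLY_STD simply fixes the inputs to q14 (d28, d29) and q13
    ; (d26, d27), which is why the surrounding code stages its operands in
    ; q13/q14 before each call.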
    ; --------------------------------------------------------------------------

;void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
;
; r0  int16_t *input,
; r1  uint8_t *dest,
; r2  int dest_stride)
; loop counters
; r4  bands loop counter
; r5  pass loop counter
; r8  transpose loop counter
; combine-add pointers
; r6  dest + 31 * dest_stride, descending (30, 29, 28, ...)
; r7  dest +  0 * dest_stride, ascending (1, 2, 3, ...)
; r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...)

|aom_idct32x32_1024_add_neon| PROC
    ; This function does the idct32x32 transform in two passes.
    ;
    ; Each pass is done by transposing the input and then doing a 1d transform
    ; on columns. In the first pass, the transposed columns are the original
    ; rows. In the second pass, after the transposition, the columns are the
    ; original columns.
    ; The 1d transform is done by looping over bands of eight columns (the
    ; idct32_bands loop). For each band, the transform input transposition
    ; is done on demand, one band of four 8x8 matrices at a time. The four
    ; matrices are transposed by pairs (the idct32_transpose_pair loop).
    push {r4-r11}
    vpush {d8-d15}
    ; stack operation
    ; internal buffer that eight input lines are transposed into before being
    ; transformed
    ;   int16_t transpose_buffer[32 * 8];
    ;   at sp + [4096, 4607]
    ; results of the first pass (transpose and transform rows)
    ;   int16_t pass1[32 * 32];
    ;   at sp + [0, 2047]
    ; results of the second pass (transpose and transform columns)
    ;   int16_t pass2[32 * 32];
    ;   at sp + [2048, 4095]
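    ; Rough data flow: pass 1 reads the caller's input through transpose_buffer
    ; and writes its results to pass1; pass 2 re-reads pass1 the same way, uses
    ; pass2 as temporary storage, and combine-adds the final values into dest
    ; via the STORE_COMBINE_* macros.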
    sub sp, sp, #512+2048+2048

    ; r6  = dest + 31 * dest_stride
    ; r7  = dest +  0 * dest_stride
    ; r9  = dest + 15 * dest_stride
    ; r10 = dest + 16 * dest_stride
    rsb r6, r2, r2, lsl #5
    rsb r9, r2, r2, lsl #4
    add r10, r1, r2, lsl #4
    mov r7, r1
    add r6, r6, r1
    add r9, r9, r1
    ; r11 = -dest_stride
    neg r11, r2
    ; r3 = input
    mov r3, r0
    ; parameters for first pass
    ; r0 = transpose_buffer[32 * 8]
    add r0, sp, #4096
    ; r1 = pass1[32 * 32]
    mov r1, sp

    mov r5, #0 ; initialize pass loop counter
idct32_pass_loop
    mov r4, #4 ; initialize bands loop counter
idct32_bands_loop
    mov r8, #2 ; initialize transpose loop counter
idct32_transpose_pair_loop
    ; Load two horizontally consecutive 8x8 16-bit data matrices. The first one
    ; into q8-q15 and the second one into q0-q7. There is a stride of 64,
    ; adjusted to 32 because of the two post-increments.
    vld1.s16 {q8}, [r3]!
    vld1.s16 {q0}, [r3]!
    add r3, #32
    vld1.s16 {q9}, [r3]!
    vld1.s16 {q1}, [r3]!
    add r3, #32
    vld1.s16 {q10}, [r3]!
    vld1.s16 {q2}, [r3]!
    add r3, #32
    vld1.s16 {q11}, [r3]!
    vld1.s16 {q3}, [r3]!
    add r3, #32
    vld1.s16 {q12}, [r3]!
    vld1.s16 {q4}, [r3]!
    add r3, #32
    vld1.s16 {q13}, [r3]!
    vld1.s16 {q5}, [r3]!
    add r3, #32
    vld1.s16 {q14}, [r3]!
    vld1.s16 {q6}, [r3]!
    add r3, #32
    vld1.s16 {q15}, [r3]!
    vld1.s16 {q7}, [r3]!

    ; Transpose the two 8x8 16-bit data matrices.
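    ; The transpose is done in three steps: vswp exchanges 64-bit halves (the
    ; off-diagonal 4x4 blocks), vtrn.32 then transposes 2x2 blocks of 32-bit
    ; pairs, and vtrn.16 finishes with the individual 16-bit elements.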
    vswp d17, d24
    vswp d23, d30
    vswp d21, d28
    vswp d19, d26
    vswp d1, d8
    vswp d7, d14
    vswp d5, d12
    vswp d3, d10
    vtrn.32 q8, q10
    vtrn.32 q9, q11
    vtrn.32 q12, q14
    vtrn.32 q13, q15
    vtrn.32 q0, q2
    vtrn.32 q1, q3
    vtrn.32 q4, q6
    vtrn.32 q5, q7
    vtrn.16 q8, q9
    vtrn.16 q10, q11
    vtrn.16 q12, q13
    vtrn.16 q14, q15
    vtrn.16 q0, q1
    vtrn.16 q2, q3
    vtrn.16 q4, q5
    vtrn.16 q6, q7

    ; Store both matrices after each other. There is a stride of 32, which
    ; adjusts to nothing because of the post-increments.
    vst1.16 {q8}, [r0]!
    vst1.16 {q9}, [r0]!
    vst1.16 {q10}, [r0]!
    vst1.16 {q11}, [r0]!
    vst1.16 {q12}, [r0]!
    vst1.16 {q13}, [r0]!
    vst1.16 {q14}, [r0]!
    vst1.16 {q15}, [r0]!
    vst1.16 {q0}, [r0]!
    vst1.16 {q1}, [r0]!
    vst1.16 {q2}, [r0]!
    vst1.16 {q3}, [r0]!
    vst1.16 {q4}, [r0]!
    vst1.16 {q5}, [r0]!
    vst1.16 {q6}, [r0]!
    vst1.16 {q7}, [r0]!

    ; increment pointers by adjusted stride (not necessary for r0/out)
    ; go back by 7*32 for the seven lines moved fully by read and add
    ; go back by 32 for the eighth line only read
    ; advance by 16*2 to go to the next pair
    sub r3, r3, #7*32*2 + 32 - 16*2
    ; transpose pair loop processing
    subs r8, r8, #1
    bne idct32_transpose_pair_loop

    ; restore r0/transpose_buffer to its original value
    sub r0, r0, #32*8*2

    ; Instead of doing the transforms stage by stage, it is done by loading
    ; some input values and doing as many stages as possible to minimize the
    ; storing/loading of intermediate results. To fit within registers, the
    ; final coefficients are cut into four blocks:
    ; BLOCK A: 16-19,28-31
    ; BLOCK B: 20-23,24-27
    ; BLOCK C: 8-10,11-15
    ; BLOCK D: 0-3,4-7
    ; Blocks A and C are straight calculation through the various stages. In
    ; block B, further calculations are performed using the results from
    ; block A. In block D, further calculations are performed using the results
    ; from block C and then the final calculations are done using results from
    ; blocks A and B which have been combined at the end of block B.

    ; --------------------------------------------------------------------------
    ; BLOCK A: 16-19,28-31
    ; --------------------------------------------------------------------------
    ; generate 16,17,30,31
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64;
    ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64;
    ;step1b[16][i] = dct_const_round_shift(temp1);
    ;step1b[31][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 0, 1, 31
    DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
    ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
    ;step1b[17][i] = dct_const_round_shift(temp1);
    ;step1b[30][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 31, 17, 15
    DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[16] = step1b[16][i] + step1b[17][i];
    ;step2[17] = step1b[16][i] - step1b[17][i];
    ;step2[30] = -step1b[30][i] + step1b[31][i];
    ;step2[31] = step1b[30][i] + step1b[31][i];
    vadd.s16 q4, q0, q1
    vsub.s16 q13, q0, q1
    vadd.s16 q6, q2, q3
    vsub.s16 q14, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
    ;temp2 = step1b[30][i] * cospi_4_64 - step1b[17][i] * cospi_28_64;
    ;step3[17] = dct_const_round_shift(temp1);
    ;step3[30] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; generate 18,19,28,29
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
    ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64;
    ;step1b[18][i] = dct_const_round_shift(temp1);
    ;step1b[29][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 15, 9, 23
    DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64;
    ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
    ;step1b[19][i] = dct_const_round_shift(temp1);
    ;step1b[28][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 23, 25, 7
    DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[18] = -step1b[18][i] + step1b[19][i];
    ;step2[19] = step1b[18][i] + step1b[19][i];
    ;step2[28] = step1b[28][i] + step1b[29][i];
    ;step2[29] = step1b[28][i] - step1b[29][i];
    vsub.s16 q13, q3, q2
    vadd.s16 q3, q3, q2
    vsub.s16 q14, q1, q0
    vadd.s16 q2, q1, q0
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64);
    ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
    ;step3[29] = dct_const_round_shift(temp1);
    ;step3[18] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
    ; --------------------------------------------------------------------------
    ; combine 16-19,28-31
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[16] = step1b[16][i] + step1b[19][i];
    ;step1[17] = step1b[17][i] + step1b[18][i];
    ;step1[18] = step1b[17][i] - step1b[18][i];
    ;step1[29] = step1b[30][i] - step1b[29][i];
    ;step1[30] = step1b[30][i] + step1b[29][i];
    ;step1[31] = step1b[31][i] + step1b[28][i];
    vadd.s16 q8, q4, q2
    vadd.s16 q9, q5, q0
    vadd.s16 q10, q7, q1
    vadd.s16 q15, q6, q3
    vsub.s16 q13, q5, q0
    vsub.s16 q14, q7, q1
    STORE_IN_OUTPUT 0, 16, 31, q8, q15
    STORE_IN_OUTPUT 31, 17, 30, q9, q10
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
    ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64;
    ;step2[18] = dct_const_round_shift(temp1);
    ;step2[29] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
    STORE_IN_OUTPUT 30, 29, 18, q1, q0
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[19] = step1b[16][i] - step1b[19][i];
    ;step1[28] = step1b[31][i] - step1b[28][i];
    vsub.s16 q13, q4, q2
    vsub.s16 q14, q6, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
    ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64;
    ;step2[19] = dct_const_round_shift(temp1);
    ;step2[28] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
    STORE_IN_OUTPUT 18, 19, 28, q4, q6
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK B: 20-23,24-27
    ; --------------------------------------------------------------------------
    ; generate 20,21,26,27
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
    ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64;
    ;step1b[20][i] = dct_const_round_shift(temp1);
    ;step1b[27][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 7, 5, 27
    DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
    ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
    ;step1b[21][i] = dct_const_round_shift(temp1);
    ;step1b[26][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 27, 21, 11
    DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[20] = step1b[20][i] + step1b[21][i];
    ;step2[21] = step1b[20][i] - step1b[21][i];
    ;step2[26] = -step1b[26][i] + step1b[27][i];
    ;step2[27] = step1b[26][i] + step1b[27][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
    ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
    ;step3[21] = dct_const_round_shift(temp1);
    ;step3[26] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 22,23,24,25
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
    ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
    ;step1b[22][i] = dct_const_round_shift(temp1);
    ;step1b[25][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 11, 13, 19
    DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64;
    ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
    ;step1b[23][i] = dct_const_round_shift(temp1);
    ;step1b[24][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 19, 29, 3
    DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[22] = -step1b[22][i] + step1b[23][i];
    ;step2[23] = step1b[22][i] + step1b[23][i];
    ;step2[24] = step1b[24][i] + step1b[25][i];
    ;step2[25] = step1b[24][i] - step1b[25][i];
    vsub.s16 q14, q4, q5
    vadd.s16 q5, q4, q5
    vsub.s16 q13, q6, q7
    vadd.s16 q6, q6, q7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
    ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
    ;step3[25] = dct_const_round_shift(temp1);
    ;step3[22] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
    ; --------------------------------------------------------------------------
    ; combine 20-23,24-27
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[22] = step1b[22][i] + step1b[21][i];
    ;step1[23] = step1b[23][i] + step1b[20][i];
    vadd.s16 q10, q7, q1
    vadd.s16 q11, q5, q0
    ;step1[24] = step1b[24][i] + step1b[27][i];
    ;step1[25] = step1b[25][i] + step1b[26][i];
    vadd.s16 q12, q6, q2
    vadd.s16 q15, q4, q3
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[16] = step1b[16][i] + step1b[23][i];
    ;step3[17] = step1b[17][i] + step1b[22][i];
    ;step3[22] = step1b[17][i] - step1b[22][i];
    ;step3[23] = step1b[16][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
    vadd.s16 q8, q14, q11
    vadd.s16 q9, q13, q10
    vsub.s16 q13, q13, q10
    vsub.s16 q11, q14, q11
    STORE_IN_OUTPUT 17, 17, 16, q9, q8
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[24] = step1b[31][i] - step1b[24][i];
    ;step3[25] = step1b[30][i] - step1b[25][i];
    ;step3[30] = step1b[30][i] + step1b[25][i];
    ;step3[31] = step1b[31][i] + step1b[24][i];
    LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
    vsub.s16 q8, q9, q12
    vadd.s16 q10, q14, q15
    vsub.s16 q14, q14, q15
    vadd.s16 q12, q9, q12
    STORE_IN_OUTPUT 31, 30, 31, q10, q12
    ; --------------------------------------------------------------------------
    ; TODO(cd) do some register allocation change to remove these push/pop
    vpush {q8}  ; [24]
    vpush {q11} ; [23]
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
    ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
    ;step1[22] = dct_const_round_shift(temp1);
    ;step1[25] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 31, 25, 22, q14, q13
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
    ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
    ;step1[23] = dct_const_round_shift(temp1);
    ;step1[24] = dct_const_round_shift(temp2);
    ; TODO(cd) do some register allocation change to remove these push/pop
    vpop {q13} ; [23]
    vpop {q14} ; [24]
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 22, 24, 23, q14, q13
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[20] = step1b[23][i] - step1b[20][i];
    ;step1[27] = step1b[24][i] - step1b[27][i];
    vsub.s16 q14, q5, q0
    vsub.s16 q13, q6, q2
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64);
    ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
    ;step2[27] = dct_const_round_shift(temp1);
    ;step2[20] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[21] = step1b[22][i] - step1b[21][i];
    ;step1[26] = step1b[25][i] - step1b[26][i];
    vsub.s16 q14, q7, q1
    vsub.s16 q13, q4, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64);
    ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
    ;step2[26] = dct_const_round_shift(temp1);
    ;step2[21] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[18] = step1b[18][i] + step1b[21][i];
    ;step3[19] = step1b[19][i] + step1b[20][i];
    ;step3[20] = step1b[19][i] - step1b[20][i];
    ;step3[21] = step1b[18][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
    vadd.s16 q8, q14, q1
    vadd.s16 q9, q13, q6
    vsub.s16 q13, q13, q6
    vsub.s16 q1, q14, q1
    STORE_IN_OUTPUT 19, 18, 19, q8, q9
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[27] = step1b[28][i] - step1b[27][i];
    ;step3[28] = step1b[28][i] + step1b[27][i];
    ;step3[29] = step1b[29][i] + step1b[26][i];
    ;step3[26] = step1b[29][i] - step1b[26][i];
    LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
    vsub.s16 q14, q8, q5
    vadd.s16 q10, q8, q5
    vadd.s16 q11, q9, q0
    vsub.s16 q0, q9, q0
    STORE_IN_OUTPUT 29, 28, 29, q10, q11
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
    ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
    ;step1[20] = dct_const_round_shift(temp1);
    ;step1[27] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 29, 20, 27, q13, q14
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
    ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
    ;step1[21] = dct_const_round_shift(temp1);
    ;step1[26] = dct_const_round_shift(temp2);
    DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
    STORE_IN_OUTPUT 27, 21, 26, q1, q0
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK C: 8-10,11-15
    ; --------------------------------------------------------------------------
    ; generate 8,9,14,15
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
    ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
    ;step2[8] = dct_const_round_shift(temp1);
    ;step2[15] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 3, 2, 30
    DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
    ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
    ;step2[9] = dct_const_round_shift(temp1);
    ;step2[14] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 30, 18, 14
    DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;step3[8] = step1b[8][i] + step1b[9][i];
    ;step3[9] = step1b[8][i] - step1b[9][i];
    ;step3[14] = step1b[15][i] - step1b[14][i];
    ;step3[15] = step1b[15][i] + step1b[14][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
    ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64;
    ;step1[9] = dct_const_round_shift(temp1);
    ;step1[14] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 10,11,12,13
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
    ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
    ;step2[10] = dct_const_round_shift(temp1);
    ;step2[13] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 14, 10, 22
    DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
    ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
    ;step2[11] = dct_const_round_shift(temp1);
    ;step2[12] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 22, 26, 6
    DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;step3[10] = step1b[11][i] - step1b[10][i];
    ;step3[11] = step1b[11][i] + step1b[10][i];
    ;step3[12] = step1b[12][i] + step1b[13][i];
    ;step3[13] = step1b[12][i] - step1b[13][i];
    vsub.s16 q14, q4, q5
    vadd.s16 q5, q4, q5
    vsub.s16 q13, q6, q7
    vadd.s16 q6, q6, q7
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64);
    ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
    ;step1[13] = dct_const_round_shift(temp1);
    ;step1[10] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
    ; --------------------------------------------------------------------------
    ; combine 8-10,11-15
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[8] = step1b[8][i] + step1b[11][i];
    ;step2[9] = step1b[9][i] + step1b[10][i];
    ;step2[10] = step1b[9][i] - step1b[10][i];
    vadd.s16 q8, q0, q5
    vadd.s16 q9, q1, q7
    vsub.s16 q13, q1, q7
    ;step2[13] = step1b[14][i] - step1b[13][i];
    ;step2[14] = step1b[14][i] + step1b[13][i];
    ;step2[15] = step1b[15][i] + step1b[12][i];
    vsub.s16 q14, q3, q4
    vadd.s16 q10, q3, q4
    vadd.s16 q15, q2, q6
    STORE_IN_OUTPUT 26, 8, 15, q8, q15
    STORE_IN_OUTPUT 15, 9, 14, q9, q10
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
    ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
    ;step3[10] = dct_const_round_shift(temp1);
    ;step3[13] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    STORE_IN_OUTPUT 14, 13, 10, q3, q1
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[11] = step1b[8][i] - step1b[11][i];
    ;step2[12] = step1b[15][i] - step1b[12][i];
    vsub.s16 q13, q0, q5
    vsub.s16 q14, q2, q6
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
    ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
    ;step3[11] = dct_const_round_shift(temp1);
    ;step3[12] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    STORE_IN_OUTPUT 10, 11, 12, q1, q3
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK D: 0-3,4-7
    ; --------------------------------------------------------------------------
    ; generate 4,5,6,7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
    ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
    ;step3[4] = dct_const_round_shift(temp1);
    ;step3[7] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 6, 4, 28
    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
    ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
    ;step3[5] = dct_const_round_shift(temp1);
    ;step3[6] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 28, 20, 12
    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[4] = step1b[4][i] + step1b[5][i];
    ;step1[5] = step1b[4][i] - step1b[5][i];
    ;step1[6] = step1b[7][i] - step1b[6][i];
    ;step1[7] = step1b[7][i] + step1b[6][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
    ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
    ;step2[5] = dct_const_round_shift(temp1);
    ;step2[6] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 0,1,2,3
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
    ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
    ;step1[1] = dct_const_round_shift(temp1);
    ;step1[0] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 12, 0, 16
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
    ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
    ;step1[2] = dct_const_round_shift(temp1);
    ;step1[3] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 16, 8, 24
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[0] = step1b[0][i] + step1b[3][i];
    ;step2[1] = step1b[1][i] + step1b[2][i];
    ;step2[2] = step1b[1][i] - step1b[2][i];
    ;step2[3] = step1b[0][i] - step1b[3][i];
    vadd.s16 q4, q7, q6
    vsub.s16 q7, q7, q6
    vsub.s16 q6, q5, q14
    vadd.s16 q5, q5, q14
    ; --------------------------------------------------------------------------
    ; combine 0-3,4-7
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[0] = step1b[0][i] + step1b[7][i];
    ;step3[1] = step1b[1][i] + step1b[6][i];
    ;step3[2] = step1b[2][i] + step1b[5][i];
    ;step3[3] = step1b[3][i] + step1b[4][i];
    vadd.s16 q8, q4, q2
    vadd.s16 q9, q5, q3
    vadd.s16 q10, q6, q1
    vadd.s16 q11, q7, q0
    ;step3[4] = step1b[3][i] - step1b[4][i];
    ;step3[5] = step1b[2][i] - step1b[5][i];
    ;step3[6] = step1b[1][i] - step1b[6][i];
    ;step3[7] = step1b[0][i] - step1b[7][i];
    vsub.s16 q12, q7, q0
    vsub.s16 q13, q6, q1
    vsub.s16 q14, q5, q3
    vsub.s16 q15, q4, q2
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[0] = step1b[0][i] + step1b[15][i];
    ;step1[1] = step1b[1][i] + step1b[14][i];
    ;step1[14] = step1b[1][i] - step1b[14][i];
    ;step1[15] = step1b[0][i] - step1b[15][i];
    LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
    vadd.s16 q2, q8, q1
    vadd.s16 q3, q9, q0
    vsub.s16 q4, q9, q0
    vsub.s16 q5, q8, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[14 * 32] = step1b[14][i] + step1b[17][i];
    ;output[15 * 32] = step1b[15][i] + step1b[16][i];
    ;output[16 * 32] = step1b[15][i] - step1b[16][i];
    ;output[17 * 32] = step1b[14][i] - step1b[17][i];
    LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1

    cmp r5, #0
    bgt idct32_bands_end_2nd_pass

idct32_bands_end_1st_pass
    STORE_IN_OUTPUT 17, 16, 17, q6, q7
    STORE_IN_OUTPUT 17, 14, 15, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
    LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 31, 30, 31, q6, q7
    STORE_IN_OUTPUT 31, 0, 1, q4, q5
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[2] = step1b[2][i] + step1b[13][i];
    ;step1[3] = step1b[3][i] + step1b[12][i];
    ;step1[12] = step1b[3][i] - step1b[12][i];
    ;step1[13] = step1b[2][i] - step1b[13][i];
    LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
    vadd.s16 q2, q10, q1
    vadd.s16 q3, q11, q0
    vsub.s16 q4, q11, q0
    vsub.s16 q5, q10, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_IN_OUTPUT 19, 18, 19, q6, q7
    STORE_IN_OUTPUT 19, 12, 13, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
    LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 29, 28, 29, q6, q7
    STORE_IN_OUTPUT 29, 2, 3, q4, q5
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[4] = step1b[4][i] + step1b[11][i];
    ;step1[5] = step1b[5][i] + step1b[10][i];
    ;step1[10] = step1b[5][i] - step1b[10][i];
    ;step1[11] = step1b[4][i] - step1b[11][i];
    LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
    vadd.s16 q2, q12, q1
    vadd.s16 q3, q13, q0
    vsub.s16 q4, q13, q0
    vsub.s16 q5, q12, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_IN_OUTPUT 21, 20, 21, q6, q7
    STORE_IN_OUTPUT 21, 10, 11, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
    LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 27, 26, 27, q6, q7
    STORE_IN_OUTPUT 27, 4, 5, q4, q5
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[6] = step1b[6][i] + step1b[9][i];
    ;step1[7] = step1b[7][i] + step1b[8][i];
    ;step1[8] = step1b[7][i] - step1b[8][i];
    ;step1[9] = step1b[6][i] - step1b[9][i];
    LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
    vadd.s16 q2, q14, q1
    vadd.s16 q3, q15, q0
    vsub.s16 q4, q15, q0
    vsub.s16 q5, q14, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_IN_OUTPUT 23, 22, 23, q6, q7
    STORE_IN_OUTPUT 23, 8, 9, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
    LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 25, 24, 25, q6, q7
    STORE_IN_OUTPUT 25, 6, 7, q4, q5

    ; restore r0 by removing the last offset from the last
    ;   operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
    sub r0, r0, #24*8*2
    ; restore r1 by removing the last offset from the last
    ;   operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2
    ; advance by 8 columns => 8*2
    sub r1, r1, #7*32*2 - 8*2
    ; advance by 8 lines (8*32*2)
    ; go back by the two pairs from the loop (32*2)
    add r3, r3, #8*32*2 - 32*2

    ; bands loop processing
    subs r4, r4, #1
    bne idct32_bands_loop

    ; parameters for second pass
    ; the input of pass2 is the result of pass1. we have to remove the offset
    ; of 32 columns induced by the above idct32_bands_loop
    sub r3, r1, #32*2
    ; r1 = pass2[32 * 32]
    add r1, sp, #2048

    ; pass loop processing
    add r5, r5, #1
    b idct32_pass_loop

idct32_bands_end_2nd_pass
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
    LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[2] = step1b[2][i] + step1b[13][i];
    ;step1[3] = step1b[3][i] + step1b[12][i];
    ;step1[12] = step1b[3][i] - step1b[12][i];
    ;step1[13] = step1b[2][i] - step1b[13][i];
    LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
    vadd.s16 q2, q10, q1
    vadd.s16 q3, q11, q0
    vsub.s16 q4, q11, q0
    vsub.s16 q5, q10, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
    LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[4] = step1b[4][i] + step1b[11][i];
    ;step1[5] = step1b[5][i] + step1b[10][i];
    ;step1[10] = step1b[5][i] - step1b[10][i];
    ;step1[11] = step1b[4][i] - step1b[11][i];
    LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
    vadd.s16 q2, q12, q1
    vadd.s16 q3, q13, q0
    vsub.s16 q4, q13, q0
    vsub.s16 q5, q12, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
    LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[6] = step1b[6][i] + step1b[9][i];
    ;step1[7] = step1b[7][i] + step1b[8][i];
    ;step1[8] = step1b[7][i] - step1b[8][i];
    ;step1[9] = step1b[6][i] - step1b[9][i];
    LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
    vadd.s16 q2, q14, q1
    vadd.s16 q3, q15, q0
    vsub.s16 q4, q15, q0
    vsub.s16 q5, q14, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS_LAST
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
    LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS_LAST
    ; --------------------------------------------------------------------------
    ; restore pointers to their initial indices for next band pass by
    ; removing/adding dest_stride * 8. The actual increment by eight
    ; is taken care of within the _LAST macros.
    add r6, r6, r2, lsl #3
    add r9, r9, r2, lsl #3
    sub r7, r7, r2, lsl #3
    sub r10, r10, r2, lsl #3

    ; restore r0 by removing the last offset from the last
    ;   operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
    sub r0, r0, #24*8*2
    ; restore r1 by removing the last offset from the last
    ;   operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
    ; advance by 8 columns => 8*2
    sub r1, r1, #25*32*2 - 8*2
    ; advance by 8 lines (8*32*2)
    ; go back by the two pairs from the loop (32*2)
    add r3, r3, #8*32*2 - 32*2

    ; bands loop processing
    subs r4, r4, #1
    bne idct32_bands_loop

    ; stack operation
    add sp, sp, #512+2048+2048
    vpop {d8-d15}
    pop {r4-r11}
    bx lr
    ENDP  ; |aom_idct32x32_1024_add_neon|
    END