;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

;TODO(cd): adjust these constants to be able to use vqdmulh for faster
; dct_const_round_shift(a * b) within butterfly calculations.
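; For reference, dct_const_round_shift() in the C reference code is a
; round-and-shift by DCT_CONST_BITS = 14 fractional bits, which the
; vqrshrn.s32 #14 instructions below implement directly. A C sketch:
;   int16_t dct_const_round_shift(int32_t input) {
;     return (int16_t)((input + (1 << 13)) >> 14);
;   }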
cospi_1_64  EQU 16364
cospi_2_64  EQU 16305
cospi_3_64  EQU 16207
cospi_4_64  EQU 16069
cospi_5_64  EQU 15893
cospi_6_64  EQU 15679
cospi_7_64  EQU 15426
cospi_8_64  EQU 15137
cospi_9_64  EQU 14811
cospi_10_64 EQU 14449
cospi_11_64 EQU 14053
cospi_12_64 EQU 13623
cospi_13_64 EQU 13160
cospi_14_64 EQU 12665
cospi_15_64 EQU 12140
cospi_16_64 EQU 11585
cospi_17_64 EQU 11003
cospi_18_64 EQU 10394
cospi_19_64 EQU  9760
cospi_20_64 EQU  9102
cospi_21_64 EQU  8423
cospi_22_64 EQU  7723
cospi_23_64 EQU  7005
cospi_24_64 EQU  6270
cospi_25_64 EQU  5520
cospi_26_64 EQU  4756
cospi_27_64 EQU  3981
cospi_28_64 EQU  3196
cospi_29_64 EQU  2404
cospi_30_64 EQU  1606
cospi_31_64 EQU   804
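
; These constants are the DCT cosines scaled to 14 fractional bits, i.e.
; (assuming the usual DCT constant definition):
;   cospi_N_64 = ROUND(cos(N * PI / 64) * (1 << 14))
; e.g. cospi_16_64 = ROUND(0.70710678 * 16384) = 11585.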


    EXPORT  |aom_idct32x32_1024_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

    AREA Block, CODE, READONLY

; --------------------------------------------------------------------------
; Load from transposed_buffer
;   q14 = transposed_buffer[first_offset]
;   q13 = transposed_buffer[second_offset]
; for proper address calculation, the last offset used when manipulating
; transposed_buffer must be passed in. use 0 for first use.
    MACRO
    LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
    ; address calculation with proper stride and loading
    add r0, #($first_offset - $prev_offset )*8*2
    vld1.s16 {q14}, [r0]
    add r0, #($second_offset - $first_offset)*8*2
    vld1.s16 {q13}, [r0]
    ; (used) two registers (q14, q13)
    MEND
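; example (an illustrative trace, not extra code): LOAD_FROM_TRANSPOSED 0, 1, 31
; first advances r0 by (1 - 0)*8*2 = 16 bytes (one row of eight int16_t)
; before loading q14, then by (31 - 1)*8*2 = 480 bytes before loading q13.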
; --------------------------------------------------------------------------
; Load from output (used as temporary storage)
;   reg1 = output[first_offset]
;   reg2 = output[second_offset]
; for proper address calculation, the last offset used when manipulating
; output (whether reading or storing) must be passed in. use 0 for first
; use.
    MACRO
    LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
    ; address calculation with proper stride and loading
    add r1, #($first_offset - $prev_offset )*32*2
    vld1.s16 {$reg1}, [r1]
    add r1, #($second_offset - $first_offset)*32*2
    vld1.s16 {$reg2}, [r1]
    ; (used) two registers ($reg1, $reg2)
    MEND
; --------------------------------------------------------------------------
; Store into output (sometimes as temporary storage)
;   output[first_offset] = reg1
;   output[second_offset] = reg2
; for proper address calculation, the last offset used when manipulating
; output (whether reading or storing) must be passed in. use 0 for first
; use.
    MACRO
    STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
    ; address calculation with proper stride and storing
    add r1, #($first_offset - $prev_offset )*32*2
    vst1.16 {$reg1}, [r1]
    add r1, #($second_offset - $first_offset)*32*2
    vst1.16 {$reg2}, [r1]
    MEND
; --------------------------------------------------------------------------
; Combine-add results with current destination content
;   q6-q9 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_CENTER_RESULTS
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d8}, [r10], r2
    vld1.s16 {d11}, [r9], r11
    vld1.s16 {d9}, [r10]
    vld1.s16 {d10}, [r9]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q7, q7, #6
    vrshr.s16 q8, q8, #6
    vrshr.s16 q9, q9, #6
    vrshr.s16 q6, q6, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q7, q7, d9
    vaddw.u8 q8, q8, d10
    vaddw.u8 q9, q9, d11
    vaddw.u8 q6, q6, d8
    ; clip pixel
    vqmovun.s16 d9, q7
    vqmovun.s16 d10, q8
    vqmovun.s16 d11, q9
    vqmovun.s16 d8, q6
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d9}, [r10], r11
    vst1.16 {d10}, [r9], r2
    vst1.16 {d8}, [r10]
    vst1.16 {d11}, [r9]
    ; update pointers (by dest_stride * 2)
    sub r9, r9, r2, lsl #1
    add r10, r10, r2, lsl #1
    MEND
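; For reference, the ROUND_POWER_OF_TWO step above matches the C macro
; (a sketch, with n = 6 as performed by vrshr.s16 #6):
;   #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))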
; --------------------------------------------------------------------------
; Combine-add results with current destination content
;   q6-q9 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_CENTER_RESULTS_LAST
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d8}, [r10], r2
    vld1.s16 {d11}, [r9], r11
    vld1.s16 {d9}, [r10]
    vld1.s16 {d10}, [r9]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q7, q7, #6
    vrshr.s16 q8, q8, #6
    vrshr.s16 q9, q9, #6
    vrshr.s16 q6, q6, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q7, q7, d9
    vaddw.u8 q8, q8, d10
    vaddw.u8 q9, q9, d11
    vaddw.u8 q6, q6, d8
    ; clip pixel
    vqmovun.s16 d9, q7
    vqmovun.s16 d10, q8
    vqmovun.s16 d11, q9
    vqmovun.s16 d8, q6
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d9}, [r10], r11
    vst1.16 {d10}, [r9], r2
    vst1.16 {d8}, [r10]!
    vst1.16 {d11}, [r9]!
    ; update pointers (by dest_stride * 2)
    sub r9, r9, r2, lsl #1
    add r10, r10, r2, lsl #1
    MEND
; --------------------------------------------------------------------------
; Combine-add results with current destination content
;   q4-q7 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_EXTREME_RESULTS
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d4}, [r7], r2
    vld1.s16 {d7}, [r6], r11
    vld1.s16 {d5}, [r7]
    vld1.s16 {d6}, [r6]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q5, q5, #6
    vrshr.s16 q6, q6, #6
    vrshr.s16 q7, q7, #6
    vrshr.s16 q4, q4, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q5, q5, d5
    vaddw.u8 q6, q6, d6
    vaddw.u8 q7, q7, d7
    vaddw.u8 q4, q4, d4
    ; clip pixel
    vqmovun.s16 d5, q5
    vqmovun.s16 d6, q6
    vqmovun.s16 d7, q7
    vqmovun.s16 d4, q4
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d5}, [r7], r11
    vst1.16 {d6}, [r6], r2
    vst1.16 {d7}, [r6]
    vst1.16 {d4}, [r7]
    ; update pointers (by dest_stride * 2)
    sub r6, r6, r2, lsl #1
    add r7, r7, r2, lsl #1
    MEND
; --------------------------------------------------------------------------
; Combine-add results with current destination content
;   q4-q7 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_EXTREME_RESULTS_LAST
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d4}, [r7], r2
    vld1.s16 {d7}, [r6], r11
    vld1.s16 {d5}, [r7]
    vld1.s16 {d6}, [r6]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q5, q5, #6
    vrshr.s16 q6, q6, #6
    vrshr.s16 q7, q7, #6
    vrshr.s16 q4, q4, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q5, q5, d5
    vaddw.u8 q6, q6, d6
    vaddw.u8 q7, q7, d7
    vaddw.u8 q4, q4, d4
    ; clip pixel
    vqmovun.s16 d5, q5
    vqmovun.s16 d6, q6
    vqmovun.s16 d7, q7
    vqmovun.s16 d4, q4
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d5}, [r7], r11
    vst1.16 {d6}, [r6], r2
    vst1.16 {d7}, [r6]!
    vst1.16 {d4}, [r7]!
    ; update pointers (by dest_stride * 2)
    sub r6, r6, r2, lsl #1
    add r7, r7, r2, lsl #1
    MEND
; --------------------------------------------------------------------------
; Touches q8-q12, q15 (q13-q14 are preserved)
; valid output registers are anything but q8-q11
    MACRO
    DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    ; TODO(cd): have special case to re-use constants when they are similar for
    ; consecutive butterflies
    ; TODO(cd): have special case when both constants are the same, do the
    ; additions/subtractions before the multiplies.
    ; generate the constants
    ; generate scalar constants
    mov r8, #$first_constant & 0xFF00
    mov r12, #$second_constant & 0xFF00
    add r8, #$first_constant & 0x00FF
    add r12, #$second_constant & 0x00FF
    ; generate vector constants
    vdup.16 d30, r8
    vdup.16 d31, r12
    ; (used) two for inputs (regA-regD), one for constants (q15)
    ; do some multiplications (ordered for maximum latency hiding)
    vmull.s16 q8, $regC, d30
    vmull.s16 q10, $regA, d31
    vmull.s16 q9, $regD, d30
    vmull.s16 q11, $regB, d31
    vmull.s16 q12, $regC, d31
    ; (used) five for intermediate (q8-q12), one for constants (q15)
    ; do some additions/subtractions (to get back to two registers)
    vsub.s32 q8, q8, q10
    vsub.s32 q9, q9, q11
    ; do more multiplications (ordered for maximum latency hiding)
    vmull.s16 q10, $regD, d31
    vmull.s16 q11, $regA, d30
    vmull.s16 q15, $regB, d30
    ; (used) six for intermediate (q8-q12, q15)
    ; do more additions/subtractions
    vadd.s32 q11, q12, q11
    vadd.s32 q10, q10, q15
    ; (used) four for intermediate (q8-q11)
    ; dct_const_round_shift
    vqrshrn.s32 $reg1, q8, #14
    vqrshrn.s32 $reg2, q9, #14
    vqrshrn.s32 $reg3, q11, #14
    vqrshrn.s32 $reg4, q10, #14
    ; (used) two q registers for results (i.e., four d registers)
    MEND
; --------------------------------------------------------------------------
; Touches q8-q12, q15 (q13-q14 are preserved)
; valid output registers are anything but q8-q11
    MACRO
    DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    MEND
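; In C terms, one DO_BUTTERFLY invocation computes (a sketch; regC:regD and
; regA:regB are the d-register halves of the two input vectors, c1/c2 the
; two constants):
;   reg1:reg2 = dct_const_round_shift(regCD * c1 - regAB * c2);
;   reg3:reg4 = dct_const_round_shift(regAB * c1 + regCD * c2);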
; --------------------------------------------------------------------------

;void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
;
; r0  int16_t *input,
; r1  uint8_t *dest,
; r2  int dest_stride)
; loop counters
; r4  bands loop counter
; r5  pass loop counter
; r8  transpose loop counter
; combine-add pointers
; r6  dest + 31 * dest_stride, descending (30, 29, 28, ...)
; r7  dest +  0 * dest_stride, ascending (1, 2, 3, ...)
; r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...)

|aom_idct32x32_1024_add_neon| PROC
    ; This function does one pass of idct32x32 transform.
    ;
    ; This is done by transposing the input and then doing a 1d transform on
    ; columns. In the first pass, the transposed columns are the original
    ; rows. In the second pass, after the transposition, the columns are the
    ; original columns.
    ; The 1d transform is done by looping over bands of eight columns (the
    ; idct32_bands loop). For each band, the transform input transposition
    ; is done on demand, one band of four 8x8 matrices at a time. The four
    ; matrices are transposed by pairs (the idct32_transpose_pair loop).
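    ; In rough C pseudocode, the control flow below is (a sketch, not the
    ; exact code):
    ;   for (pass = 0; pass < 2; pass++)        // idct32_pass_loop
    ;     for (band = 0; band < 4; band++) {    // idct32_bands_loop
    ;       for (pair = 0; pair < 2; pair++)    // idct32_transpose_pair_loop
    ;         transpose two 8x8 blocks of the band into transpose_buffer;
    ;       1d idct32 on the band; pass 0 stores to pass1[],
    ;       pass 1 combine-adds into dest;
    ;     }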
    push {r4-r11}
    vpush {d8-d15}
    ; stack operation
    ; internal buffer used to transpose 8 lines into before transforming them
    ; int16_t transpose_buffer[32 * 8];
    ; at sp + [4096, 4607]
    ; results of the first pass (transpose and transform rows)
    ; int16_t pass1[32 * 32];
    ; at sp + [0, 2047]
    ; results of the second pass (transpose and transform columns)
    ; int16_t pass2[32 * 32];
    ; at sp + [2048, 4095]
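    ; Equivalently, in C (a sketch of this frame layout):
    ;   int16_t pass1[32 * 32];            // at sp + 0
    ;   int16_t pass2[32 * 32];            // at sp + 2048
    ;   int16_t transpose_buffer[32 * 8];  // at sp + 4096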
    sub sp, sp, #512+2048+2048

    ; r6  = dest + 31 * dest_stride
    ; r7  = dest +  0 * dest_stride
    ; r9  = dest + 15 * dest_stride
    ; r10 = dest + 16 * dest_stride
    rsb r6, r2, r2, lsl #5
    rsb r9, r2, r2, lsl #4
    add r10, r1, r2, lsl #4
    mov r7, r1
    add r6, r6, r1
    add r9, r9, r1
    ; r11 = -dest_stride
    neg r11, r2
    ; r3 = input
    mov r3, r0
    ; parameters for first pass
    ; r0 = transpose_buffer[32 * 8]
    add r0, sp, #4096
    ; r1 = pass1[32 * 32]
    mov r1, sp

    mov r5, #0          ; initialize pass loop counter
idct32_pass_loop
    mov r4, #4          ; initialize bands loop counter
idct32_bands_loop
    mov r8, #2          ; initialize transpose loop counter
idct32_transpose_pair_loop
    ; Load two horizontally consecutive 8x8 16bit data matrices. The first one
    ; into q0-q7 and the second one into q8-q15. There is a stride of 64,
    ; adjusted to 32 because of the two post-increments.
    vld1.s16 {q8}, [r3]!
    vld1.s16 {q0}, [r3]!
    add r3, #32
    vld1.s16 {q9}, [r3]!
    vld1.s16 {q1}, [r3]!
    add r3, #32
    vld1.s16 {q10}, [r3]!
    vld1.s16 {q2}, [r3]!
    add r3, #32
    vld1.s16 {q11}, [r3]!
    vld1.s16 {q3}, [r3]!
    add r3, #32
    vld1.s16 {q12}, [r3]!
    vld1.s16 {q4}, [r3]!
    add r3, #32
    vld1.s16 {q13}, [r3]!
    vld1.s16 {q5}, [r3]!
    add r3, #32
    vld1.s16 {q14}, [r3]!
    vld1.s16 {q6}, [r3]!
    add r3, #32
    vld1.s16 {q15}, [r3]!
    vld1.s16 {q7}, [r3]!

    ; Transpose the two 8x8 16bit data matrices.
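    ; The transpose works at three granularities: vswp exchanges the
    ; off-diagonal 4x4 sub-blocks (64-bit d registers), vtrn.32 then
    ; transposes 2x2 blocks of 32-bit pairs, and vtrn.16 finally swaps
    ; the remaining neighboring 16-bit elements.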
    vswp d17, d24
    vswp d23, d30
    vswp d21, d28
    vswp d19, d26
    vswp d1, d8
    vswp d7, d14
    vswp d5, d12
    vswp d3, d10
    vtrn.32 q8, q10
    vtrn.32 q9, q11
    vtrn.32 q12, q14
    vtrn.32 q13, q15
    vtrn.32 q0, q2
    vtrn.32 q1, q3
    vtrn.32 q4, q6
    vtrn.32 q5, q7
    vtrn.16 q8, q9
    vtrn.16 q10, q11
    vtrn.16 q12, q13
    vtrn.16 q14, q15
    vtrn.16 q0, q1
    vtrn.16 q2, q3
    vtrn.16 q4, q5
    vtrn.16 q6, q7

    ; Store both matrices after each other. There is a stride of 32, which
    ; adjusts to nothing because of the post-increments.
    vst1.16 {q8}, [r0]!
    vst1.16 {q9}, [r0]!
    vst1.16 {q10}, [r0]!
    vst1.16 {q11}, [r0]!
    vst1.16 {q12}, [r0]!
    vst1.16 {q13}, [r0]!
    vst1.16 {q14}, [r0]!
    vst1.16 {q15}, [r0]!
    vst1.16 {q0}, [r0]!
    vst1.16 {q1}, [r0]!
    vst1.16 {q2}, [r0]!
    vst1.16 {q3}, [r0]!
    vst1.16 {q4}, [r0]!
    vst1.16 {q5}, [r0]!
    vst1.16 {q6}, [r0]!
    vst1.16 {q7}, [r0]!

    ; increment pointers by adjusted stride (not necessary for r0/out)
    ; go back by 7*32*2 for the seven lines advanced fully by the reads and adds
    ; go back by 32 for the eighth line, which was only read
    ; advance by 16*2 to go to the next pair
    sub r3, r3, #7*32*2 + 32 - 16*2
    ; transpose pair loop processing
    subs r8, r8, #1
    bne idct32_transpose_pair_loop

    ; restore r0/input to its original value
    sub r0, r0, #32*8*2

    ; Instead of doing the transforms stage by stage, it is done by loading
    ; some input values and doing as many stages as possible to minimize the
    ; storing/loading of intermediate results. To fit within registers, the
    ; final coefficients are cut into four blocks:
    ; BLOCK A: 16-19,28-31
    ; BLOCK B: 20-23,24-27
    ; BLOCK C: 8-10,11-15
    ; BLOCK D: 0-3,4-7
    ; Blocks A and C are straight calculation through the various stages. In
    ; block B, further calculations are performed using the results from
    ; block A. In block D, further calculations are performed using the results
    ; from block C, and then the final calculations are done using the results
    ; from blocks A and B, which have been combined at the end of block B.
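    ; Schematically (a sketch of the data flow between blocks):
    ;   A -> B   (block B consumes block A's results)
    ;   C -> D   (block D consumes block C's results, then the combined
    ;             results of A and B)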

    ; --------------------------------------------------------------------------
    ; BLOCK A: 16-19,28-31
    ; --------------------------------------------------------------------------
    ; generate 16,17,30,31
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64;
    ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64;
    ;step1b[16][i] = dct_const_round_shift(temp1);
    ;step1b[31][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 0, 1, 31
    DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
    ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
    ;step1b[17][i] = dct_const_round_shift(temp1);
    ;step1b[30][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 31, 17, 15
    DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[16] = step1b[16][i] + step1b[17][i];
    ;step2[17] = step1b[16][i] - step1b[17][i];
    ;step2[30] = -step1b[30][i] + step1b[31][i];
    ;step2[31] = step1b[30][i] + step1b[31][i];
    vadd.s16 q4, q0, q1
    vsub.s16 q13, q0, q1
    vadd.s16 q6, q2, q3
    vsub.s16 q14, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
    ;temp2 = step1b[30][i] * cospi_4_64 + step1b[17][i] * cospi_28_64;
    ;step3[17] = dct_const_round_shift(temp1);
    ;step3[30] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; generate 18,19,28,29
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
    ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64;
    ;step1b[18][i] = dct_const_round_shift(temp1);
    ;step1b[29][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 15, 9, 23
    DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64;
    ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
    ;step1b[19][i] = dct_const_round_shift(temp1);
    ;step1b[28][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 23, 25, 7
    DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[18] = -step1b[18][i] + step1b[19][i];
    ;step2[19] = step1b[18][i] + step1b[19][i];
    ;step2[28] = step1b[28][i] + step1b[29][i];
    ;step2[29] = step1b[28][i] - step1b[29][i];
    vsub.s16 q13, q3, q2
    vadd.s16 q3, q3, q2
    vsub.s16 q14, q1, q0
    vadd.s16 q2, q1, q0
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64);
    ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
    ;step3[29] = dct_const_round_shift(temp1);
    ;step3[18] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
    ; --------------------------------------------------------------------------
    ; combine 16-19,28-31
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[16] = step1b[16][i] + step1b[19][i];
    ;step1[17] = step1b[17][i] + step1b[18][i];
    ;step1[18] = step1b[17][i] - step1b[18][i];
    ;step1[29] = step1b[30][i] - step1b[29][i];
    ;step1[30] = step1b[30][i] + step1b[29][i];
    ;step1[31] = step1b[31][i] + step1b[28][i];
    vadd.s16 q8, q4, q2
    vadd.s16 q9, q5, q0
    vadd.s16 q10, q7, q1
    vadd.s16 q15, q6, q3
    vsub.s16 q13, q5, q0
    vsub.s16 q14, q7, q1
    STORE_IN_OUTPUT 0, 16, 31, q8, q15
    STORE_IN_OUTPUT 31, 17, 30, q9, q10
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
    ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64;
    ;step2[18] = dct_const_round_shift(temp1);
    ;step2[29] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
    STORE_IN_OUTPUT 30, 29, 18, q1, q0
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[19] = step1b[16][i] - step1b[19][i];
    ;step1[28] = step1b[31][i] - step1b[28][i];
    vsub.s16 q13, q4, q2
    vsub.s16 q14, q6, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
    ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64;
    ;step2[19] = dct_const_round_shift(temp1);
    ;step2[28] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
    STORE_IN_OUTPUT 18, 19, 28, q4, q6
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK B: 20-23,24-27
    ; --------------------------------------------------------------------------
    ; generate 20,21,26,27
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
    ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64;
    ;step1b[20][i] = dct_const_round_shift(temp1);
    ;step1b[27][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 7, 5, 27
    DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
    ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
    ;step1b[21][i] = dct_const_round_shift(temp1);
    ;step1b[26][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 27, 21, 11
    DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[20] = step1b[20][i] + step1b[21][i];
    ;step2[21] = step1b[20][i] - step1b[21][i];
    ;step2[26] = -step1b[26][i] + step1b[27][i];
    ;step2[27] = step1b[26][i] + step1b[27][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
    ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
    ;step3[21] = dct_const_round_shift(temp1);
    ;step3[26] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 22,23,24,25
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
    ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
    ;step1b[22][i] = dct_const_round_shift(temp1);
    ;step1b[25][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 11, 13, 19
    DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64;
    ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
    ;step1b[23][i] = dct_const_round_shift(temp1);
    ;step1b[24][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 19, 29, 3
    DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[22] = -step1b[22][i] + step1b[23][i];
    ;step2[23] = step1b[22][i] + step1b[23][i];
    ;step2[24] = step1b[24][i] + step1b[25][i];
    ;step2[25] = step1b[24][i] - step1b[25][i];
    vsub.s16 q14, q4, q5
    vadd.s16 q5, q4, q5
    vsub.s16 q13, q6, q7
    vadd.s16 q6, q6, q7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
    ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
    ;step3[25] = dct_const_round_shift(temp1);
    ;step3[22] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
    ; --------------------------------------------------------------------------
    ; combine 20-23,24-27
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[22] = step1b[22][i] + step1b[21][i];
    ;step1[23] = step1b[23][i] + step1b[20][i];
    vadd.s16 q10, q7, q1
    vadd.s16 q11, q5, q0
    ;step1[24] = step1b[24][i] + step1b[27][i];
    ;step1[25] = step1b[25][i] + step1b[26][i];
    vadd.s16 q12, q6, q2
    vadd.s16 q15, q4, q3
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[16] = step1b[16][i] + step1b[23][i];
    ;step3[17] = step1b[17][i] + step1b[22][i];
    ;step3[22] = step1b[17][i] - step1b[22][i];
    ;step3[23] = step1b[16][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
    vadd.s16 q8, q14, q11
    vadd.s16 q9, q13, q10
    vsub.s16 q13, q13, q10
    vsub.s16 q11, q14, q11
    STORE_IN_OUTPUT 17, 17, 16, q9, q8
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[24] = step1b[31][i] - step1b[24][i];
    ;step3[25] = step1b[30][i] - step1b[25][i];
    ;step3[30] = step1b[30][i] + step1b[25][i];
    ;step3[31] = step1b[31][i] + step1b[24][i];
    LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
    vsub.s16 q8, q9, q12
    vadd.s16 q10, q14, q15
    vsub.s16 q14, q14, q15
    vadd.s16 q12, q9, q12
    STORE_IN_OUTPUT 31, 30, 31, q10, q12
    ; --------------------------------------------------------------------------
    ; TODO(cd) do some register allocation change to remove these push/pop
    vpush {q8}  ; [24]
    vpush {q11} ; [23]
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
    ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
    ;step1[22] = dct_const_round_shift(temp1);
    ;step1[25] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 31, 25, 22, q14, q13
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
    ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
    ;step1[23] = dct_const_round_shift(temp1);
    ;step1[24] = dct_const_round_shift(temp2);
    ; TODO(cd) do some register allocation change to remove these push/pop
    vpop {q13} ; [23]
    vpop {q14} ; [24]
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 22, 24, 23, q14, q13
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[20] = step1b[23][i] - step1b[20][i];
    ;step1[27] = step1b[24][i] - step1b[27][i];
    vsub.s16 q14, q5, q0
    vsub.s16 q13, q6, q2
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64);
    ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
    ;step2[27] = dct_const_round_shift(temp1);
    ;step2[20] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[21] = step1b[22][i] - step1b[21][i];
    ;step1[26] = step1b[25][i] - step1b[26][i];
    vsub.s16 q14, q7, q1
    vsub.s16 q13, q4, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64);
    ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
    ;step2[26] = dct_const_round_shift(temp1);
    ;step2[21] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[18] = step1b[18][i] + step1b[21][i];
    ;step3[19] = step1b[19][i] + step1b[20][i];
    ;step3[20] = step1b[19][i] - step1b[20][i];
    ;step3[21] = step1b[18][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
    vadd.s16 q8, q14, q1
    vadd.s16 q9, q13, q6
    vsub.s16 q13, q13, q6
    vsub.s16 q1, q14, q1
    STORE_IN_OUTPUT 19, 18, 19, q8, q9
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[27] = step1b[28][i] - step1b[27][i];
    ;step3[28] = step1b[28][i] + step1b[27][i];
    ;step3[29] = step1b[29][i] + step1b[26][i];
    ;step3[26] = step1b[29][i] - step1b[26][i];
    LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
    vsub.s16 q14, q8, q5
    vadd.s16 q10, q8, q5
    vadd.s16 q11, q9, q0
    vsub.s16 q0, q9, q0
    STORE_IN_OUTPUT 29, 28, 29, q10, q11
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
    ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
    ;step1[20] = dct_const_round_shift(temp1);
    ;step1[27] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 29, 20, 27, q13, q14
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
    ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
    ;step1[21] = dct_const_round_shift(temp1);
    ;step1[26] = dct_const_round_shift(temp2);
    DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
    STORE_IN_OUTPUT 27, 21, 26, q1, q0
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK C: 8-10,11-15
    ; --------------------------------------------------------------------------
    ; generate 8,9,14,15
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
    ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
    ;step2[8] = dct_const_round_shift(temp1);
    ;step2[15] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 3, 2, 30
    DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
    ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
    ;step2[9] = dct_const_round_shift(temp1);
    ;step2[14] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 30, 18, 14
    DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;step3[8] = step1b[8][i] + step1b[9][i];
    ;step3[9] = step1b[8][i] - step1b[9][i];
    ;step3[14] = step1b[15][i] - step1b[14][i];
    ;step3[15] = step1b[15][i] + step1b[14][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
    ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64;
    ;step1[9] = dct_const_round_shift(temp1);
    ;step1[14] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 10,11,12,13
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
    ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
    ;step2[10] = dct_const_round_shift(temp1);
    ;step2[13] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 14, 10, 22
    DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
    ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
    ;step2[11] = dct_const_round_shift(temp1);
    ;step2[12] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 22, 26, 6
    DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;step3[10] = step1b[11][i] - step1b[10][i];
    ;step3[11] = step1b[11][i] + step1b[10][i];
    ;step3[12] = step1b[12][i] + step1b[13][i];
    ;step3[13] = step1b[12][i] - step1b[13][i];
    vsub.s16 q14, q4, q5
    vadd.s16 q5, q4, q5
    vsub.s16 q13, q6, q7
    vadd.s16 q6, q6, q7
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64);
    ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
    ;step1[13] = dct_const_round_shift(temp1);
    ;step1[10] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
    ; --------------------------------------------------------------------------
    ; combine 8-10,11-15
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[8] = step1b[8][i] + step1b[11][i];
    ;step2[9] = step1b[9][i] + step1b[10][i];
    ;step2[10] = step1b[9][i] - step1b[10][i];
    vadd.s16 q8, q0, q5
    vadd.s16 q9, q1, q7
    vsub.s16 q13, q1, q7
    ;step2[13] = step1b[14][i] - step1b[13][i];
    ;step2[14] = step1b[14][i] + step1b[13][i];
    ;step2[15] = step1b[15][i] + step1b[12][i];
    vsub.s16 q14, q3, q4
    vadd.s16 q10, q3, q4
    vadd.s16 q15, q2, q6
    STORE_IN_OUTPUT 26, 8, 15, q8, q15
    STORE_IN_OUTPUT 15, 9, 14, q9, q10
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
    ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
    ;step3[10] = dct_const_round_shift(temp1);
    ;step3[13] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    STORE_IN_OUTPUT 14, 13, 10, q3, q1
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[11] = step1b[8][i] - step1b[11][i];
    ;step2[12] = step1b[15][i] - step1b[12][i];
    vsub.s16 q13, q0, q5
    vsub.s16 q14, q2, q6
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
    ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
    ;step3[11] = dct_const_round_shift(temp1);
    ;step3[12] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    STORE_IN_OUTPUT 10, 11, 12, q1, q3
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK D: 0-3,4-7
    ; --------------------------------------------------------------------------
    ; generate 4,5,6,7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
    ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
    ;step3[4] = dct_const_round_shift(temp1);
    ;step3[7] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 6, 4, 28
    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
    ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
    ;step3[5] = dct_const_round_shift(temp1);
    ;step3[6] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 28, 20, 12
    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[4] = step1b[4][i] + step1b[5][i];
    ;step1[5] = step1b[4][i] - step1b[5][i];
    ;step1[6] = step1b[7][i] - step1b[6][i];
    ;step1[7] = step1b[7][i] + step1b[6][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
    ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
    ;step2[5] = dct_const_round_shift(temp1);
    ;step2[6] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 0,1,2,3
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
    ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
    ;step1[1] = dct_const_round_shift(temp1);
    ;step1[0] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 12, 0, 16
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
    ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
    ;step1[2] = dct_const_round_shift(temp1);
    ;step1[3] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 16, 8, 24
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[0] = step1b[0][i] + step1b[3][i];
    ;step2[1] = step1b[1][i] + step1b[2][i];
    ;step2[2] = step1b[1][i] - step1b[2][i];
    ;step2[3] = step1b[0][i] - step1b[3][i];
    vadd.s16 q4, q7, q6
    vsub.s16 q7, q7, q6
    vsub.s16 q6, q5, q14
    vadd.s16 q5, q5, q14
    ; --------------------------------------------------------------------------
    ; combine 0-3,4-7
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[0] = step1b[0][i] + step1b[7][i];
    ;step3[1] = step1b[1][i] + step1b[6][i];
    ;step3[2] = step1b[2][i] + step1b[5][i];
    ;step3[3] = step1b[3][i] + step1b[4][i];
    vadd.s16 q8, q4, q2
    vadd.s16 q9, q5, q3
    vadd.s16 q10, q6, q1
    vadd.s16 q11, q7, q0
    ;step3[4] = step1b[3][i] - step1b[4][i];
    ;step3[5] = step1b[2][i] - step1b[5][i];
    ;step3[6] = step1b[1][i] - step1b[6][i];
    ;step3[7] = step1b[0][i] - step1b[7][i];
    vsub.s16 q12, q7, q0
    vsub.s16 q13, q6, q1
    vsub.s16 q14, q5, q3
    vsub.s16 q15, q4, q2
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[0] = step1b[0][i] + step1b[15][i];
    ;step1[1] = step1b[1][i] + step1b[14][i];
    ;step1[14] = step1b[1][i] - step1b[14][i];
    ;step1[15] = step1b[0][i] - step1b[15][i];
    LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
    vadd.s16 q2, q8, q1
    vadd.s16 q3, q9, q0
    vsub.s16 q4, q9, q0
    vsub.s16 q5, q8, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[14 * 32] = step1b[14][i] + step1b[17][i];
    ;output[15 * 32] = step1b[15][i] + step1b[16][i];
    ;output[16 * 32] = step1b[15][i] - step1b[16][i];
    ;output[17 * 32] = step1b[14][i] - step1b[17][i];
    LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1

| 996 | cmp r5, #0 |
| 997 | bgt idct32_bands_end_2nd_pass |
| 998 | |
| 999 | idct32_bands_end_1st_pass |
| 1000 | STORE_IN_OUTPUT 17, 16, 17, q6, q7 |
| 1001 | STORE_IN_OUTPUT 17, 14, 15, q8, q9 |
Christian Duvivier | 6a50146 | 2013-09-11 15:18:47 -0700 | [diff] [blame] | 1002 | ; -------------------------------------------------------------------------- |
| 1003 | ; part of final stage |
| 1004 | ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; |
| 1005 | ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; |
| 1006 | ;output[30 * 32] = step1b[1][i] - step1b[30][i]; |
| 1007 | ;output[31 * 32] = step1b[0][i] - step1b[31][i]; |
Christian Duvivier | 5b1dc15 | 2013-09-25 18:07:10 -0700 | [diff] [blame] | 1008 | LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 |
Christian Duvivier | 6a50146 | 2013-09-11 15:18:47 -0700 | [diff] [blame] | 1009 | vadd.s16 q4, q2, q1 |
| 1010 | vadd.s16 q5, q3, q0 |
| 1011 | vsub.s16 q6, q3, q0 |
| 1012 | vsub.s16 q7, q2, q1 |
Christian Duvivier | 5b1dc15 | 2013-09-25 18:07:10 -0700 | [diff] [blame] | 1013 | STORE_IN_OUTPUT 31, 30, 31, q6, q7 |
| 1014 | STORE_IN_OUTPUT 31, 0, 1, q4, q5 |
Christian Duvivier | 6a50146 | 2013-09-11 15:18:47 -0700 | [diff] [blame] | 1015 | ; -------------------------------------------------------------------------- |
| 1016 | ; part of stage 7 |
| 1017 | ;step1[2] = step1b[2][i] + step1b[13][i]; |
| 1018 | ;step1[3] = step1b[3][i] + step1b[12][i]; |
| 1019 | ;step1[12] = step1b[3][i] - step1b[12][i]; |
| 1020 | ;step1[13] = step1b[2][i] - step1b[13][i]; |
| 1021 | LOAD_FROM_OUTPUT 1, 12, 13, q0, q1 |
| 1022 | vadd.s16 q2, q10, q1 |
| 1023 | vadd.s16 q3, q11, q0 |
| 1024 | vsub.s16 q4, q11, q0 |
| 1025 | vsub.s16 q5, q10, q1 |
| 1026 | ; -------------------------------------------------------------------------- |
| 1027 | ; part of final stage |
| 1028 | ;output[12 * 32] = step1b[12][i] + step1b[19][i]; |
| 1029 | ;output[13 * 32] = step1b[13][i] + step1b[18][i]; |
| 1030 | ;output[18 * 32] = step1b[13][i] - step1b[18][i]; |
| 1031 | ;output[19 * 32] = step1b[12][i] - step1b[19][i]; |
| 1032 | LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 |
Christian Duvivier | 5b1dc15 | 2013-09-25 18:07:10 -0700 | [diff] [blame] | 1033 | vadd.s16 q8, q4, q1 |
| 1034 | vadd.s16 q9, q5, q0 |
| 1035 | vsub.s16 q6, q5, q0 |
| 1036 | vsub.s16 q7, q4, q1 |
| 1037 | STORE_IN_OUTPUT 19, 18, 19, q6, q7 |
| 1038 | STORE_IN_OUTPUT 19, 12, 13, q8, q9 |
Christian Duvivier | 6a50146 | 2013-09-11 15:18:47 -0700 | [diff] [blame] | 1039 | ; -------------------------------------------------------------------------- |
| 1040 | ; part of final stage |
| 1041 | ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; |
| 1042 | ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; |
| 1043 | ;output[28 * 32] = step1b[3][i] - step1b[28][i]; |
| 1044 | ;output[29 * 32] = step1b[2][i] - step1b[29][i]; |
Christian Duvivier | 5b1dc15 | 2013-09-25 18:07:10 -0700 | [diff] [blame] | 1045 | LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 |
Christian Duvivier | 6a50146 | 2013-09-11 15:18:47 -0700 | [diff] [blame] | 1046 | vadd.s16 q4, q2, q1 |
| 1047 | vadd.s16 q5, q3, q0 |
| 1048 | vsub.s16 q6, q3, q0 |
| 1049 | vsub.s16 q7, q2, q1 |
Christian Duvivier | 5b1dc15 | 2013-09-25 18:07:10 -0700 | [diff] [blame] | 1050 | STORE_IN_OUTPUT 29, 28, 29, q6, q7 |
| 1051 | STORE_IN_OUTPUT 29, 2, 3, q4, q5 |
Christian Duvivier | 6a50146 | 2013-09-11 15:18:47 -0700 | [diff] [blame] | 1052 | ; -------------------------------------------------------------------------- |
| 1053 | ; part of stage 7 |
| 1054 | ;step1[4] = step1b[4][i] + step1b[11][i]; |
| 1055 | ;step1[5] = step1b[5][i] + step1b[10][i]; |
| 1056 | ;step1[10] = step1b[5][i] - step1b[10][i]; |
| 1057 | ;step1[11] = step1b[4][i] - step1b[11][i]; |
| 1058 | LOAD_FROM_OUTPUT 3, 10, 11, q0, q1 |
| 1059 | vadd.s16 q2, q12, q1 |
| 1060 | vadd.s16 q3, q13, q0 |
| 1061 | vsub.s16 q4, q13, q0 |
| 1062 | vsub.s16 q5, q12, q1 |
| 1063 | ; -------------------------------------------------------------------------- |
| 1064 | ; part of final stage |
| 1065 | ;output[10 * 32] = step1b[10][i] + step1b[21][i]; |
| 1066 | ;output[11 * 32] = step1b[11][i] + step1b[20][i]; |
| 1067 | ;output[20 * 32] = step1b[11][i] - step1b[20][i]; |
| 1068 | ;output[21 * 32] = step1b[10][i] - step1b[21][i]; |
| 1069 | LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 |
Christian Duvivier | 5b1dc15 | 2013-09-25 18:07:10 -0700 | [diff] [blame] | 1070 | vadd.s16 q8, q4, q1 |
| 1071 | vadd.s16 q9, q5, q0 |
| 1072 | vsub.s16 q6, q5, q0 |
| 1073 | vsub.s16 q7, q4, q1 |
| 1074 | STORE_IN_OUTPUT 21, 20, 21, q6, q7 |
| 1075 | STORE_IN_OUTPUT 21, 10, 11, q8, q9 |
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
    LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 27, 26, 27, q6, q7
    STORE_IN_OUTPUT 27, 4, 5, q4, q5
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[6] = step1b[6][i] + step1b[9][i];
    ;step1[7] = step1b[7][i] + step1b[8][i];
    ;step1[8] = step1b[7][i] - step1b[8][i];
    ;step1[9] = step1b[6][i] - step1b[9][i];
    LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
    vadd.s16 q2, q14, q1
    vadd.s16 q3, q15, q0
    vsub.s16 q4, q15, q0
    vsub.s16 q5, q14, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_IN_OUTPUT 23, 22, 23, q6, q7
    STORE_IN_OUTPUT 23, 8, 9, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
    LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 25, 24, 25, q6, q7
    STORE_IN_OUTPUT 25, 6, 7, q4, q5

    ; restore r0 by removing the last offset from the last
    ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
    sub r0, r0, #24*8*2
    ; restore r1 by removing the last offset from the last
    ; operation (STORE_IN_OUTPUT 25, 6, 7) => 7*32*2
    ; advance by 8 columns => 8*2
    sub r1, r1, #7*32*2 - 8*2
    ; advance by 8 lines (8*32*2)
    ; go back by the two pairs from the loop (32*2)
    add r3, r3, #8*32*2 - 32*2
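    ; (worked out in bytes, with 16-bit elements: 24*8*2 = 384,
    ; 7*32*2 - 8*2 = 432, 8*32*2 - 32*2 = 448)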

    ; bands loop processing
    subs r4, r4, #1
    bne idct32_bands_loop

    ; parameters for second pass
    ; the input of pass 2 is the result of pass 1; we have to remove the
    ; offset of 32 columns (32*2 = 64 bytes) induced by idct32_bands_loop
    ; above
    sub r3, r1, #32*2
    ; r1 = pass2[32 * 32]
    add r1, sp, #2048
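    ; 2048 = 32*32*2 bytes, the size of one full 32x32 buffer of 16-bit
    ; coefficients held on the stack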

    ; pass loop processing
    add r5, r5, #1
    b idct32_pass_loop
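    ; r5 is the pass counter; once it is non-zero, the band epilogue is
    ; expected to branch to idct32_bands_end_2nd_pass below rather than
    ; take the first-pass path again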

idct32_bands_end_2nd_pass
    STORE_COMBINE_CENTER_RESULTS
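    ; unlike the raw STORE_IN_OUTPUT writes of the first pass, the
    ; STORE_COMBINE_* macros appear only on this second pass; they are
    ; assumed to round the final values, add them to the destination pixels
    ; and store the clamped result rather than raw intermediates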
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
    LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[2] = step1b[2][i] + step1b[13][i];
    ;step1[3] = step1b[3][i] + step1b[12][i];
    ;step1[12] = step1b[3][i] - step1b[12][i];
    ;step1[13] = step1b[2][i] - step1b[13][i];
    LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
    vadd.s16 q2, q10, q1
    vadd.s16 q3, q11, q0
    vsub.s16 q4, q11, q0
    vsub.s16 q5, q10, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
    LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[4] = step1b[4][i] + step1b[11][i];
    ;step1[5] = step1b[5][i] + step1b[10][i];
    ;step1[10] = step1b[5][i] - step1b[10][i];
    ;step1[11] = step1b[4][i] - step1b[11][i];
    LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
    vadd.s16 q2, q12, q1
    vadd.s16 q3, q13, q0
    vsub.s16 q4, q13, q0
    vsub.s16 q5, q12, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
    LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[6] = step1b[6][i] + step1b[9][i];
    ;step1[7] = step1b[7][i] + step1b[8][i];
    ;step1[8] = step1b[7][i] - step1b[8][i];
    ;step1[9] = step1b[6][i] - step1b[9][i];
    LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
    vadd.s16 q2, q14, q1
    vadd.s16 q3, q15, q0
    vsub.s16 q4, q15, q0
    vsub.s16 q5, q14, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS_LAST
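    ; per the fix-up comment below, the _LAST variant also advances the
    ; destination pointers past the eight columns just written, so only the
    ; row offset needs correcting afterwards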
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
    LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS_LAST
    ; --------------------------------------------------------------------------
    ; restore pointers to their initial indices for the next band pass by
    ; removing/adding dest_stride * 8. the actual increment by eight columns
    ; is taken care of within the _LAST macros.
    add r6, r6, r2, lsl #3
    add r9, r9, r2, lsl #3
    sub r7, r7, r2, lsl #3
    sub r10, r10, r2, lsl #3
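    ; r2 holds dest_stride, so "r2, lsl #3" is dest_stride * 8: r6/r9 move
    ; eight rows forward while r7/r10 move eight rows back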

    ; restore r0 by removing the last offset from the last
    ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
    sub r0, r0, #24*8*2
    ; restore r1 by removing the last offset from the last
    ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
    ; advance by 8 columns => 8*2
    sub r1, r1, #25*32*2 - 8*2
    ; advance by 8 lines (8*32*2)
    ; go back by the two pairs from the loop (32*2)
    add r3, r3, #8*32*2 - 32*2
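    ; (in bytes: 24*8*2 = 384, 25*32*2 - 8*2 = 1584, 8*32*2 - 32*2 = 448)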

    ; bands loop processing
    subs r4, r4, #1
    bne idct32_bands_loop

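    ; epilogue: release the 512+2048+2048 = 4608-byte stack frame (each
    ; 2048-byte region is one 32x32 buffer of 16-bit coefficients, and the
    ; 512 bytes presumably back the 8x32 transpose scratch area), then
    ; restore the callee-saved registers d8-d15 and r4-r11 required by the
    ; AAPCS before returning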
    add sp, sp, #512+2048+2048
    vpop {d8-d15}
    pop {r4-r11}
    bx lr
    ENDP ; |aom_idct32x32_1024_add_neon|
    END