;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vpx_idct4x4_16_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

    AREA     Block, CODE, READONLY ; name this block of code

;-----------------------------------------------------------------------
; void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest,
;                              int dest_stride)
;
; 4x4 inverse DCT of the 16 coefficients at 'input'; the rounded result
; is added to the 4x4 block of pixels at 'dest' and written back with
; unsigned 8-bit saturation.
;
; Syntax: armasm, ARM (A32) state, NEON.
;
; In:
;   r0  int16_t *input       16 coefficients, row-major (only read)
;   r1  uint8_t *dest        top-left pixel of the 4x4 destination block
;   r2  int      dest_stride byte distance between destination rows
;
; Out:
;   The four rows at dest, dest + stride, dest + 2 * stride and
;   dest + 3 * stride are updated in place (4 bytes each).
;
; Clobbers:
;   r0, r3, r12  scalar cosine constants
;   r1           walked down the rows for the loads and back up for the
;                stores; ends at the original 'dest'
;   r2           negated before the stores
;   q1, q8-q15   NEON scratch (d8-d15 are not touched)
;
; Leaf function: no stack usage, returns with 'bx lr'.
;-----------------------------------------------------------------------

|vpx_idct4x4_16_add_neon| PROC

    ; The 2D transform is done with two passes which are actually pretty
    ; similar. We first transform the rows. This is done by transposing
    ; the inputs, doing a SIMD column transform (the columns are the
    ; transposed rows) and then transposing the results (so that they go
    ; back to normal/row positions). Then, we transform the columns by
    ; doing another SIMD column transform.
    ; So, two passes of a transpose followed by a column transform.

    ; load the inputs into q8-q9 (d16-d19), one row per d register.
    ; No address writeback: r0 is reused for a constant right below.
    vld1.s16        {q8,q9}, [r0]

    ; generate scalar constants; each is built as high byte + low byte
    ; cospi_8_64 = 15137 = 0x3b21
    mov             r0, #0x3b00
    add             r0, #0x21
    ; cospi_16_64 = 11585 = 0x2d41
    mov             r3, #0x2d00
    add             r3, #0x41
    ; cospi_24_64 = 6270 = 0x187e
    mov             r12, #0x1800
    add             r12, #0x7e

    ; transpose the input data
    ; 00 01 02 03   d16
    ; 10 11 12 13   d17
    ; 20 21 22 23   d18
    ; 30 31 32 33   d19
    vtrn.16         d16, d17
    vtrn.16         d18, d19

    ; generate constant vectors (interleaved with the transpose)
    vdup.16         d20, r0         ; replicate cospi_8_64
    vdup.16         d21, r3         ; replicate cospi_16_64

    ; 00 10 02 12   d16
    ; 01 11 03 13   d17
    ; 20 30 22 32   d18
    ; 21 31 23 33   d19
    vtrn.32         q8, q9
    ; 00 10 20 30   d16
    ; 01 11 21 31   d17
    ; 02 12 22 32   d18
    ; 03 13 23 33   d19

    vdup.16         d22, r12        ; replicate cospi_24_64

    ; do the transform on transposed rows
    ; (input[k] below is d16 + k, i.e. d16, d17, d18, d19)

    ; stage 1
    vadd.s16        d23, d16, d18   ; (input[0] + input[2])
    vsub.s16        d24, d16, d18   ; (input[0] - input[2])

    vmull.s16       q15, d17, d22   ; input[1] * cospi_24_64
    vmull.s16       q1,  d17, d20   ; input[1] * cospi_8_64

    ; (input[0] + input[2]) * cospi_16_64;
    ; (input[0] - input[2]) * cospi_16_64;
    vmull.s16       q13, d23, d21
    vmull.s16       q14, d24, d21

    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
    vmlsl.s16       q15, d19, d20
    vmlal.s16       q1,  d19, d22

    ; dct_const_round_shift: rounding shift right by 14 with a saturating
    ; narrow to 16 bits, laid out for the q-wide butterflies below:
    ;   q13 = {d26, d27} = {step[0], step[1]}
    ;   q14 = {d28, d29} = {step[3], step[2]}
    vqrshrn.s32     d26, q13, #14
    vqrshrn.s32     d27, q14, #14
    vqrshrn.s32     d29, q15, #14
    vqrshrn.s32     d28, q1,  #14

    ; stage 2
    ; output[0] = step[0] + step[3];
    ; output[1] = step[1] + step[2];
    ; output[3] = step[0] - step[3];
    ; output[2] = step[1] - step[2];
    vadd.s16        q8,  q13, q14   ; d16 = output[0], d17 = output[1]
    vsub.s16        q9,  q13, q14   ; d18 = output[3], d19 = output[2]
    vswp            d18, d19        ; d18 = output[2], d19 = output[3]

    ; transpose the results
    ; 00 01 02 03   d16
    ; 10 11 12 13   d17
    ; 20 21 22 23   d18
    ; 30 31 32 33   d19
    vtrn.16         d16, d17
    vtrn.16         d18, d19
    ; 00 10 02 12   d16
    ; 01 11 03 13   d17
    ; 20 30 22 32   d18
    ; 21 31 23 33   d19
    vtrn.32         q8, q9
    ; 00 10 20 30   d16
    ; 01 11 21 31   d17
    ; 02 12 22 32   d18
    ; 03 13 23 33   d19

    ; do the transform on columns
    ; (same arithmetic as the first pass, constants still in d20-d22)

    ; stage 1
    vadd.s16        d23, d16, d18   ; (input[0] + input[2])
    vsub.s16        d24, d16, d18   ; (input[0] - input[2])

    vmull.s16       q15, d17, d22   ; input[1] * cospi_24_64
    vmull.s16       q1,  d17, d20   ; input[1] * cospi_8_64

    ; (input[0] + input[2]) * cospi_16_64;
    ; (input[0] - input[2]) * cospi_16_64;
    vmull.s16       q13, d23, d21
    vmull.s16       q14, d24, d21

    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
    vmlsl.s16       q15, d19, d20
    vmlal.s16       q1,  d19, d22

    ; dct_const_round_shift (same register layout as in the first pass)
    vqrshrn.s32     d26, q13, #14
    vqrshrn.s32     d27, q14, #14
    vqrshrn.s32     d29, q15, #14
    vqrshrn.s32     d28, q1,  #14

    ; stage 2
    ; output[0] = step[0] + step[3];
    ; output[1] = step[1] + step[2];
    ; output[3] = step[0] - step[3];
    ; output[2] = step[1] - step[2];
    vadd.s16        q8,  q13, q14
    vsub.s16        q9,  q13, q14

    ; The results are in two registers, one of them being swapped. This will
    ; be taken care of by loading the 'dest' value in a swapped fashion and
    ; also storing them in the same swapped fashion.
    ; temp_out[0, 1] = d16, d17 = q8
    ; temp_out[2, 3] = d19, d18 = q9 swapped

    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
    vrshr.s16       q8, q8, #4
    vrshr.s16       q9, q9, #4

    ; load the four destination rows, 4 pixels (32 bits) each; rows 2 and 3
    ; go into d27 in swapped lane order to match q9
    vld1.32         {d26[0]}, [r1], r2  ; row 0
    vld1.32         {d26[1]}, [r1], r2  ; row 1
    vld1.32         {d27[1]}, [r1], r2  ; row 2
    vld1.32         {d27[0]}, [r1]      ; row 3, no post-increment

    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
    vaddw.u8        q8, q8, d26
    vaddw.u8        q9, q9, d27

    ; clip_pixel: saturating narrow of the signed 16-bit sums to unsigned 8
    vqmovun.s16     d26, q8
    vqmovun.s16     d27, q9

    ; do the stores in reverse order with negative post-increment, by changing
    ; the sign of the stride (r1 still points at row 3 from the loads above)
    rsb             r2, r2, #0
    vst1.32         {d27[0]}, [r1], r2  ; row 3
    vst1.32         {d27[1]}, [r1], r2  ; row 2
    vst1.32         {d26[1]}, [r1], r2  ; row 1
    vst1.32         {d26[0]}, [r1]      ; row 0, no post-increment
    bx              lr
    ENDP  ; |vpx_idct4x4_16_add_neon|

    END