/*
 * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
#define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_

#include <arm_neon.h>

// Swap high and low halves.
static INLINE uint16x8_t transpose64_u16q(const uint16x8_t a) {
  return vextq_u16(a, a, 4);
}

static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                    uint8x8_t *a6, uint8x8_t *a7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 =
      vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
  const uint8x16x2_t b1 =
      vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
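
// Illustrative usage sketch (an assumption for exposition, not part of the
// library API): one possible way to transpose an 8x8 block of bytes in memory
// with transpose_u8_8x8(). The function name and the row-major src/dst layout
// are hypothetical.
static INLINE void example_transpose_u8_8x8_block(const uint8_t *src,
                                                  int src_stride, uint8_t *dst,
                                                  int dst_stride) {
  uint8x8_t rows[8];
  for (int i = 0; i < 8; ++i) {
    rows[i] = vld1_u8(src + i * src_stride);  // Load one 8-byte row.
  }
  transpose_u8_8x8(&rows[0], &rows[1], &rows[2], &rows[3], &rows[4], &rows[5],
                   &rows[6], &rows[7]);
  for (int i = 0; i < 8; ++i) {
    // After the transpose, rows[i] holds column i of the source block.
    vst1_u8(dst + i * dst_stride, rows[i]);
  }
}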

static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}

static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33

  const uint16x4x2_t b0 =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 02 03 22 23
  // c0.val[1]: 10 11 30 31 12 13 32 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b0.val[1]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, const uint8x8_t a4,
                                    const uint8x8_t a5, const uint8x8_t a6,
                                    const uint8x8_t a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // to:
  // b0.val[0]: 00 01 02 03 40 41 42 43
  // b1.val[0]: 10 11 12 13 50 51 52 53
  // b2.val[0]: 20 21 22 23 60 61 62 63
  // b3.val[0]: 30 31 32 33 70 71 72 73

  const uint32x2x2_t b0 =
      vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t b1 =
      vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t b2 =
      vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t b3 =
      vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 40 41 60 61
  // c0.val[1]: 02 03 22 23 42 43 62 63
  // c1.val[0]: 10 11 30 31 50 51 70 71
  // c1.val[1]: 12 13 32 33 52 53 72 73

  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
                                   vreinterpret_u16_u32(b2.val[0]));
  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
                                   vreinterpret_u16_u32(b3.val[0]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 01 11 21 31 41 51 61 71
  // d1.val[0]: 02 12 22 32 42 52 62 72
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
  const uint8x8x2_t d1 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
  *a2 = d1.val[0];
  *a3 = d1.val[1];
}

// Input:
// 00 01 02 03
// 10 11 12 13
// 20 21 22 23
// 30 31 32 33
// Output:
// 00 10 20 30
// 01 11 21 31
// 02 12 22 32
// 03 13 23 33
static INLINE void transpose_u16_4x4(uint16x4_t a[4]) {
  // b:
  // 00 10 02 12
  // 01 11 03 13
  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
  // c:
  // 20 30 22 32
  // 21 31 23 33
  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
  // d:
  // 00 10 20 30
  // 02 12 22 32
  const uint32x2x2_t d =
      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
  // e:
  // 01 11 21 31
  // 03 13 23 33
  const uint32x2x2_t e =
      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
  a[0] = vreinterpret_u16_u32(d.val[0]);
  a[1] = vreinterpret_u16_u32(e.val[0]);
  a[2] = vreinterpret_u16_u32(d.val[1]);
  a[3] = vreinterpret_u16_u32(e.val[1]);
}

// 4x8 Input:
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 00 10 20 30 04 14 24 34
// a[1]: 01 11 21 31 05 15 25 35
// a[2]: 02 12 22 32 06 16 26 36
// a[3]: 03 13 23 33 07 17 27 37
static INLINE void transpose_u16_4x8q(uint16x8_t a[4]) {
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));

  a[0] = vreinterpretq_u16_u32(c0.val[0]);
  a[1] = vreinterpretq_u16_u32(c1.val[0]);
  a[2] = vreinterpretq_u16_u32(c0.val[1]);
  a[3] = vreinterpretq_u16_u32(c1.val[1]);
}

static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(const uint32x4_t a0,
                                                const uint32x4_t a1) {
  uint16x8x2_t b0;
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
  return b0;
}

// Special transpose for loop filter.
// 4x8 Input:
// p_q:  p3 p2 p1 p0 q0 q1 q2 q3
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 03 13 23 33 04 14 24 34  p0q0
// a[1]: 02 12 22 32 05 15 25 35  p1q1
// a[2]: 01 11 21 31 06 16 26 36  p2q2
// a[3]: 00 10 20 30 07 17 27 37  p3q3
// Direct reapplication of the function will reset the high halves, but
// reverse the low halves:
// p_q:  p0 p1 p2 p3 q0 q1 q2 q3
// a[0]: 33 32 31 30 04 05 06 07
// a[1]: 23 22 21 20 14 15 16 17
// a[2]: 13 12 11 10 24 25 26 27
// a[3]: 03 02 01 00 34 35 36 37
// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
// reverse the high halves.
// The standard transpose_u16_4x8q will produce the same reversals, but with
// the order of the low halves also restored relative to the high halves. This
// is preferable because it puts all values from the same source row back
// together, but some post-processing is inevitable.
static INLINE void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) {
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // Reverse odd vectors to bring the appropriate items to the front of zips.
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // r0       : 03 13 01 11 07 17 05 15
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // r1       : 23 33 21 31 27 37 25 35
  const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
  const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));

  // Zip to complete the halves.
  // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
  // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
  // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
  // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
  const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vzipq_u32(r0, r1);

  // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
  // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
  // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
  // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
  const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]);
  // The third row of c comes first here to swap p2 with q0.
  const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]);

  // 8x4 Output:
  // a[0]: 03 13 23 33 04 14 24 34  p0q0
  // a[1]: 02 12 22 32 05 15 25 35  p1q1
  // a[2]: 01 11 21 31 06 16 26 36  p2q2
  // a[3]: 00 10 20 30 07 17 27 37  p3q3
  a[0] = d1.val[0];  // p0q0
  a[1] = d0.val[1];  // p1q1
  a[2] = d1.val[1];  // p2q2
  a[3] = d0.val[0];  // p3q3
}
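
// Illustrative sketch of how the loop-filter transpose above might be fed
// (an assumption for exposition only, not code used by the library): four
// rows of eight high-bitdepth pixels straddling a vertical edge are loaded
// and rearranged into p0q0..p3q3 form. The function name and the
// pointer/stride layout are hypothetical.
static INLINE void example_load_4x8_for_vertical_filter(const uint16_t *src,
                                                        int stride,
                                                        uint16x8_t pq[4]) {
  // src points at the first of four rows, four pixels to the left of the
  // filtered edge, so each row reads p3 p2 p1 p0 q0 q1 q2 q3.
  pq[0] = vld1q_u16(src + 0 * stride);
  pq[1] = vld1q_u16(src + 1 * stride);
  pq[2] = vld1q_u16(src + 2 * stride);
  pq[3] = vld1q_u16(src + 3 * stride);
  // Afterwards pq[0] holds p0q0, pq[1] p1q1, pq[2] p2q2 and pq[3] p3q3.
  loop_filter_transpose_u16_4x8q(pq);
}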

static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1,
                                     uint16x4_t *a2, uint16x4_t *a3,
                                     uint16x4_t *a4, uint16x4_t *a5,
                                     uint16x4_t *a6, uint16x4_t *a7,
                                     uint16x8_t *o0, uint16x8_t *o1,
                                     uint16x8_t *o2, uint16x8_t *o3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  uint16x4x2_t b0 = vtrn_u16(*a0, *a1);
  uint16x4x2_t b1 = vtrn_u16(*a2, *a3);
  uint16x4x2_t b2 = vtrn_u16(*a4, *a5);
  uint16x4x2_t b3 = vtrn_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                             vreinterpret_u32_u16(b1.val[0]));
  uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
                             vreinterpret_u32_u16(b1.val[1]));
  uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]),
                             vreinterpret_u32_u16(b3.val[0]));
  uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]),
                             vreinterpret_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  *o0 = vcombine_u16(vreinterpret_u16_u32(c0.val[0]),
                     vreinterpret_u16_u32(c2.val[0]));
  *o1 = vcombine_u16(vreinterpret_u16_u32(c1.val[0]),
                     vreinterpret_u16_u32(c3.val[0]));
  *o2 = vcombine_u16(vreinterpret_u16_u32(c0.val[1]),
                     vreinterpret_u16_u32(c2.val[1]));
  *o3 = vcombine_u16(vreinterpret_u16_u32(c1.val[1]),
                     vreinterpret_u16_u32(c3.val[1]));
}

static INLINE void transpose_s16_4x8(int16x4_t *a0, int16x4_t *a1,
                                     int16x4_t *a2, int16x4_t *a3,
                                     int16x4_t *a4, int16x4_t *a5,
                                     int16x4_t *a6, int16x4_t *a7,
                                     int16x8_t *o0, int16x8_t *o1,
                                     int16x8_t *o2, int16x8_t *o3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  int16x4x2_t b1 = vtrn_s16(*a2, *a3);
  int16x4x2_t b2 = vtrn_s16(*a4, *a5);
  int16x4x2_t b3 = vtrn_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                            vreinterpret_s32_s16(b1.val[0]));
  int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                            vreinterpret_s32_s16(b1.val[1]));
  int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
                            vreinterpret_s32_s16(b3.val[0]));
  int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
                            vreinterpret_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
                     vreinterpret_s16_s32(c2.val[0]));
  *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
                     vreinterpret_s16_s32(c3.val[0]));
  *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
                     vreinterpret_s16_s32(c2.val[1]));
  *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
                     vreinterpret_s16_s32(c3.val[1]));
}

static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3,
                                     uint16x8_t *a4, uint16x8_t *a5,
                                     uint16x8_t *a6, uint16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
                                    vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
                                    vreinterpretq_u32_u16(b3.val[1]));

  *a0 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[0])),
                     vget_low_u16(vreinterpretq_u16_u32(c2.val[0])));
  *a4 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[0])),
                     vget_high_u16(vreinterpretq_u16_u32(c2.val[0])));

  *a2 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[1])),
                     vget_low_u16(vreinterpretq_u16_u32(c2.val[1])));
  *a6 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[1])),
                     vget_high_u16(vreinterpretq_u16_u32(c2.val[1])));

  *a1 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[0])),
                     vget_low_u16(vreinterpretq_u16_u32(c3.val[0])));
  *a5 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[0])),
                     vget_high_u16(vreinterpretq_u16_u32(c3.val[0])));

  *a3 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[1])),
                     vget_low_u16(vreinterpretq_u16_u32(c3.val[1])));
  *a7 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[1])),
                     vget_high_u16(vreinterpretq_u16_u32(c3.val[1])));
}

static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                     int16x8_t *a2, int16x8_t *a3,
                                     int16x8_t *a4, int16x8_t *a5,
                                     int16x8_t *a6, int16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  *a0 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[0])),
                     vget_low_s16(vreinterpretq_s16_s32(c2.val[0])));
  *a4 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[0])),
                     vget_high_s16(vreinterpretq_s16_s32(c2.val[0])));

  *a2 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[1])),
                     vget_low_s16(vreinterpretq_s16_s32(c2.val[1])));
  *a6 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[1])),
                     vget_high_s16(vreinterpretq_s16_s32(c2.val[1])));

  *a1 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[0])),
                     vget_low_s16(vreinterpretq_s16_s32(c3.val[0])));
  *a5 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[0])),
                     vget_high_s16(vreinterpretq_s16_s32(c3.val[0])));

  *a3 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[1])),
                     vget_low_s16(vreinterpretq_s16_s32(c3.val[1])));
  *a7 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[1])),
                     vget_high_s16(vreinterpretq_s16_s32(c3.val[1])));
}

static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t b0;
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
  return b0;
}

static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *(a0 + 1));
  const int16x8x2_t b1 = vtrnq_s16(*(a0 + 2), *(a0 + 3));
  const int16x8x2_t b2 = vtrnq_s16(*(a0 + 4), *(a0 + 5));
  const int16x8x2_t b3 = vtrnq_s16(*(a0 + 6), *(a0 + 7));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77
  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  *out = d0.val[0];
  *(out + 1) = d1.val[0];
  *(out + 2) = d2.val[0];
  *(out + 3) = d3.val[0];
  *(out + 4) = d0.val[1];
  *(out + 5) = d1.val[1];
  *(out + 6) = d2.val[1];
  *(out + 7) = d3.val[1];
}
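
// Illustrative sketch (an assumption, not part of the library): because
// transpose_s16_8x8q() works on an array of eight int16x8_t registers, it is
// convenient when an 8x8 block of 16-bit values is loaded in one go, e.g.
// between the row and column passes of a transform. The function name and
// buffer layout are hypothetical.
static INLINE void example_transpose_s16_8x8_block(const int16_t *src,
                                                   int stride,
                                                   int16x8_t out[8]) {
  int16x8_t rows[8];
  for (int i = 0; i < 8; ++i) {
    rows[i] = vld1q_s16(src + i * stride);  // Load one row of eight values.
  }
  transpose_s16_8x8q(rows, out);  // out[i] now holds column i of the block.
}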

static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
                                      int16x4_t *a2, int16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));

  *a0 = vreinterpret_s16_s32(c0.val[0]);
  *a1 = vreinterpret_s16_s32(c1.val[0]);
  *a2 = vreinterpret_s16_s32(c0.val[1]);
  *a3 = vreinterpret_s16_s32(c1.val[1]);
}

static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b0;
  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
  return b0;
}

static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                     int32x4_t *a2, int32x4_t *a3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);

  *a0 = c0.val[0];
  *a1 = c1.val[0];
  *a2 = c0.val[1];
  *a3 = c1.val[1];
}
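
// Illustrative sketch (an assumption, not part of the library): loading a 4x4
// tile of 32-bit values from a flat buffer and transposing it in registers
// with transpose_s32_4x4(). The function name and buffer layout are
// hypothetical.
static INLINE void example_transpose_s32_4x4_tile(const int32_t *src,
                                                  int32x4_t out[4]) {
  out[0] = vld1q_s32(src + 0);
  out[1] = vld1q_s32(src + 4);
  out[2] = vld1q_s32(src + 8);
  out[3] = vld1q_s32(src + 12);
  // Rows become columns in place.
  transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]);
}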

#endif  // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_