/*
 * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
#define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_

#include <arm_neon.h>

// Swap high and low halves.
static INLINE uint16x8_t transpose64_u16q(const uint16x8_t a) {
  return vextq_u16(a, a, 4);
}

static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                    uint8x8_t *a6, uint8x8_t *a7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 =
      vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
  const uint8x16x2_t b1 =
      vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
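
// Illustrative sketch (not part of the original header): a typical caller
// loads eight rows of eight bytes, transposes them in registers, then stores
// the transposed rows. The helper name and the buffer/stride parameters are
// hypothetical and only shown to demonstrate how transpose_u8_8x8 is used.
static INLINE void example_transpose_u8_8x8_block(const uint8_t *src,
                                                  int src_stride, uint8_t *dst,
                                                  int dst_stride) {
  uint8x8_t r0 = vld1_u8(src + 0 * src_stride);
  uint8x8_t r1 = vld1_u8(src + 1 * src_stride);
  uint8x8_t r2 = vld1_u8(src + 2 * src_stride);
  uint8x8_t r3 = vld1_u8(src + 3 * src_stride);
  uint8x8_t r4 = vld1_u8(src + 4 * src_stride);
  uint8x8_t r5 = vld1_u8(src + 5 * src_stride);
  uint8x8_t r6 = vld1_u8(src + 6 * src_stride);
  uint8x8_t r7 = vld1_u8(src + 7 * src_stride);
  transpose_u8_8x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
  vst1_u8(dst + 0 * dst_stride, r0);
  vst1_u8(dst + 1 * dst_stride, r1);
  vst1_u8(dst + 2 * dst_stride, r2);
  vst1_u8(dst + 3 * dst_stride, r3);
  vst1_u8(dst + 4 * dst_stride, r4);
  vst1_u8(dst + 5 * dst_stride, r5);
  vst1_u8(dst + 6 * dst_stride, r6);
  vst1_u8(dst + 7 * dst_stride, r7);
}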

static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}

static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33

  const uint16x4x2_t b0 =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 02 03 22 23
  // c0.val[1]: 10 11 30 31 12 13 32 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b0.val[1]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, const uint8x8_t a4,
                                    const uint8x8_t a5, const uint8x8_t a6,
                                    const uint8x8_t a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // to:
  // b0.val[0]: 00 01 02 03 40 41 42 43
  // b1.val[0]: 10 11 12 13 50 51 52 53
  // b2.val[0]: 20 21 22 23 60 61 62 63
  // b3.val[0]: 30 31 32 33 70 71 72 73

  const uint32x2x2_t b0 =
      vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t b1 =
      vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t b2 =
      vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t b3 =
      vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 40 41 60 61
  // c0.val[1]: 02 03 22 23 42 43 62 63
  // c1.val[0]: 10 11 30 31 50 51 70 71
  // c1.val[1]: 12 13 32 33 52 53 72 73

  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
                                   vreinterpret_u16_u32(b2.val[0]));
  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
                                   vreinterpret_u16_u32(b3.val[0]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 01 11 21 31 41 51 61 71
  // d1.val[0]: 02 12 22 32 42 52 62 72
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
  const uint8x8x2_t d1 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
  *a2 = d1.val[0];
  *a3 = d1.val[1];
}

// Input:
// 00 01 02 03
// 10 11 12 13
// 20 21 22 23
// 30 31 32 33
// Output:
// 00 10 20 30
// 01 11 21 31
// 02 12 22 32
// 03 13 23 33
static INLINE void transpose_u16_4x4(uint16x4_t a[4]) {
  // b:
  // 00 10 02 12
  // 01 11 03 13
  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
  // c:
  // 20 30 22 32
  // 21 31 23 33
  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
  // d:
  // 00 10 20 30
  // 02 12 22 32
  const uint32x2x2_t d =
      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
  // e:
  // 01 11 21 31
  // 03 13 23 33
  const uint32x2x2_t e =
      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
  a[0] = vreinterpret_u16_u32(d.val[0]);
  a[1] = vreinterpret_u16_u32(e.val[0]);
  a[2] = vreinterpret_u16_u32(d.val[1]);
  a[3] = vreinterpret_u16_u32(e.val[1]);
}
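
// Illustrative sketch (not part of the original header): transpose a 4x4
// block of 16-bit pixels held in memory. The helper name and buffer/stride
// parameters are hypothetical; only the call to transpose_u16_4x4 comes from
// this header.
static INLINE void example_transpose_u16_4x4_block(const uint16_t *src,
                                                   int src_stride,
                                                   uint16_t *dst,
                                                   int dst_stride) {
  uint16x4_t rows[4];
  for (int i = 0; i < 4; i++) rows[i] = vld1_u16(src + i * src_stride);
  transpose_u16_4x4(rows);
  for (int i = 0; i < 4; i++) vst1_u16(dst + i * dst_stride, rows[i]);
}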

// 4x8 Input:
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 00 10 20 30 04 14 24 34
// a[1]: 01 11 21 31 05 15 25 35
// a[2]: 02 12 22 32 06 16 26 36
// a[3]: 03 13 23 33 07 17 27 37
static INLINE void transpose_u16_4x8q(uint16x8_t a[4]) {
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));

  a[0] = vreinterpretq_u16_u32(c0.val[0]);
  a[1] = vreinterpretq_u16_u32(c1.val[0]);
  a[2] = vreinterpretq_u16_u32(c0.val[1]);
  a[3] = vreinterpretq_u16_u32(c1.val[1]);
}

static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(const uint32x4_t a0,
                                                const uint32x4_t a1) {
  uint16x8x2_t b0;
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
  return b0;
}

// Special transpose for loop filter.
// 4x8 Input:
// p_q:  p3 p2 p1 p0 q0 q1 q2 q3
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 03 13 23 33 04 14 24 34  p0q0
// a[1]: 02 12 22 32 05 15 25 35  p1q1
// a[2]: 01 11 21 31 06 16 26 36  p2q2
// a[3]: 00 10 20 30 07 17 27 37  p3q3
// Direct reapplication of the function will reset the high halves, but
// reverse the low halves:
// p_q:  p0 p1 p2 p3 q0 q1 q2 q3
// a[0]: 33 32 31 30 04 05 06 07
// a[1]: 23 22 21 20 14 15 16 17
// a[2]: 13 12 11 10 24 25 26 27
// a[3]: 03 02 01 00 34 35 36 37
// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
// reverse the high halves.
// The standard transpose_u16_4x8q will produce the same reversals, but with
// the order of the low halves also restored relative to the high halves. This
// is preferable because it puts all values from the same source row back
// together, but some post-processing is inevitable.
static INLINE void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) {
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // Reverse odd vectors to bring the appropriate items to the front of zips.
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // r0       : 03 13 01 11 07 17 05 15
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // r1       : 23 33 21 31 27 37 25 35
  const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
  const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));

  // Zip to complete the halves.
  // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
  // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
  // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
  // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
  const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vzipq_u32(r0, r1);

  // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
  // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
  // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
  // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
  const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]);
  // The third row of c comes first here to swap p2 with q0.
  const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]);

  // 8x4 Output:
  // a[0]: 03 13 23 33 04 14 24 34  p0q0
  // a[1]: 02 12 22 32 05 15 25 35  p1q1
  // a[2]: 01 11 21 31 06 16 26 36  p2q2
  // a[3]: 00 10 20 30 07 17 27 37  p3q3
  a[0] = d1.val[0];  // p0q0
  a[1] = d0.val[1];  // p1q1
  a[2] = d1.val[1];  // p2q2
  a[3] = d0.val[0];  // p3q3
}

static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1,
                                     uint16x4_t *a2, uint16x4_t *a3,
                                     uint16x4_t *a4, uint16x4_t *a5,
                                     uint16x4_t *a6, uint16x4_t *a7,
                                     uint16x8_t *o0, uint16x8_t *o1,
                                     uint16x8_t *o2, uint16x8_t *o3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  uint16x4x2_t b0 = vtrn_u16(*a0, *a1);
  uint16x4x2_t b1 = vtrn_u16(*a2, *a3);
  uint16x4x2_t b2 = vtrn_u16(*a4, *a5);
  uint16x4x2_t b3 = vtrn_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                             vreinterpret_u32_u16(b1.val[0]));
  uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
                             vreinterpret_u32_u16(b1.val[1]));
  uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]),
                             vreinterpret_u32_u16(b3.val[0]));
  uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]),
                             vreinterpret_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  *o0 = vcombine_u16(vreinterpret_u16_u32(c0.val[0]),
                     vreinterpret_u16_u32(c2.val[0]));
  *o1 = vcombine_u16(vreinterpret_u16_u32(c1.val[0]),
                     vreinterpret_u16_u32(c3.val[0]));
  *o2 = vcombine_u16(vreinterpret_u16_u32(c0.val[1]),
                     vreinterpret_u16_u32(c2.val[1]));
  *o3 = vcombine_u16(vreinterpret_u16_u32(c1.val[1]),
                     vreinterpret_u16_u32(c3.val[1]));
}
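
// Illustrative sketch (not part of the original header): turn eight rows of
// four 16-bit pixels into four 8-lane output rows, as a high-bitdepth
// vertical filter might before working along the other axis. The helper name
// and the buffer/stride parameters are hypothetical.
static INLINE void example_transpose_u16_4x8_block(const uint16_t *src,
                                                   int src_stride,
                                                   uint16x8_t out[4]) {
  uint16x4_t r0 = vld1_u16(src + 0 * src_stride);
  uint16x4_t r1 = vld1_u16(src + 1 * src_stride);
  uint16x4_t r2 = vld1_u16(src + 2 * src_stride);
  uint16x4_t r3 = vld1_u16(src + 3 * src_stride);
  uint16x4_t r4 = vld1_u16(src + 4 * src_stride);
  uint16x4_t r5 = vld1_u16(src + 5 * src_stride);
  uint16x4_t r6 = vld1_u16(src + 6 * src_stride);
  uint16x4_t r7 = vld1_u16(src + 7 * src_stride);
  transpose_u16_4x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7, &out[0], &out[1],
                    &out[2], &out[3]);
}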

static INLINE void transpose_s16_4x8(int16x4_t *a0, int16x4_t *a1,
                                     int16x4_t *a2, int16x4_t *a3,
                                     int16x4_t *a4, int16x4_t *a5,
                                     int16x4_t *a6, int16x4_t *a7,
                                     int16x8_t *o0, int16x8_t *o1,
                                     int16x8_t *o2, int16x8_t *o3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  int16x4x2_t b1 = vtrn_s16(*a2, *a3);
  int16x4x2_t b2 = vtrn_s16(*a4, *a5);
  int16x4x2_t b3 = vtrn_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                            vreinterpret_s32_s16(b1.val[0]));
  int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                            vreinterpret_s32_s16(b1.val[1]));
  int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
                            vreinterpret_s32_s16(b3.val[0]));
  int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
                            vreinterpret_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
                     vreinterpret_s16_s32(c2.val[0]));
  *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
                     vreinterpret_s16_s32(c3.val[0]));
  *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
                     vreinterpret_s16_s32(c2.val[1]));
  *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
                     vreinterpret_s16_s32(c3.val[1]));
}

static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3,
                                     uint16x8_t *a4, uint16x8_t *a5,
                                     uint16x8_t *a6, uint16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
                                    vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
                                    vreinterpretq_u32_u16(b3.val[1]));

  *a0 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[0])),
                     vget_low_u16(vreinterpretq_u16_u32(c2.val[0])));
  *a4 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[0])),
                     vget_high_u16(vreinterpretq_u16_u32(c2.val[0])));

  *a2 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[1])),
                     vget_low_u16(vreinterpretq_u16_u32(c2.val[1])));
  *a6 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[1])),
                     vget_high_u16(vreinterpretq_u16_u32(c2.val[1])));

  *a1 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[0])),
                     vget_low_u16(vreinterpretq_u16_u32(c3.val[0])));
  *a5 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[0])),
                     vget_high_u16(vreinterpretq_u16_u32(c3.val[0])));

  *a3 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[1])),
                     vget_low_u16(vreinterpretq_u16_u32(c3.val[1])));
  *a7 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[1])),
                     vget_high_u16(vreinterpretq_u16_u32(c3.val[1])));
}

static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                     int16x8_t *a2, int16x8_t *a3,
                                     int16x8_t *a4, int16x8_t *a5,
                                     int16x8_t *a6, int16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  *a0 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[0])),
                     vget_low_s16(vreinterpretq_s16_s32(c2.val[0])));
  *a4 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[0])),
                     vget_high_s16(vreinterpretq_s16_s32(c2.val[0])));

  *a2 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[1])),
                     vget_low_s16(vreinterpretq_s16_s32(c2.val[1])));
  *a6 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[1])),
                     vget_high_s16(vreinterpretq_s16_s32(c2.val[1])));

  *a1 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[0])),
                     vget_low_s16(vreinterpretq_s16_s32(c3.val[0])));
  *a5 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[0])),
                     vget_high_s16(vreinterpretq_s16_s32(c3.val[0])));

  *a3 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[1])),
                     vget_low_s16(vreinterpretq_s16_s32(c3.val[1])));
  *a7 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[1])),
                     vget_high_s16(vreinterpretq_s16_s32(c3.val[1])));
}

static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t b0;
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
  return b0;
}

static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *(a0 + 1));
  const int16x8x2_t b1 = vtrnq_s16(*(a0 + 2), *(a0 + 3));
  const int16x8x2_t b2 = vtrnq_s16(*(a0 + 4), *(a0 + 5));
  const int16x8x2_t b3 = vtrnq_s16(*(a0 + 6), *(a0 + 7));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77
  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  *out = d0.val[0];
  *(out + 1) = d1.val[0];
  *(out + 2) = d2.val[0];
  *(out + 3) = d3.val[0];
  *(out + 4) = d0.val[1];
  *(out + 5) = d1.val[1];
  *(out + 6) = d2.val[1];
  *(out + 7) = d3.val[1];
}
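
// Illustrative sketch (not part of the original header): transpose an 8x8
// block of 16-bit coefficients stored contiguously in memory (stride 8), as
// a transform stage might do between row and column passes. The helper name
// and buffer parameters are hypothetical.
static INLINE void example_transpose_s16_8x8_block(const int16_t *in,
                                                   int16_t *out) {
  int16x8_t rows[8], cols[8];
  for (int i = 0; i < 8; i++) rows[i] = vld1q_s16(in + i * 8);
  transpose_s16_8x8q(rows, cols);
  for (int i = 0; i < 8; i++) vst1q_s16(out + i * 8, cols[i]);
}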

static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
                                      int16x4_t *a2, int16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));

  *a0 = vreinterpret_s16_s32(c0.val[0]);
  *a1 = vreinterpret_s16_s32(c1.val[0]);
  *a2 = vreinterpret_s16_s32(c0.val[1]);
  *a3 = vreinterpret_s16_s32(c1.val[1]);
}

static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b0;
  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
  return b0;
}

static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                     int32x4_t *a2, int32x4_t *a3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);

  *a0 = c0.val[0];
  *a1 = c1.val[0];
  *a2 = c0.val[1];
  *a3 = c1.val[1];
}
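
// Illustrative sketch (not part of the original header): transpose a 4x4
// block of 32-bit values held contiguously in memory (stride 4), e.g.
// intermediate transform coefficients. The helper name and buffer parameter
// are hypothetical.
static INLINE void example_transpose_s32_4x4_block(int32_t *buf) {
  int32x4_t r0 = vld1q_s32(buf + 0);
  int32x4_t r1 = vld1q_s32(buf + 4);
  int32x4_t r2 = vld1q_s32(buf + 8);
  int32x4_t r3 = vld1q_s32(buf + 12);
  transpose_s32_4x4(&r0, &r1, &r2, &r3);
  vst1q_s32(buf + 0, r0);
  vst1q_s32(buf + 4, r1);
  vst1q_s32(buf + 8, r2);
  vst1q_s32(buf + 12, r3);
}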

#endif  // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_