/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
#define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_

#include <arm_neon.h>

#include "aom/aom_integer.h"  // For AOM_FORCE_INLINE.
#include "config/aom_config.h"

static INLINE void transpose_elems_u8_8x8(
    uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4,
    uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1,
    uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
    uint8x8_t *o7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(a0, a4), vcombine_u8(a1, a5));
  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(a2, a6), vcombine_u8(a3, a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *o0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *o1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *o2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *o3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *o4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *o5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *o6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *o7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}

static INLINE void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1,
                                                  uint8x8_t *a2, uint8x8_t *a3,
                                                  uint8x8_t *a4, uint8x8_t *a5,
                                                  uint8x8_t *a6,
                                                  uint8x8_t *a7) {
  transpose_elems_u8_8x8(*a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7, a0, a1, a2, a3,
                         a4, a5, a6, a7);
}

static INLINE void transpose_arrays_u8_8x8(const uint8x8_t *in,
                                           uint8x8_t *out) {
  transpose_elems_u8_8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                         &out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
                         &out[6], &out[7]);
}
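
// Usage sketch for the 8x8 byte transposes above (illustrative only; `src`,
// `dst` and `stride` are hypothetical names, not part of this header):
//   uint8x8_t rows[8], cols[8];
//   for (int i = 0; i < 8; ++i) rows[i] = vld1_u8(src + i * stride);
//   transpose_arrays_u8_8x8(rows, cols);  // cols[i] holds input column i.
//   for (int i = 0; i < 8; ++i) vst1_u8(dst + i * stride, cols[i]);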

static AOM_FORCE_INLINE void transpose_arrays_u8_8x16(const uint8x8_t *x,
                                                      uint8x16_t *d) {
  uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
  uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
  uint8x8x2_t w2 = vzip_u8(x[4], x[5]);
  uint8x8x2_t w3 = vzip_u8(x[6], x[7]);

  uint8x8x2_t w8 = vzip_u8(x[8], x[9]);
  uint8x8x2_t w9 = vzip_u8(x[10], x[11]);
  uint8x8x2_t w10 = vzip_u8(x[12], x[13]);
  uint8x8x2_t w11 = vzip_u8(x[14], x[15]);

  uint16x4x2_t w4 =
      vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
  uint16x4x2_t w5 =
      vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
  uint16x4x2_t w12 =
      vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0]));
  uint16x4x2_t w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]),
                              vreinterpret_u16_u8(w11.val[0]));

  uint32x2x2_t w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
                             vreinterpret_u32_u16(w5.val[0]));
  uint32x2x2_t w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
                             vreinterpret_u32_u16(w5.val[1]));
  uint32x2x2_t w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
                              vreinterpret_u32_u16(w13.val[0]));
  uint32x2x2_t w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
                              vreinterpret_u32_u16(w13.val[1]));

  // Store first 4-line result
  d[0] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
  d[1] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
  d[2] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
  d[3] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));

  w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
  w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
  w12 =
      vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1]));
  w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]),
                 vreinterpret_u16_u8(w11.val[1]));

  w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
                vreinterpret_u32_u16(w5.val[0]));
  w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
                vreinterpret_u32_u16(w5.val[1]));
  w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
                 vreinterpret_u32_u16(w13.val[0]));
  w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
                 vreinterpret_u32_u16(w13.val[1]));

  // Store second 4-line result
  d[4] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
  d[5] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
  d[6] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
  d[7] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));
}

static AOM_FORCE_INLINE void transpose_arrays_u8_16x8(const uint8x16_t *x,
                                                      uint8x8_t *d) {
  uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
  uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
  uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
  uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);

  uint16x8x2_t w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
                              vreinterpretq_u16_u8(w1.val[0]));
  uint16x8x2_t w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
                              vreinterpretq_u16_u8(w3.val[0]));
  uint16x8x2_t w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
                              vreinterpretq_u16_u8(w1.val[1]));
  uint16x8x2_t w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
                              vreinterpretq_u16_u8(w3.val[1]));

  uint32x4x2_t w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
                              vreinterpretq_u32_u16(w5.val[0]));
  uint32x4x2_t w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]),
                              vreinterpretq_u32_u16(w7.val[0]));
  uint32x4x2_t w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
                               vreinterpretq_u32_u16(w5.val[1]));
  uint32x4x2_t w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
                               vreinterpretq_u32_u16(w7.val[1]));

  d[0] = vreinterpret_u8_u32(vget_low_u32(w8.val[0]));
  d[1] = vreinterpret_u8_u32(vget_high_u32(w8.val[0]));
  d[2] = vreinterpret_u8_u32(vget_low_u32(w8.val[1]));
  d[3] = vreinterpret_u8_u32(vget_high_u32(w8.val[1]));
  d[4] = vreinterpret_u8_u32(vget_low_u32(w10.val[0]));
  d[5] = vreinterpret_u8_u32(vget_high_u32(w10.val[0]));
  d[6] = vreinterpret_u8_u32(vget_low_u32(w10.val[1]));
  d[7] = vreinterpret_u8_u32(vget_high_u32(w10.val[1]));
  d[8] = vreinterpret_u8_u32(vget_low_u32(w9.val[0]));
  d[9] = vreinterpret_u8_u32(vget_high_u32(w9.val[0]));
  d[10] = vreinterpret_u8_u32(vget_low_u32(w9.val[1]));
  d[11] = vreinterpret_u8_u32(vget_high_u32(w9.val[1]));
  d[12] = vreinterpret_u8_u32(vget_low_u32(w11.val[0]));
  d[13] = vreinterpret_u8_u32(vget_high_u32(w11.val[0]));
  d[14] = vreinterpret_u8_u32(vget_low_u32(w11.val[1]));
  d[15] = vreinterpret_u8_u32(vget_high_u32(w11.val[1]));
}

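// Transpose the 64-bit lanes of a0 and a1 (the equivalent of a vtrn1q/vtrn2q
// pair on 64-bit elements), returning the result viewed as uint16x8_t vectors.
// On AArch32, where 64-bit transpose instructions are unavailable, the halves
// are recombined manually.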
static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  uint16x8x2_t b0;
#if AOM_ARCH_AARCH64
  b0.val[0] = vreinterpretq_u16_u64(
      vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] = vreinterpretq_u16_u64(
      vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
#endif
  return b0;
}

static INLINE void transpose_arrays_u8_16x16(const uint8x16_t *x,
                                             uint8x16_t *d) {
  uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
  uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
  uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
  uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);

  uint8x16x2_t w4 = vzipq_u8(x[8], x[9]);
  uint8x16x2_t w5 = vzipq_u8(x[10], x[11]);
  uint8x16x2_t w6 = vzipq_u8(x[12], x[13]);
  uint8x16x2_t w7 = vzipq_u8(x[14], x[15]);

  uint16x8x2_t w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
                              vreinterpretq_u16_u8(w1.val[0]));
  uint16x8x2_t w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
                              vreinterpretq_u16_u8(w3.val[0]));
  uint16x8x2_t w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]),
                               vreinterpretq_u16_u8(w5.val[0]));
  uint16x8x2_t w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]),
                               vreinterpretq_u16_u8(w7.val[0]));

  uint32x4x2_t w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
                               vreinterpretq_u32_u16(w9.val[0]));
  uint32x4x2_t w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
                               vreinterpretq_u32_u16(w11.val[0]));
  uint32x4x2_t w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
                               vreinterpretq_u32_u16(w9.val[1]));
  uint32x4x2_t w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
                               vreinterpretq_u32_u16(w11.val[1]));

  uint16x8x2_t d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
  d[0] = vreinterpretq_u8_u16(d01.val[0]);
  d[1] = vreinterpretq_u8_u16(d01.val[1]);
  uint16x8x2_t d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
  d[2] = vreinterpretq_u8_u16(d23.val[0]);
  d[3] = vreinterpretq_u8_u16(d23.val[1]);
  uint16x8x2_t d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
  d[4] = vreinterpretq_u8_u16(d45.val[0]);
  d[5] = vreinterpretq_u8_u16(d45.val[1]);
  uint16x8x2_t d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
  d[6] = vreinterpretq_u8_u16(d67.val[0]);
  d[7] = vreinterpretq_u8_u16(d67.val[1]);

  // upper half
  w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
                 vreinterpretq_u16_u8(w1.val[1]));
  w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
                 vreinterpretq_u16_u8(w3.val[1]));
  w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]),
                  vreinterpretq_u16_u8(w5.val[1]));
  w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]),
                  vreinterpretq_u16_u8(w7.val[1]));

  w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
                  vreinterpretq_u32_u16(w9.val[0]));
  w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
                  vreinterpretq_u32_u16(w11.val[0]));
  w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
                  vreinterpretq_u32_u16(w9.val[1]));
  w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
                  vreinterpretq_u32_u16(w11.val[1]));

  d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
  d[8] = vreinterpretq_u8_u16(d01.val[0]);
  d[9] = vreinterpretq_u8_u16(d01.val[1]);
  d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
  d[10] = vreinterpretq_u8_u16(d23.val[0]);
  d[11] = vreinterpretq_u8_u16(d23.val[1]);
  d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
  d[12] = vreinterpretq_u8_u16(d45.val[0]);
  d[13] = vreinterpretq_u8_u16(d45.val[1]);
  d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
  d[14] = vreinterpretq_u8_u16(d67.val[0]);
  d[15] = vreinterpretq_u8_u16(d67.val[1]);
}
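
// Usage sketch (illustrative only; `src`, `dst` and `stride` are hypothetical
// names, not part of this header):
//   uint8x16_t rows[16], cols[16];
//   for (int i = 0; i < 16; ++i) rows[i] = vld1q_u8(src + i * stride);
//   transpose_arrays_u8_16x16(rows, cols);  // cols[i] holds input column i.
//   for (int i = 0; i < 16; ++i) vst1q_u8(dst + i * stride, cols[i]);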

static AOM_FORCE_INLINE void transpose_arrays_u8_32x16(const uint8x16x2_t *x,
                                                       uint8x16_t *d) {
  uint8x16_t x2[32];
  for (int i = 0; i < 16; ++i) {
    x2[i] = x[i].val[0];
    x2[i + 16] = x[i].val[1];
  }
  transpose_arrays_u8_16x16(x2, d);
  transpose_arrays_u8_16x16(x2 + 16, d + 16);
}

static INLINE void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1,
                                                  uint8x8_t *a2,
                                                  uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}

static INLINE void transpose_elems_inplace_u8_16x4(uint8x16_t *a0,
                                                   uint8x16_t *a1,
                                                   uint8x16_t *a2,
                                                   uint8x16_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07 08 09 010 011 012 013 014 015
  // a1: 10 11 12 13 14 15 16 17 18 19 110 111 112 113 114 115
  // a2: 20 21 22 23 24 25 26 27 28 29 210 211 212 213 214 215
  // a3: 30 31 32 33 34 35 36 37 38 39 310 311 312 313 314 315
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 010 110 012 112 014 114
  // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 011 111 013 113 015 115
  // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 210 310 212 312 214 314
  // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 211 311 213 313 215 315

  const uint8x16x2_t b0 = vtrnq_u8(*a0, *a1);
  const uint8x16x2_t b1 = vtrnq_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 012 112 212 312
  // c0.val[1]: 02 12 22 32 06 16 26 36 010 110 210 310 014 114 214 314
  // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 013 113 213 313
  // c1.val[1]: 03 13 23 33 07 17 27 37 011 111 211 311 015 115 215 315

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  *a0 = vreinterpretq_u8_u16(c0.val[0]);
  *a1 = vreinterpretq_u8_u16(c1.val[0]);
  *a2 = vreinterpretq_u8_u16(c0.val[1]);
  *a3 = vreinterpretq_u8_u16(c1.val[1]);
}

static INLINE void transpose_elems_inplace_u8_4x4(uint8x8_t *a0,
                                                  uint8x8_t *a1) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33

  const uint16x4x2_t b0 =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 02 03 22 23
  // c0.val[1]: 10 11 30 31 12 13 32 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b0.val[1]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

static INLINE void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1,
                                          uint8x8_t a2, uint8x8_t a3,
                                          uint8x8_t a4, uint8x8_t a5,
                                          uint8x8_t a6, uint8x8_t a7,
                                          uint8x8_t *o0, uint8x8_t *o1,
                                          uint8x8_t *o2, uint8x8_t *o3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // to:
  // b0.val[0]: 00 01 02 03 40 41 42 43
  // b1.val[0]: 10 11 12 13 50 51 52 53
  // b2.val[0]: 20 21 22 23 60 61 62 63
  // b3.val[0]: 30 31 32 33 70 71 72 73

  const uint32x2x2_t b0 =
      vtrn_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t b1 =
      vtrn_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t b2 =
      vtrn_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t b3 =
      vtrn_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 40 41 60 61
  // c0.val[1]: 02 03 22 23 42 43 62 63
  // c1.val[0]: 10 11 30 31 50 51 70 71
  // c1.val[1]: 12 13 32 33 52 53 72 73

  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
                                   vreinterpret_u16_u32(b2.val[0]));
  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
                                   vreinterpret_u16_u32(b3.val[0]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 01 11 21 31 41 51 61 71
  // d1.val[0]: 02 12 22 32 42 52 62 72
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
  const uint8x8x2_t d1 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));

  *o0 = d0.val[0];
  *o1 = d0.val[1];
  *o2 = d1.val[0];
  *o3 = d1.val[1];
}

static INLINE void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) {
  // Input:
  // 00 01 02 03
  // 10 11 12 13
  // 20 21 22 23
  // 30 31 32 33

  // b:
  // 00 10 02 12
  // 01 11 03 13
  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
  // c:
  // 20 30 22 32
  // 21 31 23 33
  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
  // d:
  // 00 10 20 30
  // 02 12 22 32
  const uint32x2x2_t d =
      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
  // e:
  // 01 11 21 31
  // 03 13 23 33
  const uint32x2x2_t e =
      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));

  // Output:
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  a[0] = vreinterpret_u16_u32(d.val[0]);
  a[1] = vreinterpret_u16_u32(e.val[0]);
  a[2] = vreinterpret_u16_u32(d.val[1]);
  a[3] = vreinterpret_u16_u32(e.val[1]);
}

static INLINE void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) {
  // 4x8 Input:
  // a[0]: 00 01 02 03 04 05 06 07
  // a[1]: 10 11 12 13 14 15 16 17
  // a[2]: 20 21 22 23 24 25 26 27
  // a[3]: 30 31 32 33 34 35 36 37

  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));

  // 8x4 Output:
  // a[0]: 00 10 20 30 04 14 24 34
  // a[1]: 01 11 21 31 05 15 25 35
  // a[2]: 02 12 22 32 06 16 26 36
  // a[3]: 03 13 23 33 07 17 27 37
  a[0] = vreinterpretq_u16_u32(c0.val[0]);
  a[1] = vreinterpretq_u16_u32(c1.val[0]);
  a[2] = vreinterpretq_u16_u32(c0.val[1]);
  a[3] = vreinterpretq_u16_u32(c1.val[1]);
}

// Special transpose for loop filter.
// 4x8 Input:
// p_q:  p3 p2 p1 p0 q0 q1 q2 q3
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 03 13 23 33 04 14 24 34  p0q0
// a[1]: 02 12 22 32 05 15 25 35  p1q1
// a[2]: 01 11 21 31 06 16 26 36  p2q2
// a[3]: 00 10 20 30 07 17 27 37  p3q3
// Direct reapplication of the function will reset the high halves, but
// reverse the low halves:
// p_q:  p0 p1 p2 p3 q0 q1 q2 q3
// a[0]: 33 32 31 30 04 05 06 07
// a[1]: 23 22 21 20 14 15 16 17
// a[2]: 13 12 11 10 24 25 26 27
// a[3]: 03 02 01 00 34 35 36 37
// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
// reverse the high halves.
// The standard transpose_array_inplace_u16_4x8 will produce the same
// reversals, but with the order of the low halves also restored relative to
// the high halves. This is preferable because it puts all values from the same
// source row back together, but some post-processing is inevitable.
static INLINE void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) {
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // Reverse odd vectors to bring the appropriate items to the front of zips.
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // r0       : 03 13 01 11 07 17 05 15
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // r1       : 23 33 21 31 27 37 25 35
  const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
  const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));

  // Zip to complete the halves.
  // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
  // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
  // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
  // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
  const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vzipq_u32(r0, r1);

  // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
  // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
  // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
  // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
  const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]);
  // The third row of c comes first here to swap p2 with q0.
  const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]);

  // 8x4 Output:
  // a[0]: 03 13 23 33 04 14 24 34  p0q0
  // a[1]: 02 12 22 32 05 15 25 35  p1q1
  // a[2]: 01 11 21 31 06 16 26 36  p2q2
  // a[3]: 00 10 20 30 07 17 27 37  p3q3
  a[0] = d1.val[0];  // p0q0
  a[1] = d0.val[1];  // p1q1
  a[2] = d1.val[1];  // p2q2
  a[3] = d0.val[0];  // p3q3
}
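
// Usage sketch (illustrative only; `s` and `pitch` are hypothetical names for
// a pointer to the first pixel right of a vertical edge and the row stride):
//   uint16x8_t pq[4];
//   // Each load picks up p3 p2 p1 p0 q0 q1 q2 q3 for one row.
//   pq[0] = vld1q_u16(s - 4 + 0 * pitch);
//   pq[1] = vld1q_u16(s - 4 + 1 * pitch);
//   pq[2] = vld1q_u16(s - 4 + 2 * pitch);
//   pq[3] = vld1q_u16(s - 4 + 3 * pitch);
//   loop_filter_transpose_u16_4x8q(pq);
//   // pq[0] = p0q0, pq[1] = p1q1, pq[2] = p2q2, pq[3] = p3q3.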

static INLINE void transpose_elems_u16_4x8(
    const uint16x4_t a0, const uint16x4_t a1, const uint16x4_t a2,
    const uint16x4_t a3, const uint16x4_t a4, const uint16x4_t a5,
    const uint16x4_t a6, const uint16x4_t a7, uint16x8_t *o0, uint16x8_t *o1,
    uint16x8_t *o2, uint16x8_t *o3) {
  // Combine rows. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0: 00 01 02 03 40 41 42 43
  // b1: 10 11 12 13 50 51 52 53
  // b2: 20 21 22 23 60 61 62 63
  // b3: 30 31 32 33 70 71 72 73

  const uint16x8_t b0 = vcombine_u16(a0, a4);
  const uint16x8_t b1 = vcombine_u16(a1, a5);
  const uint16x8_t b2 = vcombine_u16(a2, a6);
  const uint16x8_t b3 = vcombine_u16(a3, a7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 02 12 40 50 42 52
  // c0.val[1]: 01 11 03 13 41 51 43 53
  // c1.val[0]: 20 30 22 32 60 70 62 72
  // c1.val[1]: 21 31 23 33 61 71 63 73

  const uint16x8x2_t c0 = vtrnq_u16(b0, b1);
  const uint16x8x2_t c1 = vtrnq_u16(b2, b3);

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 02 12 22 32 42 52 62 72
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *o0 = vreinterpretq_u16_u32(d0.val[0]);
  *o1 = vreinterpretq_u16_u32(d1.val[0]);
  *o2 = vreinterpretq_u16_u32(d0.val[1]);
  *o3 = vreinterpretq_u16_u32(d1.val[1]);
}

static INLINE void transpose_elems_s16_4x8(
    const int16x4_t a0, const int16x4_t a1, const int16x4_t a2,
    const int16x4_t a3, const int16x4_t a4, const int16x4_t a5,
    const int16x4_t a6, const int16x4_t a7, int16x8_t *o0, int16x8_t *o1,
    int16x8_t *o2, int16x8_t *o3) {
  // Combine rows. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0: 00 01 02 03 40 41 42 43
  // b1: 10 11 12 13 50 51 52 53
  // b2: 20 21 22 23 60 61 62 63
  // b3: 30 31 32 33 70 71 72 73

  const int16x8_t b0 = vcombine_s16(a0, a4);
  const int16x8_t b1 = vcombine_s16(a1, a5);
  const int16x8_t b2 = vcombine_s16(a2, a6);
  const int16x8_t b3 = vcombine_s16(a3, a7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 02 12 40 50 42 52
  // c0.val[1]: 01 11 03 13 41 51 43 53
  // c1.val[0]: 20 30 22 32 60 70 62 72
  // c1.val[1]: 21 31 23 33 61 71 63 73

  const int16x8x2_t c0 = vtrnq_s16(b0, b1);
  const int16x8x2_t c1 = vtrnq_s16(b2, b3);

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 02 12 22 32 42 52 62 72
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
                                   vreinterpretq_s32_s16(c1.val[0]));
  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
                                   vreinterpretq_s32_s16(c1.val[1]));

  *o0 = vreinterpretq_s16_s32(d0.val[0]);
  *o1 = vreinterpretq_s16_s32(d1.val[0]);
  *o2 = vreinterpretq_s16_s32(d0.val[1]);
  *o3 = vreinterpretq_s16_s32(d1.val[1]);
}

static INLINE void transpose_elems_inplace_u16_8x8(
    uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3,
    uint16x8_t *a4, uint16x8_t *a5, uint16x8_t *a6, uint16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
                                    vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
                                    vreinterpretq_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
  const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
  const uint16x8x2_t d2 = aom_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
  const uint16x8x2_t d3 = aom_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}
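
// Usage sketch (illustrative only; `src` and `stride` are hypothetical names
// for a high bit-depth 8x8 block and its row stride in uint16_t units):
//   uint16x8_t r0 = vld1q_u16(src + 0 * stride);
//   uint16x8_t r1 = vld1q_u16(src + 1 * stride);
//   ...
//   uint16x8_t r7 = vld1q_u16(src + 7 * stride);
//   transpose_elems_inplace_u16_8x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
//   // r0..r7 now hold the columns of the original block.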

static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t b0;
#if AOM_ARCH_AARCH64
  b0.val[0] = vreinterpretq_s16_s64(
      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
  b0.val[1] = vreinterpretq_s16_s64(
      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
#else
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
#endif
  return b0;
}

static INLINE void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                                    int16x8_t *a2, int16x8_t *a3,
                                                    int16x8_t *a4, int16x8_t *a5,
                                                    int16x8_t *a6,
                                                    int16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static INLINE void transpose_arrays_s16_8x8(const int16x8_t *a,
                                            int16x8_t *out) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
  const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
  const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
  const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  out[0] = d0.val[0];
  out[1] = d1.val[0];
  out[2] = d2.val[0];
  out[3] = d3.val[0];
  out[4] = d0.val[1];
  out[5] = d1.val[1];
  out[6] = d2.val[1];
  out[7] = d3.val[1];
}

static INLINE void transpose_elems_inplace_s16_8x4(int16x8_t *a0, int16x8_t *a1,
                                                    int16x8_t *a2,
                                                    int16x8_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));

  *a0 = vreinterpretq_s16_s32(c0.val[0]);
  *a1 = vreinterpretq_s16_s32(c1.val[0]);
  *a2 = vreinterpretq_s16_s32(c0.val[1]);
  *a3 = vreinterpretq_s16_s32(c1.val[1]);
}

static INLINE void transpose_elems_inplace_u16_4x4(uint16x4_t *a0,
                                                   uint16x4_t *a1,
                                                   uint16x4_t *a2,
                                                   uint16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const uint16x4x2_t b0 = vtrn_u16(*a0, *a1);
  const uint16x4x2_t b1 = vtrn_u16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b1.val[0]));
  const uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
                                   vreinterpret_u32_u16(b1.val[1]));

  *a0 = vreinterpret_u16_u32(c0.val[0]);
  *a1 = vreinterpret_u16_u32(c1.val[0]);
  *a2 = vreinterpret_u16_u32(c0.val[1]);
  *a3 = vreinterpret_u16_u32(c1.val[1]);
}

static INLINE void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1,
                                                    int16x4_t *a2,
                                                    int16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));

  *a0 = vreinterpret_s16_s32(c0.val[0]);
  *a1 = vreinterpret_s16_s32(c1.val[0]);
  *a2 = vreinterpret_s16_s32(c0.val[1]);
  *a3 = vreinterpret_s16_s32(c1.val[1]);
}

static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b0;
#if AOM_ARCH_AARCH64
  b0.val[0] = vreinterpretq_s32_s64(
      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
  b0.val[1] = vreinterpretq_s32_s64(
      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
#else
  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
#endif
  return b0;
}

static INLINE void transpose_elems_s32_4x4(const int32x4_t a0,
                                           const int32x4_t a1,
                                           const int32x4_t a2,
                                           const int32x4_t a3, int32x4_t *o0,
                                           int32x4_t *o1, int32x4_t *o2,
                                           int32x4_t *o3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int32x4x2_t b0 = vtrnq_s32(a0, a1);
  const int32x4x2_t b1 = vtrnq_s32(a2, a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);

  *o0 = c0.val[0];
  *o1 = c1.val[0];
  *o2 = c0.val[1];
  *o3 = c1.val[1];
}

static INLINE void transpose_elems_inplace_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                                    int32x4_t *a2,
                                                    int32x4_t *a3) {
  transpose_elems_s32_4x4(*a0, *a1, *a2, *a3, a0, a1, a2, a3);
}

static INLINE void transpose_arrays_s32_4x4(const int32x4_t *in,
                                            int32x4_t *out) {
  transpose_elems_s32_4x4(in[0], in[1], in[2], in[3], &out[0], &out[1], &out[2],
                          &out[3]);
}

static AOM_FORCE_INLINE void transpose_arrays_s32_4nx4n(const int32x4_t *in,
                                                        int32x4_t *out,
                                                        const int width,
                                                        const int height) {
  const int h = height >> 2;
  const int w = width >> 2;
  for (int j = 0; j < w; j++) {
    for (int i = 0; i < h; i++) {
      transpose_arrays_s32_4x4(in + j * height + i * 4,
                               out + i * width + j * 4);
    }
  }
}

#define TRANSPOSE_ARRAYS_S32_WXH_NEON(w, h)                             \
  static AOM_FORCE_INLINE void transpose_arrays_s32_##w##x##h(          \
      const int32x4_t *in, int32x4_t *out) {                            \
    transpose_arrays_s32_4nx4n(in, out, w, h);                          \
  }

TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 4)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 64)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 64)
TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 32)

#undef TRANSPOSE_ARRAYS_S32_WXH_NEON
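
// Usage sketch for the transpose_arrays_s32_* helpers above (illustrative
// only; `src` is a hypothetical pointer to a row-major 8x8 block of int32_t).
// The matrices are passed as arrays of int32x4_t where vector index
// (c / 4) * rows + r holds elements (r, c) .. (r, c + 3):
//   int32x4_t a[16], b[16];
//   for (int c = 0; c < 8; c += 4)
//     for (int r = 0; r < 8; ++r)
//       a[(c / 4) * 8 + r] = vld1q_s32(src + r * 8 + c);
//   transpose_arrays_s32_8x8(a, b);  // b holds the transpose, same layout.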

static INLINE int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) {
#if AOM_ARCH_AARCH64
  return vtrn1q_s64(a, b);
#else
  return vcombine_s64(vget_low_s64(a), vget_low_s64(b));
#endif
}

static INLINE int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) {
#if AOM_ARCH_AARCH64
  return vtrn2q_s64(a, b);
#else
  return vcombine_s64(vget_high_s64(a), vget_high_s64(b));
#endif
}

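// In the s32 4x8 and 8x8 transposes below, each int32x4x2_t represents one row
// of eight 32-bit elements: val[0] holds columns 0-3 and val[1] holds
// columns 4-7.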
static INLINE void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1,
                                           int32x4_t a2, int32x4_t a3,
                                           int32x4_t a4, int32x4_t a5,
                                           int32x4_t a6, int32x4_t a7,
                                           int32x4x2_t *o0, int32x4x2_t *o1,
                                           int32x4x2_t *o2, int32x4x2_t *o3) {
  // Perform a 4 x 8 matrix transpose by building on top of the existing 4 x 4
  // matrix transpose implementation:
  // [ A ]^T => [ A^T B^T ]
  // [ B ]

  transpose_elems_inplace_s32_4x4(&a0, &a1, &a2, &a3);  // A^T
  transpose_elems_inplace_s32_4x4(&a4, &a5, &a6, &a7);  // B^T

  o0->val[0] = a0;
  o1->val[0] = a1;
  o2->val[0] = a2;
  o3->val[0] = a3;

  o0->val[1] = a4;
  o1->val[1] = a5;
  o2->val[1] = a6;
  o3->val[1] = a7;
}

static INLINE void transpose_elems_inplace_s32_8x8(
    int32x4x2_t *a0, int32x4x2_t *a1, int32x4x2_t *a2, int32x4x2_t *a3,
    int32x4x2_t *a4, int32x4x2_t *a5, int32x4x2_t *a6, int32x4x2_t *a7) {
  // Perform an 8 x 8 matrix transpose by building on top of the existing 4 x 4
  // matrix transpose implementation:
  // [ A B ]^T => [ A^T C^T ]
  // [ C D ]      [ B^T D^T ]

  int32x4_t q0_v1 = a0->val[0];
  int32x4_t q0_v2 = a1->val[0];
  int32x4_t q0_v3 = a2->val[0];
  int32x4_t q0_v4 = a3->val[0];

  int32x4_t q1_v1 = a0->val[1];
  int32x4_t q1_v2 = a1->val[1];
  int32x4_t q1_v3 = a2->val[1];
  int32x4_t q1_v4 = a3->val[1];

  int32x4_t q2_v1 = a4->val[0];
  int32x4_t q2_v2 = a5->val[0];
  int32x4_t q2_v3 = a6->val[0];
  int32x4_t q2_v4 = a7->val[0];

  int32x4_t q3_v1 = a4->val[1];
  int32x4_t q3_v2 = a5->val[1];
  int32x4_t q3_v3 = a6->val[1];
  int32x4_t q3_v4 = a7->val[1];

  transpose_elems_inplace_s32_4x4(&q0_v1, &q0_v2, &q0_v3, &q0_v4);  // A^T
  transpose_elems_inplace_s32_4x4(&q1_v1, &q1_v2, &q1_v3, &q1_v4);  // B^T
  transpose_elems_inplace_s32_4x4(&q2_v1, &q2_v2, &q2_v3, &q2_v4);  // C^T
  transpose_elems_inplace_s32_4x4(&q3_v1, &q3_v2, &q3_v3, &q3_v4);  // D^T

  a0->val[0] = q0_v1;
  a1->val[0] = q0_v2;
  a2->val[0] = q0_v3;
  a3->val[0] = q0_v4;

  a0->val[1] = q2_v1;
  a1->val[1] = q2_v2;
  a2->val[1] = q2_v3;
  a3->val[1] = q2_v4;

  a4->val[0] = q1_v1;
  a5->val[0] = q1_v2;
  a6->val[0] = q1_v3;
  a7->val[0] = q1_v4;

  a4->val[1] = q3_v1;
  a5->val[1] = q3_v2;
  a6->val[1] = q3_v3;
  a7->val[1] = q3_v4;
}

static INLINE void transpose_arrays_s16_4x4(const int16x4_t *const in,
                                            int16x4_t *const out) {
  int16x4_t a0 = in[0];
  int16x4_t a1 = in[1];
  int16x4_t a2 = in[2];
  int16x4_t a3 = in[3];

  transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);

  out[0] = a0;
  out[1] = a1;
  out[2] = a2;
  out[3] = a3;
}

static INLINE void transpose_arrays_s16_4x8(const int16x4_t *const in,
                                            int16x8_t *const out) {
#if AOM_ARCH_AARCH64
  const int16x8_t a0 = vzip1q_s16(vcombine_s16(in[0], vdup_n_s16(0)),
                                  vcombine_s16(in[1], vdup_n_s16(0)));
  const int16x8_t a1 = vzip1q_s16(vcombine_s16(in[2], vdup_n_s16(0)),
                                  vcombine_s16(in[3], vdup_n_s16(0)));
  const int16x8_t a2 = vzip1q_s16(vcombine_s16(in[4], vdup_n_s16(0)),
                                  vcombine_s16(in[5], vdup_n_s16(0)));
  const int16x8_t a3 = vzip1q_s16(vcombine_s16(in[6], vdup_n_s16(0)),
                                  vcombine_s16(in[7], vdup_n_s16(0)));
#else
  int16x4x2_t temp;
  temp = vzip_s16(in[0], in[1]);
  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[2], in[3]);
  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[4], in[5]);
  const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[6], in[7]);
  const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
#endif

  const int32x4x2_t b02 =
      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
  const int32x4x2_t b13 =
      vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));

#if AOM_ARCH_AARCH64
  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
                                            vreinterpretq_s64_s32(b13.val[0])));
  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
                                            vreinterpretq_s64_s32(b13.val[0])));
  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
                                            vreinterpretq_s64_s32(b13.val[1])));
  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
                                            vreinterpretq_s64_s32(b13.val[1])));
#else
  out[0] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
  out[2] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
  out[1] = vreinterpretq_s16_s32(
      vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
  out[3] = vreinterpretq_s16_s32(
      vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
#endif
}

static INLINE void transpose_arrays_s16_8x4(const int16x8_t *const in,
                                            int16x4_t *const out) {
  // Swap 16 bit elements. Goes from:
  // in[0]: 00 01 02 03 04 05 06 07
  // in[1]: 10 11 12 13 14 15 16 17
  // in[2]: 20 21 22 23 24 25 26 27
  // in[3]: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
  const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[0]),
                                    vreinterpretq_u32_s16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[1]),
                                    vreinterpretq_u32_s16(b1.val[1]));

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  // out[4]: 04 14 24 34
  // out[5]: 05 15 25 35
  // out[6]: 06 16 26 36
  // out[7]: 07 17 27 37

  out[0] = vget_low_s16(vreinterpretq_s16_u32(c0.val[0]));
  out[1] = vget_low_s16(vreinterpretq_s16_u32(c1.val[0]));
  out[2] = vget_low_s16(vreinterpretq_s16_u32(c0.val[1]));
  out[3] = vget_low_s16(vreinterpretq_s16_u32(c1.val[1]));
  out[4] = vget_high_s16(vreinterpretq_s16_u32(c0.val[0]));
  out[5] = vget_high_s16(vreinterpretq_s16_u32(c1.val[0]));
  out[6] = vget_high_s16(vreinterpretq_s16_u32(c0.val[1]));
  out[7] = vget_high_s16(vreinterpretq_s16_u32(c1.val[1]));
}

#endif  // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_