/*
 * Copyright (c) 2021, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 3-Clause Clear License
 * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
 * License was not distributed with this source code in the LICENSE file, you
 * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
 * Alliance for Open Media Patent License 1.0 was not distributed with this
 * source code in the PATENTS file, you can obtain it at
 * aomedia.org/license/patent-license/.
 */

#include <arm_neon.h>

#include "common/tools_common.h"

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"

22//------------------------------------------------------------------------------
23// DC 4x4
24
25// 'do_above' and 'do_left' facilitate branch removal when inlined.
26static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
27 const uint8_t *left, int do_above, int do_left) {
28 uint16x8_t sum_top;
29 uint16x8_t sum_left;
30 uint8x8_t dc0;
31
32 if (do_above) {
33 const uint8x8_t A = vld1_u8(above); // top row
34 const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
35 const uint16x4_t p1 = vpadd_u16(p0, p0);
36 sum_top = vcombine_u16(p1, p1);
37 }
38
39 if (do_left) {
40 const uint8x8_t L = vld1_u8(left); // left border
41 const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
42 const uint16x4_t p1 = vpadd_u16(p0, p0);
43 sum_left = vcombine_u16(p1, p1);
44 }
45
46 if (do_above && do_left) {
47 const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
48 dc0 = vrshrn_n_u16(sum, 3);
49 } else if (do_above) {
50 dc0 = vrshrn_n_u16(sum_top, 2);
51 } else if (do_left) {
52 dc0 = vrshrn_n_u16(sum_left, 2);
53 } else {
54 dc0 = vdup_n_u8(0x80);
55 }
56
57 {
58 const uint8x8_t dc = vdup_lane_u8(dc0, 0);
59 int i;
60 for (i = 0; i < 4; ++i) {
61 vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
62 }
63 }
64}
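
// For reference, dc_4x4() above mirrors this scalar computation (a simplified
// sketch, not part of the build): the rounding shift is 3 when both borders
// contribute (8 samples) and 2 when only one border is used (4 samples).
//
//   int sum = 0;
//   if (do_above) for (int i = 0; i < 4; ++i) sum += above[i];
//   if (do_left) for (int i = 0; i < 4; ++i) sum += left[i];
//   const int count = 4 * (do_above + do_left);
//   const uint8_t dc = count ? (uint8_t)((sum + (count >> 1)) / count) : 0x80;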
65
void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
68 dc_4x4(dst, stride, above, left, 1, 1);
69}
70
void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
73 (void)above;
74 dc_4x4(dst, stride, NULL, left, 0, 1);
75}
76
void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
79 (void)left;
80 dc_4x4(dst, stride, above, NULL, 1, 0);
81}
82
void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
85 (void)above;
86 (void)left;
87 dc_4x4(dst, stride, NULL, NULL, 0, 0);
88}
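
// Example usage (a sketch; `pred`, `above_row` and `left_col` are
// hypothetical caller-side buffers holding the block and its reconstructed
// border samples):
//   uint8_t pred[4 * 4];
//   aom_dc_predictor_4x4_neon(pred, /*stride=*/4, above_row, left_col);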
89
90//------------------------------------------------------------------------------
91// DC 8x8
92
93// 'do_above' and 'do_left' facilitate branch removal when inlined.
94static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
95 const uint8_t *left, int do_above, int do_left) {
96 uint16x8_t sum_top;
97 uint16x8_t sum_left;
98 uint8x8_t dc0;
99
100 if (do_above) {
101 const uint8x8_t A = vld1_u8(above); // top row
102 const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
103 const uint16x4_t p1 = vpadd_u16(p0, p0);
104 const uint16x4_t p2 = vpadd_u16(p1, p1);
105 sum_top = vcombine_u16(p2, p2);
106 }
107
108 if (do_left) {
109 const uint8x8_t L = vld1_u8(left); // left border
110 const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
111 const uint16x4_t p1 = vpadd_u16(p0, p0);
112 const uint16x4_t p2 = vpadd_u16(p1, p1);
113 sum_left = vcombine_u16(p2, p2);
114 }
115
116 if (do_above && do_left) {
117 const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
118 dc0 = vrshrn_n_u16(sum, 4);
119 } else if (do_above) {
120 dc0 = vrshrn_n_u16(sum_top, 3);
121 } else if (do_left) {
122 dc0 = vrshrn_n_u16(sum_left, 3);
123 } else {
124 dc0 = vdup_n_u8(0x80);
125 }
126
127 {
128 const uint8x8_t dc = vdup_lane_u8(dc0, 0);
129 int i;
130 for (i = 0; i < 8; ++i) {
131 vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
132 }
133 }
134}
135
void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
138 dc_8x8(dst, stride, above, left, 1, 1);
139}
140
void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
143 (void)above;
144 dc_8x8(dst, stride, NULL, left, 0, 1);
145}
146
void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
149 (void)left;
150 dc_8x8(dst, stride, above, NULL, 1, 0);
151}
152
void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
155 (void)above;
156 (void)left;
157 dc_8x8(dst, stride, NULL, NULL, 0, 0);
158}
159
160//------------------------------------------------------------------------------
161// DC 16x16
162
163// 'do_above' and 'do_left' facilitate branch removal when inlined.
164static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
165 const uint8_t *above, const uint8_t *left,
166 int do_above, int do_left) {
167 uint16x8_t sum_top;
168 uint16x8_t sum_left;
169 uint8x8_t dc0;
170
171 if (do_above) {
172 const uint8x16_t A = vld1q_u8(above); // top row
173 const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
174 const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
175 const uint16x4_t p2 = vpadd_u16(p1, p1);
176 const uint16x4_t p3 = vpadd_u16(p2, p2);
177 sum_top = vcombine_u16(p3, p3);
178 }
179
180 if (do_left) {
181 const uint8x16_t L = vld1q_u8(left); // left row
182 const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left
183 const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
184 const uint16x4_t p2 = vpadd_u16(p1, p1);
185 const uint16x4_t p3 = vpadd_u16(p2, p2);
186 sum_left = vcombine_u16(p3, p3);
187 }
188
189 if (do_above && do_left) {
190 const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
191 dc0 = vrshrn_n_u16(sum, 5);
192 } else if (do_above) {
193 dc0 = vrshrn_n_u16(sum_top, 4);
194 } else if (do_left) {
195 dc0 = vrshrn_n_u16(sum_left, 4);
196 } else {
197 dc0 = vdup_n_u8(0x80);
198 }
199
200 {
201 const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
202 int i;
203 for (i = 0; i < 16; ++i) {
204 vst1q_u8(dst + i * stride, dc);
205 }
206 }
207}
208
void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
211 dc_16x16(dst, stride, above, left, 1, 1);
212}
213
void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
216 const uint8_t *left) {
217 (void)above;
218 dc_16x16(dst, stride, NULL, left, 0, 1);
219}
220
void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
223 const uint8_t *left) {
224 (void)left;
225 dc_16x16(dst, stride, above, NULL, 1, 0);
226}
227
void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
230 const uint8_t *left) {
231 (void)above;
232 (void)left;
233 dc_16x16(dst, stride, NULL, NULL, 0, 0);
234}
235
236//------------------------------------------------------------------------------
237// DC 32x32
238
239// 'do_above' and 'do_left' facilitate branch removal when inlined.
240static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
241 const uint8_t *above, const uint8_t *left,
242 int do_above, int do_left) {
243 uint16x8_t sum_top;
244 uint16x8_t sum_left;
245 uint8x8_t dc0;
246
247 if (do_above) {
248 const uint8x16_t A0 = vld1q_u8(above); // top row
249 const uint8x16_t A1 = vld1q_u8(above + 16);
250 const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top
251 const uint16x8_t p1 = vpaddlq_u8(A1);
252 const uint16x8_t p2 = vaddq_u16(p0, p1);
253 const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
254 const uint16x4_t p4 = vpadd_u16(p3, p3);
255 const uint16x4_t p5 = vpadd_u16(p4, p4);
256 sum_top = vcombine_u16(p5, p5);
257 }
258
259 if (do_left) {
260 const uint8x16_t L0 = vld1q_u8(left); // left row
261 const uint8x16_t L1 = vld1q_u8(left + 16);
262 const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left
263 const uint16x8_t p1 = vpaddlq_u8(L1);
264 const uint16x8_t p2 = vaddq_u16(p0, p1);
265 const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
266 const uint16x4_t p4 = vpadd_u16(p3, p3);
267 const uint16x4_t p5 = vpadd_u16(p4, p4);
268 sum_left = vcombine_u16(p5, p5);
269 }
270
271 if (do_above && do_left) {
272 const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
273 dc0 = vrshrn_n_u16(sum, 6);
274 } else if (do_above) {
275 dc0 = vrshrn_n_u16(sum_top, 5);
276 } else if (do_left) {
277 dc0 = vrshrn_n_u16(sum_left, 5);
278 } else {
279 dc0 = vdup_n_u8(0x80);
280 }
281
282 {
283 const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
284 int i;
285 for (i = 0; i < 32; ++i) {
286 vst1q_u8(dst + i * stride, dc);
287 vst1q_u8(dst + i * stride + 16, dc);
288 }
289 }
290}
291
void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
294 dc_32x32(dst, stride, above, left, 1, 1);
295}
296
void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
299 const uint8_t *left) {
300 (void)above;
301 dc_32x32(dst, stride, NULL, left, 0, 1);
302}
303
void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
306 const uint8_t *left) {
307 (void)left;
308 dc_32x32(dst, stride, above, NULL, 1, 0);
309}
310
void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
313 const uint8_t *left) {
314 (void)above;
315 (void)left;
316 dc_32x32(dst, stride, NULL, NULL, 0, 0);
317}
318
319// -----------------------------------------------------------------------------
320
void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
323 const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
324 const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
325 const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
326 const uint32x2_t zero = vdup_n_u32(0);
327 const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
328 const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
329 const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
330 const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
331 const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
332 const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
333 const uint8_t D = vget_lane_u8(XABCD_u8, 4);
334 const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
335 const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
336 const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
337 const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
338 const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
339 const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
340 const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
341 const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
342 const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
343 vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
344 vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
345 vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
346 vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
347}
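
// Each d135 output pixel above is formed from three consecutive samples of
// the packed border LKJIXABCD, roughly the (1, 2, 1) diagonal smoothing
//   out = ((lo + hi) / 2 + mid + 1) / 2
// implemented with vhadd_u8 (halving add) on the outer taps followed by
// vrhadd_u8 (rounding halving add) with the middle tap.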
348
void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
351 int i;
352 uint32x2_t d0u32 = vdup_n_u32(0);
353 (void)left;
354
355 d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
356 for (i = 0; i < 4; i++, dst += stride)
357 vst1_lane_u32((uint32_t *)dst, d0u32, 0);
358}
359
void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
362 int i;
363 uint8x8_t d0u8 = vdup_n_u8(0);
364 (void)left;
365
366 d0u8 = vld1_u8(above);
367 for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
368}
369
void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
372 int i;
373 uint8x16_t q0u8 = vdupq_n_u8(0);
374 (void)left;
375
376 q0u8 = vld1q_u8(above);
377 for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
378}
379
void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
382 int i;
383 uint8x16_t q0u8 = vdupq_n_u8(0);
384 uint8x16_t q1u8 = vdupq_n_u8(0);
385 (void)left;
386
387 q0u8 = vld1q_u8(above);
388 q1u8 = vld1q_u8(above + 16);
389 for (i = 0; i < 32; i++, dst += stride) {
390 vst1q_u8(dst, q0u8);
391 vst1q_u8(dst + 16, q1u8);
392 }
393}
394
void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
397 uint8x8_t d0u8 = vdup_n_u8(0);
398 uint32x2_t d1u32 = vdup_n_u32(0);
399 (void)above;
400
401 d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
402
403 d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
404 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
405 dst += stride;
406 d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
407 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
408 dst += stride;
409 d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
410 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
411 dst += stride;
412 d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
413 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
414}
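
// All of the h (horizontal) predictors reduce to the scalar loop below; the
// NEON versions simply broadcast each left-column byte across a full row:
//   for (int r = 0; r < bh; ++r) memset(dst + r * stride, left[r], bw);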
415
void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
418 uint8x8_t d0u8 = vdup_n_u8(0);
419 uint64x1_t d1u64 = vdup_n_u64(0);
420 (void)above;
421
422 d1u64 = vld1_u64((const uint64_t *)left);
423
424 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
425 vst1_u8(dst, d0u8);
426 dst += stride;
427 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
428 vst1_u8(dst, d0u8);
429 dst += stride;
430 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
431 vst1_u8(dst, d0u8);
432 dst += stride;
433 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
434 vst1_u8(dst, d0u8);
435 dst += stride;
436 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
437 vst1_u8(dst, d0u8);
438 dst += stride;
439 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
440 vst1_u8(dst, d0u8);
441 dst += stride;
442 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
443 vst1_u8(dst, d0u8);
444 dst += stride;
445 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
446 vst1_u8(dst, d0u8);
447}
448
void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
451 int j;
452 uint8x8_t d2u8 = vdup_n_u8(0);
453 uint8x16_t q0u8 = vdupq_n_u8(0);
454 uint8x16_t q1u8 = vdupq_n_u8(0);
455 (void)above;
456
457 q1u8 = vld1q_u8(left);
458 d2u8 = vget_low_u8(q1u8);
459 for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
460 q0u8 = vdupq_lane_u8(d2u8, 0);
461 vst1q_u8(dst, q0u8);
462 dst += stride;
463 q0u8 = vdupq_lane_u8(d2u8, 1);
464 vst1q_u8(dst, q0u8);
465 dst += stride;
466 q0u8 = vdupq_lane_u8(d2u8, 2);
467 vst1q_u8(dst, q0u8);
468 dst += stride;
469 q0u8 = vdupq_lane_u8(d2u8, 3);
470 vst1q_u8(dst, q0u8);
471 dst += stride;
472 q0u8 = vdupq_lane_u8(d2u8, 4);
473 vst1q_u8(dst, q0u8);
474 dst += stride;
475 q0u8 = vdupq_lane_u8(d2u8, 5);
476 vst1q_u8(dst, q0u8);
477 dst += stride;
478 q0u8 = vdupq_lane_u8(d2u8, 6);
479 vst1q_u8(dst, q0u8);
480 dst += stride;
481 q0u8 = vdupq_lane_u8(d2u8, 7);
482 vst1q_u8(dst, q0u8);
483 dst += stride;
484 }
485}
486
void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
489 int j, k;
490 uint8x8_t d2u8 = vdup_n_u8(0);
491 uint8x16_t q0u8 = vdupq_n_u8(0);
492 uint8x16_t q1u8 = vdupq_n_u8(0);
493 (void)above;
494
495 for (k = 0; k < 2; k++, left += 16) {
496 q1u8 = vld1q_u8(left);
497 d2u8 = vget_low_u8(q1u8);
498 for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
499 q0u8 = vdupq_lane_u8(d2u8, 0);
500 vst1q_u8(dst, q0u8);
501 vst1q_u8(dst + 16, q0u8);
502 dst += stride;
503 q0u8 = vdupq_lane_u8(d2u8, 1);
504 vst1q_u8(dst, q0u8);
505 vst1q_u8(dst + 16, q0u8);
506 dst += stride;
507 q0u8 = vdupq_lane_u8(d2u8, 2);
508 vst1q_u8(dst, q0u8);
509 vst1q_u8(dst + 16, q0u8);
510 dst += stride;
511 q0u8 = vdupq_lane_u8(d2u8, 3);
512 vst1q_u8(dst, q0u8);
513 vst1q_u8(dst + 16, q0u8);
514 dst += stride;
515 q0u8 = vdupq_lane_u8(d2u8, 4);
516 vst1q_u8(dst, q0u8);
517 vst1q_u8(dst + 16, q0u8);
518 dst += stride;
519 q0u8 = vdupq_lane_u8(d2u8, 5);
520 vst1q_u8(dst, q0u8);
521 vst1q_u8(dst + 16, q0u8);
522 dst += stride;
523 q0u8 = vdupq_lane_u8(d2u8, 6);
524 vst1q_u8(dst, q0u8);
525 vst1q_u8(dst + 16, q0u8);
526 dst += stride;
527 q0u8 = vdupq_lane_u8(d2u8, 7);
528 vst1q_u8(dst, q0u8);
529 vst1q_u8(dst + 16, q0u8);
530 dst += stride;
531 }
532 }
533}

static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
536 const uint16_t *above,
537 const uint16_t *left) {
538 assert(bw >= 4);
539 assert(IS_POWER_OF_TWO(bw));
540 int expected_dc, sum = 0;
541 const int count = bw * 2;
542 uint32x4_t sum_q = vdupq_n_u32(0);
543 uint32x2_t sum_d;
544 uint16_t *dst_1;
545 if (bw >= 8) {
546 for (int i = 0; i < bw; i += 8) {
547 sum_q = vpadalq_u16(sum_q, vld1q_u16(above));
548 sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
549 above += 8;
550 left += 8;
551 }
552 sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
553 sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
554 expected_dc = (sum + (count >> 1)) / count;
555 const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);
556 for (int r = 0; r < bw; r++) {
557 dst_1 = dst;
558 for (int i = 0; i < bw; i += 8) {
559 vst1q_u16(dst_1, dc);
560 dst_1 += 8;
561 }
562 dst += stride;
563 }
564 } else { // 4x4
565 sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));
566 sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
567 sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
568 expected_dc = (sum + (count >> 1)) / count;
569 const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
570 for (int r = 0; r < bw; r++) {
571 vst1_u16(dst, dc);
572 dst += stride;
573 }
574 }
575}
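
// Scalar reference for the high-bitdepth DC path above (sketch): the block is
// square, so each border contributes bw samples and
//   expected_dc = (sum(above[0..bw-1]) + sum(left[0..bw-1]) + bw) / (2 * bw)
// which matches the vaddl_u16 / vpadalq_u16 accumulation and rounding above.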
576
577#define intra_pred_highbd_sized_neon(type, width) \
578 void aom_highbd_##type##_predictor_##width##x##width##_neon( \
579 uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
580 const uint16_t *left, int bd) { \
581 (void)bd; \
582 highbd_##type##_predictor(dst, stride, width, above, left); \
583 }
584
585#define intra_pred_square(type) \
586 intra_pred_highbd_sized_neon(type, 4); \
587 intra_pred_highbd_sized_neon(type, 8); \
588 intra_pred_highbd_sized_neon(type, 16); \
589 intra_pred_highbd_sized_neon(type, 32); \
590 intra_pred_highbd_sized_neon(type, 64);
591
592intra_pred_square(dc);
593#undef intra_pred_square
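
// As an illustration, intra_pred_square(dc) above expands to the five square
// wrappers aom_highbd_dc_predictor_{4x4,8x8,16x16,32x32,64x64}_neon(), each
// of which discards the bit-depth argument and forwards to
// highbd_dc_predictor() with the matching width.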

/* ---------------------P R E D I C T I O N Z 1--------------------------- */
596
597static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
598 { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
599 { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 },
600 { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 },
601 { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 },
602 { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 },
603 { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 },
604 { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
605 { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
606};
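
// Each EvenOddMaskx[shift] row is an index vector for vtbl2_u8: it
// de-interleaves a 16-byte load into its even samples (low half) and odd
// samples (high half), with the leading `shift` lanes pinned to index 0 so
// that positions before the first valid edge sample are padded. The z2
// kernels use it when the above edge is upsampled and therefore stored as
// interleaved even/odd pairs.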
607
608// Low bit depth functions
609static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
610 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
611 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
612 { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
613 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
614 { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
615 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
616 { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
617 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
618 { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
619 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
620 { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
621 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
622 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
623 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
624 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
625 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
626 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
627 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
628 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
629 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
630 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
631 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
632 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
633 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
634 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
635 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
636 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
637 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
638 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
639 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
640 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
641 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
642 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
643 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
644 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
645 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
646 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
647 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
648 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
649 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
650 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
651 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
652 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
653 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
654 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
655 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
656 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
657 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
658 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
659 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
660 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
661 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
662 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
663 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
664 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
665 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
666 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
667 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
668 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
669 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
670 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
671 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
672 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
673 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
674 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
675 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
676 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
677 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
678 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
679 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
680 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
681 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
682 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
683 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
684 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
685 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
686 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
687 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
688 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
689 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
690 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
691 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
692 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
693 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
694 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
695 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
696 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
697 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
698 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
699};
700
701/* clang-format on */
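
// BaseMask[k] has 0xff in its first k bytes and 0x00 in the rest. The z1/z2
// kernels use it as a per-byte select between computed pixels and the
// replicated last edge sample, e.g. for an 8-wide row (`pred` standing in
// for the interpolated result vector):
//   uint8x8_t out = vorr_u8(vand_u8(mask, pred), vbic_u8(a_mbase_x, mask));
// which keeps `pred` where the mask is set and a_mbase_x elsewhere.
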
702static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64(
703 int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above,
704 int dx) {
705 const int frac_bits = 6 - upsample_above;
706 const int max_base_x = ((W + H) - 1) << upsample_above;
707
708 assert(dx > 0);
709 // pre-filter above pixels
710 // store in temp buffers:
711 // above[x] * 32 + 16
712 // above[x+1] - above[x]
713 // final pixels will be calculated as:
714 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
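  //
  // Equivalently, each output pixel is a two-tap interpolation between
  // neighbouring edge samples (scalar sketch, ignoring upsampling):
  //   int v = above[base] * (32 - shift) + above[base + 1] * shift;
  //   pixel = (v + 16) >> 5;
  // with shift = (x & 0x3f) >> 1 giving the 5-bit sub-pel position.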
715
716 uint16x8_t a0, a1;
717 uint16x8_t diff, a32;
718 uint16x8_t a16;
719 uint8x8_t a_mbase_x;
720
721 a16 = vdupq_n_u16(16);
722 a_mbase_x = vdup_n_u8(above[max_base_x]);
723 uint16x8_t v_32 = vdupq_n_u16(32);
724 int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
725 uint16x8_t c3f = vdupq_n_u16(0x3f);
726
727 int x = dx;
728 for (int r = 0; r < W; r++) {
729 uint16x8_t res;
730 uint16x8_t shift;
731 uint8x8x2_t v_tmp_a0_128;
732
733 int base = x >> frac_bits;
734 int base_max_diff = (max_base_x - base) >> upsample_above;
735 if (base_max_diff <= 0) {
736 for (int i = r; i < W; ++i) {
737 dst[i] = a_mbase_x; // save 4 values
738 }
739 return;
740 }
741
742 if (base_max_diff > H) base_max_diff = H;
743
744 if (upsample_above) {
745 v_tmp_a0_128 = vld2_u8(above + base);
746 shift = vshrq_n_u16(
747 vandq_u16(vshlq_u16(vdupq_n_u16(x), v_upsample_above), c3f), 1);
748 } else {
749 v_tmp_a0_128.val[0] = vld1_u8(above + base);
750 v_tmp_a0_128.val[1] = vld1_u8(above + base + 1);
751 shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
752 }
753 a0 = vmovl_u8(v_tmp_a0_128.val[0]);
754 a1 = vmovl_u8(v_tmp_a0_128.val[1]);
755 diff = vsubq_u16(a1, a0); // a[x+1] - a[x]
756 a32 = vmlaq_u16(a16, a0, v_32); // a[x] * 32 + 16
757 res = vmlaq_u16(a32, diff, shift);
758
759 uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]);
760 dst[r] =
761 vorr_u8(vand_u8(mask, vshrn_n_u16(res, 5)), vbic_u8(a_mbase_x, mask));
762
763 x += dx;
764 }
765}
766
767static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
768 const uint8_t *above, int upsample_above,
769 int dx) {
770 uint8x8_t dstvec[16];
771
772 dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above,
773 dx);
774 for (int i = 0; i < N; i++) {
775 vst1_lane_u32((uint32_t *)(dst + stride * i),
776 vreinterpret_u32_u8(dstvec[i]), 0);
777 }
778}
779
780static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
781 const uint8_t *above, int upsample_above,
782 int dx) {
783 uint8x8_t dstvec[32];
784
785 dr_prediction_z1_HxW_internal_neon_64(8, N, dstvec, above, upsample_above,
786 dx);
787 for (int i = 0; i < N; i++) {
788 vst1_u8(dst + stride * i, dstvec[i]);
789 }
790}
791
792static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon(
793 int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above,
794 int dx) {
795 const int frac_bits = 6 - upsample_above;
796 const int max_base_x = ((W + H) - 1) << upsample_above;
797
798 assert(dx > 0);
799 // pre-filter above pixels
800 // store in temp buffers:
801 // above[x] * 32 + 16
802 // above[x+1] - above[x]
803 // final pixels will be calculated as:
804 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
805
806 uint8x16x2_t a0, a1;
807 uint16x8x2_t diff, a32;
808 uint16x8_t a16, c3f;
809 uint8x16_t a_mbase_x;
810
811 a16 = vdupq_n_u16(16);
812 a_mbase_x = vdupq_n_u8(above[max_base_x]);
813 c3f = vdupq_n_u16(0x3f);
814 uint16x8_t v_32 = vdupq_n_u16(32);
815 uint8x16_t v_zero = vdupq_n_u8(0);
816 int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
817
818 int x = dx;
819 for (int r = 0; r < W; r++) {
820 uint16x8x2_t res;
821 uint16x8_t shift;
822 uint8x16_t a0_128, a1_128;
823
824 int base = x >> frac_bits;
825 int base_max_diff = (max_base_x - base) >> upsample_above;
826 if (base_max_diff <= 0) {
827 for (int i = r; i < W; ++i) {
        dst[i] = a_mbase_x;  // save 16 values
829 }
830 return;
831 }
832
833 if (base_max_diff > H) base_max_diff = H;
834
835 if (upsample_above) {
836 uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base);
837 a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]);
838 a1_128 = vextq_u8(a0_128, v_zero, 8);
839 shift = vshrq_n_u16(
840 vandq_u16(vshlq_u16(vdupq_n_u16(x), v_upsample_above), c3f), 1);
841 } else {
842 a0_128 = vld1q_u8(above + base);
843 a1_128 = vld1q_u8(above + base + 1);
844 shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
845 }
846 a0 = vzipq_u8(a0_128, v_zero);
847 a1 = vzipq_u8(a1_128, v_zero);
848 diff.val[0] = vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
849 vreinterpretq_u16_u8(a0.val[0])); // a[x+1] - a[x]
850 diff.val[1] = vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
851 vreinterpretq_u16_u8(a0.val[1])); // a[x+1] - a[x]
852 a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
853 v_32); // a[x] * 32 + 16
854 a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
855 v_32); // a[x] * 32 + 16
856 res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
857 res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
858 uint8x16_t v_temp =
859 vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
860
861 uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]);
862 dst[r] = vorrq_u8(vandq_u8(mask, v_temp), vbicq_u8(a_mbase_x, mask));
863
864 x += dx;
865 }
866}
867
868static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
869 const uint8_t *above, int upsample_above,
870 int dx) {
871 uint8x16_t dstvec[64];
872
873 dr_prediction_z1_HxW_internal_neon(16, N, dstvec, above, upsample_above, dx);
874 for (int i = 0; i < N; i++) {
875 vst1q_u8(dst + stride * i, dstvec[i]);
876 }
877}
878
879static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon(
880 int N, uint8x16x2_t *dstvec, const uint8_t *above, int upsample_above,
881 int dx) {
882 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
883 (void)upsample_above;
884 const int frac_bits = 6;
885 const int max_base_x = ((32 + N) - 1);
886
887 // pre-filter above pixels
888 // store in temp buffers:
889 // above[x] * 32 + 16
890 // above[x+1] - above[x]
891 // final pixels will be calculated as:
892 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
893
894 uint8x16_t a_mbase_x;
895 uint8x16x2_t a0, a1;
896 uint16x8x2_t diff, a32;
897 uint16x8_t a16, c3f;
898
899 a_mbase_x = vdupq_n_u8(above[max_base_x]);
900 a16 = vdupq_n_u16(16);
901 c3f = vdupq_n_u16(0x3f);
902 uint16x8_t v_32 = vdupq_n_u16(32);
903 uint8x16_t v_zero = vdupq_n_u8(0);
904
905 int x = dx;
906 for (int r = 0; r < N; r++) {
907 uint16x8x2_t res;
908 uint8x16_t res16[2];
909 uint8x16_t a0_128, a1_128;
910
911 int base = x >> frac_bits;
912 int base_max_diff = (max_base_x - base);
913 if (base_max_diff <= 0) {
914 for (int i = r; i < N; ++i) {
915 dstvec[i].val[0] = a_mbase_x; // save 32 values
916 dstvec[i].val[1] = a_mbase_x;
917 }
918 return;
919 }
920 if (base_max_diff > 32) base_max_diff = 32;
921
922 uint16x8_t shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
923
924 for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
925 int mdiff = base_max_diff - j;
926 if (mdiff <= 0) {
927 res16[jj] = a_mbase_x;
928 } else {
929 a0_128 = vld1q_u8(above + base + j);
930 a1_128 = vld1q_u8(above + base + j + 1);
931 a0 = vzipq_u8(a0_128, v_zero);
932 a1 = vzipq_u8(a1_128, v_zero);
933 diff.val[0] =
934 vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
935 vreinterpretq_u16_u8(a0.val[0])); // a[x+1] - a[x]
936 diff.val[1] =
937 vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
938 vreinterpretq_u16_u8(a0.val[1])); // a[x+1] - a[x]
939 a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
940 v_32); // a[x] * 32 + 16
941 a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
942 v_32); // a[x] * 32 + 16
943 res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
944 res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
945
946 res16[jj] =
947 vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
948 }
949 }
950
951 uint8x16x2_t mask;
952
953 mask.val[0] = vld1q_u8(BaseMask[base_max_diff]);
954 mask.val[1] = vld1q_u8(BaseMask[base_max_diff] + 16);
955 dstvec[r].val[0] = vorrq_u8(vandq_u8(mask.val[0], res16[0]),
956 vbicq_u8(a_mbase_x, mask.val[0]));
957 dstvec[r].val[1] = vorrq_u8(vandq_u8(mask.val[1], res16[1]),
958 vbicq_u8(a_mbase_x, mask.val[1]));
959 x += dx;
960 }
961}
962
963static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
964 const uint8_t *above, int upsample_above,
965 int dx) {
966 uint8x16x2_t dstvec[64];
967
968 dr_prediction_z1_32xN_internal_neon(N, dstvec, above, upsample_above, dx);
969 for (int i = 0; i < N; i++) {
970 vst1q_u8(dst + stride * i, dstvec[i].val[0]);
971 vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]);
972 }
973}
974
975static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
976 const uint8_t *above, int upsample_above,
977 int dx) {
978 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
979 (void)upsample_above;
980 const int frac_bits = 6;
981 const int max_base_x = ((64 + N) - 1);
982
983 // pre-filter above pixels
984 // store in temp buffers:
985 // above[x] * 32 + 16
986 // above[x+1] - above[x]
987 // final pixels will be calculated as:
988 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
989
990 uint8x16x2_t a0, a1;
991 uint16x8x2_t a32, diff;
992 uint16x8_t a16, c3f;
993 uint8x16_t a_mbase_x, max_base_x128, mask128;
994
995 a16 = vdupq_n_u16(16);
996 a_mbase_x = vdupq_n_u8(above[max_base_x]);
997 max_base_x128 = vdupq_n_u8(max_base_x);
998 c3f = vdupq_n_u16(0x3f);
999 uint16x8_t v_32 = vdupq_n_u16(32);
1000 uint8x16_t v_zero = vdupq_n_u8(0);
1001 uint8x16_t step = vdupq_n_u8(16);
1002
1003 int x = dx;
1004 for (int r = 0; r < N; r++, dst += stride) {
1005 uint16x8x2_t res;
1006
1007 int base = x >> frac_bits;
1008 if (base >= max_base_x) {
1009 for (int i = r; i < N; ++i) {
1010 vst1q_u8(dst, a_mbase_x);
1011 vst1q_u8(dst + 16, a_mbase_x);
1012 vst1q_u8(dst + 32, a_mbase_x);
1013 vst1q_u8(dst + 48, a_mbase_x);
1014 dst += stride;
1015 }
1016 return;
1017 }
1018
1019 uint16x8_t shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
1020 uint8x16_t a0_128, a1_128, res128;
1021 uint8x16_t base_inc128 =
1022 vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100),
1023 vcreate_u8(0x0F0E0D0C0B0A0908)));
1024
1025 for (int j = 0; j < 64; j += 16) {
1026 int mdif = max_base_x - (base + j);
1027 if (mdif <= 0) {
1028 vst1q_u8(dst + j, a_mbase_x);
1029 } else {
1030 a0_128 = vld1q_u8(above + base + j);
1031 a1_128 = vld1q_u8(above + base + 1 + j);
1032 a0 = vzipq_u8(a0_128, v_zero);
1033 a1 = vzipq_u8(a1_128, v_zero);
1034 diff.val[0] =
1035 vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
1036 vreinterpretq_u16_u8(a0.val[0])); // a[x+1] - a[x]
1037 diff.val[1] =
1038 vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
1039 vreinterpretq_u16_u8(a0.val[1])); // a[x+1] - a[x]
1040 a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
1041 v_32); // a[x] * 32 + 16
1042 a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
1043 v_32); // a[x] * 32 + 16
1044 res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
1045 res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
1046 uint8x16_t v_temp =
1047 vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
1048
1049 mask128 = vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), v_zero);
1050 res128 =
1051 vorrq_u8(vandq_u8(mask128, v_temp), vbicq_u8(a_mbase_x, mask128));
1052 vst1q_u8(dst + j, res128);
1053
1054 base_inc128 = vaddq_u8(base_inc128, step);
1055 }
1056 }
1057 x += dx;
1058 }
1059}
1060
1061// Directional prediction, zone 1: 0 < angle < 90
1062void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
1063 const uint8_t *above, const uint8_t *left,
1064 int upsample_above, int dx, int dy) {
1065 (void)left;
1066 (void)dy;
1067
1068 switch (bw) {
1069 case 4:
1070 dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx);
1071 break;
1072 case 8:
1073 dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx);
1074 break;
1075 case 16:
1076 dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx);
1077 break;
1078 case 32:
1079 dr_prediction_z1_32xN_neon(bh, dst, stride, above, upsample_above, dx);
1080 break;
1081 case 64:
1082 dr_prediction_z1_64xN_neon(bh, dst, stride, above, upsample_above, dx);
1083 break;
1084 default: break;
1085 }
1086 return;
1087}
1088
1089/* ---------------------P R E D I C T I O N Z 2--------------------------- */
1090
1091static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = {
1092 { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1093 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
1094 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
1095 0, 0, 0 },
1096 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1097 0xff, 0xff, 0xff, 0xff }
1098};
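
// LoadMaskz2[i] keeps the first 4 * (i + 1) bytes of a 16-byte load and
// zeroes the rest; dr_prediction_z2_HxW_neon() selects the row with
// offset_diff / 4 so that left-edge samples past the needed range are
// cleared before the table lookup.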
1099
1100static AOM_FORCE_INLINE void vector_shift_x4(uint8x8_t *vec, uint8x8_t *v_zero,
1101 int shift_value) {
1102 switch (shift_value) {
1103 case 1: *vec = vext_u8(*v_zero, *vec, 7); break;
1104 case 2: *vec = vext_u8(*v_zero, *vec, 6); break;
1105 case 3: *vec = vext_u8(*v_zero, *vec, 5); break;
1106 default: break;
1107 }
1108}
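
// vector_shift_x4() (and vector_shuffle() further below) emulate a variable
// byte shift: the vector is moved up by shift_value lanes with zeros shifted
// in at the bottom, e.g. shift_value == 2 turns {a,b,c,d,e,f,g,h} into
// {0,0,a,b,c,d,e,f}. NEON's vext intrinsics require an immediate shift, hence
// the switch over the possible values.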
1109
1110static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride,
1111 const uint8_t *above, const uint8_t *left,
1112 int upsample_above, int upsample_left,
1113 int dx, int dy) {
1114 const int min_base_x = -(1 << upsample_above);
1115 const int min_base_y = -(1 << upsample_left);
1116 const int frac_bits_x = 6 - upsample_above;
1117 const int frac_bits_y = 6 - upsample_left;
1118
1119 assert(dx > 0);
1120 // pre-filter above pixels
1121 // store in temp buffers:
1122 // above[x] * 32 + 16
1123 // above[x+1] - above[x]
1124 // final pixels will be calculated as:
1125 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1126 uint16x8_t a0_x, a1_x, a32, diff;
1127 uint16x8_t v_32 = vdupq_n_u16(32);
1128 uint16x8_t v_zero = vdupq_n_u16(0);
1129 uint16x8_t a16 = vdupq_n_u16(16);
1130
1131 uint8x8_t v_zero_u8 = vdup_n_u8(0);
1132 uint16x4_t v_c3f = vdup_n_u16(0x3f);
1133 uint16x4_t r6 = vcreate_u16(0x00C0008000400000);
1134 int16x4_t v_upsample_left = vdup_n_s16(upsample_left);
1135 int16x4_t v_upsample_above = vdup_n_s16(upsample_above);
1136 int16x4_t v_1234 = vcreate_s16(0x0004000300020001);
1137 int16x4_t dy64 = vdup_n_s16(dy);
1138 int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
1139 int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
1140 int16x4_t v_one = vdup_lane_s16(v_1234, 0);
1141
1142 for (int r = 0; r < N; r++) {
1143 uint16x8_t res, shift;
1144 uint16x4_t ydx;
1145 uint8x8_t resx, resy;
1146 uint16x4x2_t v_shift;
1147
1148 int y = r + 1;
1149 int base_x = (-y * dx) >> frac_bits_x;
1150 int base_shift = 0;
1151 if (base_x < (min_base_x - 1)) {
1152 base_shift = (min_base_x - base_x - 1) >> upsample_above;
1153 }
1154 int base_min_diff =
1155 (min_base_x - base_x + upsample_above) >> upsample_above;
1156 if (base_min_diff > 4) {
1157 base_min_diff = 4;
1158 } else {
1159 if (base_min_diff < 0) base_min_diff = 0;
1160 }
1161
1162 if (base_shift > 3) {
1163 a0_x = v_zero;
1164 a1_x = v_zero;
1165 v_shift.val[0] = vreinterpret_u16_u8(v_zero_u8);
1166 v_shift.val[1] = vreinterpret_u16_u8(v_zero_u8);
1167 } else {
1168 ydx = vdup_n_u16(y * dx);
1169
1170 if (upsample_above) {
1171 uint8x8x2_t v_tmp;
1172 v_tmp.val[0] = vld1_u8(above + base_x + base_shift);
1173 v_tmp.val[1] = vld1_u8(above + base_x + base_shift + 8);
1174 uint8x8_t v_index_low = vld1_u8(EvenOddMaskx[base_shift]);
1175 uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8);
1176 a0_x = vmovl_u8(vtbl2_u8(v_tmp, v_index_low));
1177 a1_x = vmovl_u8(vtbl2_u8(v_tmp, v_index_high));
1178 v_shift.val[0] = vshr_n_u16(
1179 vand_u16(vshl_u16(vsub_u16(r6, ydx), v_upsample_above), v_c3f), 1);
1180 } else {
1181 uint8x8_t v_a0_x64 = vld1_u8(above + base_x + base_shift);
1182 vector_shift_x4(&v_a0_x64, &v_zero_u8, base_shift);
1183 uint8x8_t v_a1_x64 = vext_u8(v_a0_x64, v_zero_u8, 1);
1184 v_shift.val[0] = vshr_n_u16(vand_u16(vsub_u16(r6, ydx), v_c3f), 1);
1185 a0_x = vmovl_u8(v_a0_x64);
1186 a1_x = vmovl_u8(v_a1_x64);
1187 }
1188 }
1189
1190 // y calc
1191 uint8x8_t a0_y, a1_y;
1192 if (base_x < min_base_x) {
1193 DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
1194 int16x4_t v_r6 = vdup_n_s16(r << 6);
1195 int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64);
1196 int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);
1197 uint16x4_t mask64 = vcgt_s16(min_base_y64, base_y_c64);
1198
1199 base_y_c64 = vbic_s16(base_y_c64, vreinterpret_s16_u16(mask64));
1200 vst1_s16(base_y_c, base_y_c64);
1201 a0_y = v_zero_u8;
1202 a0_y = vld1_lane_u8(left + base_y_c[0], a0_y, 0);
1203 a0_y = vld1_lane_u8(left + base_y_c[1], a0_y, 2);
1204 a0_y = vld1_lane_u8(left + base_y_c[2], a0_y, 4);
1205 a0_y = vld1_lane_u8(left + base_y_c[3], a0_y, 6);
1206
1207 base_y_c64 = vadd_s16(base_y_c64, v_one);
1208 vst1_s16(base_y_c, base_y_c64);
1209 a1_y = v_zero_u8;
1210 a1_y = vld1_lane_u8(left + base_y_c[0], a1_y, 0);
1211 a1_y = vld1_lane_u8(left + base_y_c[1], a1_y, 2);
1212 a1_y = vld1_lane_u8(left + base_y_c[2], a1_y, 4);
1213 a1_y = vld1_lane_u8(left + base_y_c[3], a1_y, 6);
1214
1215 if (upsample_left) {
1216 v_shift.val[1] = vshr_n_u16(
1217 vand_u16(vshl_u16(vreinterpret_u16_s16(y_c64), v_upsample_left),
1218 v_c3f),
1219 1);
1220 } else {
1221 v_shift.val[1] =
1222 vshr_n_u16(vand_u16(vreinterpret_u16_s16(y_c64), v_c3f), 1);
1223 }
1224
1225 a0_x = vcombine_u16(vget_low_u16(a0_x), vreinterpret_u16_u8(a0_y));
1226 a1_x = vcombine_u16(vget_low_u16(a1_x), vreinterpret_u16_u8(a1_y));
1227 }
1228 shift = vcombine_u16(v_shift.val[0], v_shift.val[1]);
1229 diff = vsubq_u16(a1_x, a0_x); // a[x+1] - a[x]
1230 a32 = vmlaq_u16(a16, a0_x, v_32); // a[x] * 32 + 16
1231 res = vmlaq_u16(a32, diff, shift);
1232 resx = vshrn_n_u16(res, 5);
1233 resy = vext_u8(resx, v_zero_u8, 4);
1234
1235 uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
1236 uint8x8_t v_resxy = vorr_u8(vand_u8(mask, resy), vbic_u8(resx, mask));
1237 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0);
1238
1239 dst += stride;
1240 }
1241}
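
// In the z2 kernels each output row mixes two sources: columns whose
// projected position still falls on the above row (resx) and columns that
// have crossed the top-left corner and are read from the left column (resy).
// BaseMask[base_min_diff] selects, per byte, which of the two results is
// kept.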
1242
1243static AOM_FORCE_INLINE void vector_shuffle(uint8x16_t *vec, uint8x16_t *vzero,
1244 int shift_value) {
1245 switch (shift_value) {
1246 case 1: *vec = vextq_u8(*vzero, *vec, 15); break;
1247 case 2: *vec = vextq_u8(*vzero, *vec, 14); break;
1248 case 3: *vec = vextq_u8(*vzero, *vec, 13); break;
1249 case 4: *vec = vextq_u8(*vzero, *vec, 12); break;
1250 case 5: *vec = vextq_u8(*vzero, *vec, 11); break;
1251 case 6: *vec = vextq_u8(*vzero, *vec, 10); break;
1252 case 7: *vec = vextq_u8(*vzero, *vec, 9); break;
1253 case 8: *vec = vextq_u8(*vzero, *vec, 8); break;
1254 case 9: *vec = vextq_u8(*vzero, *vec, 7); break;
1255 case 10: *vec = vextq_u8(*vzero, *vec, 6); break;
1256 case 11: *vec = vextq_u8(*vzero, *vec, 5); break;
1257 case 12: *vec = vextq_u8(*vzero, *vec, 4); break;
1258 case 13: *vec = vextq_u8(*vzero, *vec, 3); break;
1259 case 14: *vec = vextq_u8(*vzero, *vec, 2); break;
1260 case 15: *vec = vextq_u8(*vzero, *vec, 1); break;
1261 default: break;
1262 }
1263}
1264
1265static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride,
1266 const uint8_t *above, const uint8_t *left,
1267 int upsample_above, int upsample_left,
1268 int dx, int dy) {
1269 const int min_base_x = -(1 << upsample_above);
1270 const int min_base_y = -(1 << upsample_left);
1271 const int frac_bits_x = 6 - upsample_above;
1272 const int frac_bits_y = 6 - upsample_left;
1273
1274 // pre-filter above pixels
1275 // store in temp buffers:
1276 // above[x] * 32 + 16
1277 // above[x+1] - above[x]
1278 // final pixels will be calculated as:
1279 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1280 uint8x16x2_t a0_x, a1_x;
1281 uint16x8x2_t diff, a32;
1282 uint16x8_t c1234, a16, c3f;
1283 uint8x16_t a0_x128, a1_x128;
1284 int16x8_t min_base_y128, dy128;
1285 uint16x8_t v_32 = vdupq_n_u16(32);
1286 uint8x16_t v_zero = vdupq_n_u8(0);
1287 int16x8_t v_upsample_left = vdupq_n_s16(upsample_left);
1288 int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
1289 int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
1290
1291 a16 = vdupq_n_u16(16);
1292 c3f = vdupq_n_u16(0x3f);
1293 min_base_y128 = vdupq_n_s16(min_base_y);
1294 dy128 = vdupq_n_s16(dy);
1295 c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
1296 vcreate_u16(0x0008000700060005));
1297
1298 for (int r = 0; r < N; r++) {
1299 uint8x8_t resx, resy, resxy;
1300 uint16x8_t r6, ydx;
1301 uint16x8x2_t res, shift;
1302
1303 int y = r + 1;
1304 int base_x = (-y * dx) >> frac_bits_x;
1305 int base_shift = 0;
1306 if (base_x < (min_base_x - 1)) {
1307 base_shift = (min_base_x - base_x - 1) >> upsample_above;
1308 }
1309 int base_min_diff =
1310 (min_base_x - base_x + upsample_above) >> upsample_above;
1311 if (base_min_diff > 8) {
1312 base_min_diff = 8;
1313 } else {
1314 if (base_min_diff < 0) base_min_diff = 0;
1315 }
1316
1317 if (base_shift > 7) {
1318 a0_x.val[0] = v_zero;
1319 a0_x.val[1] = v_zero;
1320 a1_x.val[0] = v_zero;
1321 a1_x.val[1] = v_zero;
1322 shift.val[0] = vreinterpretq_u16_u8(v_zero);
1323 shift.val[1] = vreinterpretq_u16_u8(v_zero);
1324 } else {
1325 ydx = vdupq_n_u16(y * dx);
1326 r6 = vshlq_n_u16(vextq_u16(c1234, vreinterpretq_u16_u8(v_zero), 2), 6);
1327
1328 if (upsample_above) {
1329 uint8x8x2_t v_tmp;
1330 v_tmp.val[0] = vld1_u8(above + base_x + base_shift);
1331 v_tmp.val[1] = vld1_u8(above + base_x + base_shift + 8);
1332 uint8x8_t v_index_low = vld1_u8(EvenOddMaskx[base_shift]);
1333 uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8);
1334 shift.val[0] = vshrq_n_u16(
1335 vandq_u16(vshlq_u16(vsubq_u16(r6, ydx), v_upsample_above), c3f), 1);
1336 a0_x.val[0] =
1337 vreinterpretq_u8_u16(vmovl_u8(vtbl2_u8(v_tmp, v_index_low)));
1338 a1_x.val[0] =
1339 vreinterpretq_u8_u16(vmovl_u8(vtbl2_u8(v_tmp, v_index_high)));
1340 } else {
1341 a0_x128 = vld1q_u8(above + base_x + base_shift);
1342 a1_x128 = vextq_u8(a0_x128, v_zero, 1);
1343 vector_shuffle(&a0_x128, &v_zero, base_shift);
1344 vector_shuffle(&a1_x128, &v_zero, base_shift);
1345 shift.val[0] = vshrq_n_u16(vandq_u16(vsubq_u16(r6, ydx), c3f), 1);
1346 a0_x.val[0] = vreinterpretq_u8_u16(vmovl_u8(vget_low_u8(a0_x128)));
1347 a1_x.val[0] = vreinterpretq_u8_u16(vmovl_u8(vget_low_u8(a1_x128)));
1348 }
1349 }
1350
1351 // y calc
1352 if (base_x < min_base_x) {
1353 DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
1354 int16x8_t y_c128, base_y_c128;
1355 uint16x8_t mask128;
1356 int16x8_t v_r6 = vdupq_n_s16(r << 6);
1357
1358 y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128);
1359 base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y);
1360 mask128 = vcgtq_s16(min_base_y128, base_y_c128);
1361
1362 base_y_c128 = vbicq_s16(base_y_c128, vreinterpretq_s16_u16(mask128));
1363 vst1q_s16(base_y_c, base_y_c128);
1364 a0_x.val[1] = v_zero;
1365 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[0], a0_x.val[1], 0);
1366 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[1], a0_x.val[1], 2);
1367 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[2], a0_x.val[1], 4);
1368 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[3], a0_x.val[1], 6);
1369 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[4], a0_x.val[1], 8);
1370 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[5], a0_x.val[1], 10);
1371 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[6], a0_x.val[1], 12);
1372 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[7], a0_x.val[1], 14);
1373
1374 base_y_c128 =
1375 vaddq_s16(base_y_c128, vreinterpretq_s16_u16(vshrq_n_u16(a16, 4)));
1376 vst1q_s16(base_y_c, base_y_c128);
1377 a1_x.val[1] = v_zero;
1378 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[0], a1_x.val[1], 0);
1379 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[1], a1_x.val[1], 2);
1380 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[2], a1_x.val[1], 4);
1381 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[3], a1_x.val[1], 6);
1382 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[4], a1_x.val[1], 8);
1383 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[5], a1_x.val[1], 10);
1384 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[6], a1_x.val[1], 12);
1385 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[7], a1_x.val[1], 14);
1386
1387 if (upsample_left) {
1388 shift.val[1] = vshrq_n_u16(
1389 vandq_u16(vshlq_u16(vreinterpretq_u16_s16(y_c128), v_upsample_left),
1390 c3f),
1391 1);
1392 } else {
1393 shift.val[1] =
1394 vshrq_n_u16(vandq_u16(vreinterpretq_u16_s16(y_c128), c3f), 1);
1395 }
1396 }
1397 diff.val[0] =
1398 vsubq_u16(vreinterpretq_u16_u8(a1_x.val[0]),
1399 vreinterpretq_u16_u8(a0_x.val[0])); // a[x+1] - a[x]
1400 diff.val[1] =
1401 vsubq_u16(vreinterpretq_u16_u8(a1_x.val[1]),
1402 vreinterpretq_u16_u8(a0_x.val[1])); // a[x+1] - a[x]
1403 a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[0]),
1404 v_32); // a[x] * 32 + 16
1405 a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[1]),
1406 v_32); // a[x] * 32 + 16
1407 res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]);
1408 res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]);
1409 resx = vshrn_n_u16(res.val[0], 5);
1410 resy = vshrn_n_u16(res.val[1], 5);
1411
1412 uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
1413
1414 resxy = vorr_u8(vand_u8(mask, resy), vbic_u8(resx, mask));
1415 vst1_u8(dst, resxy);
1416 dst += stride;
1417 }
1418}
1419
1420static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst,
1421 ptrdiff_t stride, const uint8_t *above,
1422 const uint8_t *left, int upsample_above,
1423 int upsample_left, int dx, int dy) {
1424 // here upsample_above and upsample_left are 0 by design of
1425 // av1_use_intra_edge_upsample
1426 const int min_base_x = -1;
1427 const int min_base_y = -1;
1428 (void)upsample_above;
1429 (void)upsample_left;
1430 const int frac_bits_x = 6;
1431 const int frac_bits_y = 6;
1432
1433 uint16x8_t a16, c1, c3f;
1434 int16x8_t min_base_y256, dy256;
1435 uint16x8x2_t a32, c0123, c1234, diff, shifty;
1436 uint8x16x2_t a0_x, a1_x, a0_y, a1_y;
1437 uint8x16_t a0_x128, a1_x128;
1438 uint16x8_t v_32 = vdupq_n_u16(32);
1439 uint8x16_t v_zero = vdupq_n_u8(0);
1440 int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
1441
1442 DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
1443
1444 a16 = vdupq_n_u16(16);
1445 c1 = vshrq_n_u16(a16, 4);
1446 min_base_y256 = vdupq_n_s16(min_base_y);
1447 c3f = vdupq_n_u16(0x3f);
1448 dy256 = vdupq_n_s16(dy);
1449 c0123.val[0] = vcombine_u16(vcreate_u16(0x0003000200010000),
1450 vcreate_u16(0x0007000600050004));
1451 c0123.val[1] = vcombine_u16(vcreate_u16(0x000B000A00090008),
1452 vcreate_u16(0x000F000E000D000C));
1453 c1234.val[0] = vaddq_u16(c0123.val[0], c1);
1454 c1234.val[1] = vaddq_u16(c0123.val[1], c1);
1455
1456 for (int r = 0; r < H; r++) {
1457 uint16x8x2_t res, r6, shift;
1458 uint16x8_t ydx, j256;
1459 uint8x16_t resx, resy, resxy;
1460 int y = r + 1;
1461 ydx = vdupq_n_u16((uint16_t)(y * dx));
1462
1463 int base_x = (-y * dx) >> frac_bits_x;
1464 for (int j = 0; j < W; j += 16) {
1465 j256 = vdupq_n_u16(j);
1466
1467 int base_shift = 0;
1468 if ((base_x + j) < (min_base_x - 1)) {
1469 base_shift = (min_base_x - (base_x + j) - 1);
1470 }
1471 int base_min_diff = (min_base_x - base_x - j);
1472 if (base_min_diff > 16) {
1473 base_min_diff = 16;
1474 } else {
1475 if (base_min_diff < 0) base_min_diff = 0;
1476 }
1477
1478 if (base_shift < 16) {
1479 a0_x128 = vld1q_u8(above + base_x + base_shift + j);
1480 a1_x128 = vld1q_u8(above + base_x + base_shift + 1 + j);
1481 vector_shuffle(&a0_x128, &v_zero, base_shift);
1482 vector_shuffle(&a1_x128, &v_zero, base_shift);
1483 a0_x = vzipq_u8(a0_x128, v_zero);
1484 a1_x = vzipq_u8(a1_x128, v_zero);
1485 r6.val[0] = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6);
1486 r6.val[1] = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6);
1487 shift.val[0] =
1488 vshrq_n_u16(vandq_u16(vsubq_u16(r6.val[0], ydx), c3f), 1);
1489 shift.val[1] =
1490 vshrq_n_u16(vandq_u16(vsubq_u16(r6.val[1], ydx), c3f), 1);
1491 diff.val[0] =
1492 vsubq_u16(vreinterpretq_u16_u8(a1_x.val[0]),
1493 vreinterpretq_u16_u8(a0_x.val[0])); // a[x+1] - a[x]
1494 diff.val[1] =
1495 vsubq_u16(vreinterpretq_u16_u8(a1_x.val[1]),
1496 vreinterpretq_u16_u8(a0_x.val[1])); // a[x+1] - a[x]
1497 a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[0]),
1498 v_32); // a[x] * 32 + 16
1499 a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[1]),
1500 v_32); // a[x] * 32 + 16
1501 res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]);
1502 res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]);
1503 resx =
1504 vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
1505 } else {
1506 resx = v_zero;
1507 }
1508
1509 // y calc
1510 if (base_x < min_base_x) {
1511 uint16x8x2_t mask256;
1512 int16x8x2_t c256, y_c256, base_y_c256, mul16;
1513 int16x8_t v_r6 = vdupq_n_s16(r << 6);
1514
1515 c256.val[0] = vaddq_s16(vreinterpretq_s16_u16(j256),
1516 vreinterpretq_s16_u16(c1234.val[0]));
1517 c256.val[1] = vaddq_s16(vreinterpretq_s16_u16(j256),
1518 vreinterpretq_s16_u16(c1234.val[1]));
1519 mul16.val[0] = vminq_s16(vmulq_s16(c256.val[0], dy256),
1520 vreinterpretq_s16_u16(vshrq_n_u16(
1521 vreinterpretq_u16_s16(min_base_y256), 1)));
1522 mul16.val[1] = vminq_s16(vmulq_s16(c256.val[1], dy256),
1523 vreinterpretq_s16_u16(vshrq_n_u16(
1524 vreinterpretq_u16_s16(min_base_y256), 1)));
1525 y_c256.val[0] = vsubq_s16(v_r6, mul16.val[0]);
1526 y_c256.val[1] = vsubq_s16(v_r6, mul16.val[1]);
1527
1528 base_y_c256.val[0] = vshlq_s16(y_c256.val[0], v_frac_bits_y);
1529 base_y_c256.val[1] = vshlq_s16(y_c256.val[1], v_frac_bits_y);
1530 mask256.val[0] = vcgtq_s16(min_base_y256, base_y_c256.val[0]);
1531 mask256.val[1] = vcgtq_s16(min_base_y256, base_y_c256.val[1]);
1532
1533 base_y_c256.val[0] = vorrq_s16(
1534 vandq_s16(vreinterpretq_s16_u16(mask256.val[0]), min_base_y256),
1535 vbicq_s16(base_y_c256.val[0],
1536 vreinterpretq_s16_u16(mask256.val[0])));
1537 base_y_c256.val[1] = vorrq_s16(
1538 vandq_s16(vreinterpretq_s16_u16(mask256.val[1]), min_base_y256),
1539 vbicq_s16(base_y_c256.val[1],
1540 vreinterpretq_s16_u16(mask256.val[1])));
1541
1542 int16_t min_y = vgetq_lane_s16(base_y_c256.val[1], 7);
1543 int16_t max_y = vgetq_lane_s16(base_y_c256.val[0], 0);
1544 int16_t offset_diff = max_y - min_y;
1545
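        // Gather left[base_y] for the 16 lanes. If the 16 indices span fewer
        // than 16 bytes, a single 16-byte load plus a byte table lookup
        // (vqtbl1q_u8 / vtbl2_u8) is enough; otherwise fall back to 16
        // per-lane loads through the base_y_c scratch array.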
1546 if (offset_diff < 16) {
1547 int16x8_t min_y256 =
1548 vdupq_lane_s16(vget_high_s16(base_y_c256.val[1]), 3);
1549
1550 int16x8x2_t base_y_offset;
1551 base_y_offset.val[0] = vsubq_s16(base_y_c256.val[0], min_y256);
1552 base_y_offset.val[1] = vsubq_s16(base_y_c256.val[1], min_y256);
1553
1554 int8x16_t base_y_offset128 =
1555 vcombine_s8(vqmovn_s16(base_y_offset.val[0]),
1556 vqmovn_s16(base_y_offset.val[1]));
1557
1558 uint8x16_t a0_y128, a1_y128;
1559 uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]);
1560 a0_y128 = vld1q_u8(left + min_y);
1561 a0_y128 = vandq_u8(a0_y128, v_loadmaskz2);
1562 a1_y128 = vld1q_u8(left + min_y + 1);
1563 a1_y128 = vandq_u8(a1_y128, v_loadmaskz2);
1564#if defined(__aarch64__)
1565 a0_y128 = vqtbl1q_u8(a0_y128, vreinterpretq_u8_s8(base_y_offset128));
1566 a1_y128 = vqtbl1q_u8(a1_y128, vreinterpretq_u8_s8(base_y_offset128));
1567#else
1568 uint8x8x2_t v_tmp;
1569 uint8x8x2_t v_res;
1570 uint8x8_t v_index_low =
1571 vget_low_u8(vreinterpretq_u8_s8(base_y_offset128));
1572 uint8x8_t v_index_high =
1573 vget_high_u8(vreinterpretq_u8_s8(base_y_offset128));
1574 v_tmp.val[0] = vget_low_u8(a0_y128);
1575 v_tmp.val[1] = vget_high_u8(a0_y128);
1576 v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
1577 v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
1578 a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
1579 v_tmp.val[0] = vget_low_u8(a1_y128);
1580 v_tmp.val[1] = vget_high_u8(a1_y128);
1581 v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
1582 v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
1583 a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
1584#endif
1585 a0_y = vzipq_u8(a0_y128, v_zero);
1586 a1_y = vzipq_u8(a1_y128, v_zero);
1587 } else {
1588 base_y_c256.val[0] = vbicq_s16(base_y_c256.val[0],
1589 vreinterpretq_s16_u16(mask256.val[0]));
1590 base_y_c256.val[1] = vbicq_s16(base_y_c256.val[1],
1591 vreinterpretq_s16_u16(mask256.val[1]));
1592 vst1q_s16(base_y_c, base_y_c256.val[0]);
1593 vst1q_s16(base_y_c + 8, base_y_c256.val[1]);
1594 a0_y.val[0] = v_zero;
1595 a0_y.val[1] = v_zero;
1596 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[0], a0_y.val[0], 0);
1597 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[1], a0_y.val[0], 2);
1598 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[2], a0_y.val[0], 4);
1599 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[3], a0_y.val[0], 6);
1600 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[4], a0_y.val[0], 8);
1601 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[5], a0_y.val[0], 10);
1602 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[6], a0_y.val[0], 12);
1603 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[7], a0_y.val[0], 14);
1604 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[8], a0_y.val[1], 0);
1605 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[9], a0_y.val[1], 2);
1606 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[10], a0_y.val[1], 4);
1607 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[11], a0_y.val[1], 6);
1608 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[12], a0_y.val[1], 8);
1609 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[13], a0_y.val[1], 10);
1610 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[14], a0_y.val[1], 12);
1611 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[15], a0_y.val[1], 14);
1612
1613 base_y_c256.val[0] =
1614 vaddq_s16(base_y_c256.val[0], vreinterpretq_s16_u16(c1));
1615 base_y_c256.val[1] =
1616 vaddq_s16(base_y_c256.val[1], vreinterpretq_s16_u16(c1));
1617 vst1q_s16(base_y_c, base_y_c256.val[0]);
1618 vst1q_s16(base_y_c + 8, base_y_c256.val[1]);
1619 a1_y.val[0] = v_zero;
1620 a1_y.val[1] = v_zero;
1621 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[0], a1_y.val[0], 0);
1622 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[1], a1_y.val[0], 2);
1623 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[2], a1_y.val[0], 4);
1624 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[3], a1_y.val[0], 6);
1625 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[4], a1_y.val[0], 8);
1626 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[5], a1_y.val[0], 10);
1627 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[6], a1_y.val[0], 12);
1628 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[7], a1_y.val[0], 14);
1629 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[8], a1_y.val[1], 0);
1630 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[9], a1_y.val[1], 2);
1631 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[10], a1_y.val[1], 4);
1632 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[11], a1_y.val[1], 6);
1633 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[12], a1_y.val[1], 8);
1634 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[13], a1_y.val[1], 10);
1635 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[14], a1_y.val[1], 12);
1636 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[15], a1_y.val[1], 14);
1637 }
1638 shifty.val[0] = vshrq_n_u16(
1639 vandq_u16(vreinterpretq_u16_s16(y_c256.val[0]), c3f), 1);
1640 shifty.val[1] = vshrq_n_u16(
1641 vandq_u16(vreinterpretq_u16_s16(y_c256.val[1]), c3f), 1);
1642 diff.val[0] =
1643 vsubq_u16(vreinterpretq_u16_u8(a1_y.val[0]),
1644 vreinterpretq_u16_u8(a0_y.val[0])); // a[x+1] - a[x]
1645 diff.val[1] =
1646 vsubq_u16(vreinterpretq_u16_u8(a1_y.val[1]),
1647 vreinterpretq_u16_u8(a0_y.val[1])); // a[x+1] - a[x]
1648 a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_y.val[0]),
1649 v_32); // a[x] * 32 + 16
1650 a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_y.val[1]),
1651 v_32); // a[x] * 32 + 16
1652 res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shifty.val[0]);
1653 res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shifty.val[1]);
1654
1655 resy =
1656 vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
1657 } else {
1658 resy = v_zero;
1659 }
1660 uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]);
1661 resxy = vorrq_u8(vandq_u8(mask, resy), vbicq_u8(resx, mask));
1662 vst1q_u8(dst + j, resxy);
1663 } // for j
1664 dst += stride;
1665 }
1666}
1667
1668// Directional prediction, zone 2: 90 < angle < 180
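// For reference, a scalar sketch of the per-pixel rule the z2 kernels above
// vectorize and this function dispatches to (simplified to the case handled
// here: upsample_above == upsample_left == 0, so positions use 6 fractional
// bits and min_base_x == min_base_y == -1; 'p' is illustrative):
//
//   for (r = 0; r < bh; ++r) {
//     for (c = 0; c < bw; ++c) {
//       int x = (c << 6) - (r + 1) * dx;
//       if ((x >> 6) >= -1) {  // sample the above row
//         int base = x >> 6, shift = (x & 0x3f) >> 1;
//         p = (above[base] * (32 - shift) + above[base + 1] * shift + 16) >> 5;
//       } else {               // sample the left column
//         int y = (r << 6) - (c + 1) * dy;
//         int base = y >> 6, shift = (y & 0x3f) >> 1;
//         p = (left[base] * (32 - shift) + left[base + 1] * shift + 16) >> 5;
//       }
//       dst[r * stride + c] = (uint8_t)p;
//     }
//   }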
1669void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
1670 const uint8_t *above, const uint8_t *left,
1671 int upsample_above, int upsample_left, int dx,
1672 int dy) {
1673 assert(dx > 0);
1674 assert(dy > 0);
1675
1676 switch (bw) {
1677 case 4:
1678 dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above,
1679 upsample_left, dx, dy);
1680 break;
1681 case 8:
1682 dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, upsample_above,
1683 upsample_left, dx, dy);
1684 break;
1685 default:
1686 dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left,
1687 upsample_above, upsample_left, dx, dy);
1688 break;
1689 }
1690 return;
1691}
1692
1693/* ---------------------P R E D I C T I O N Z 3--------------------------- */
1694
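// Zone 3 (180 < angle < 270) predicts only from the left column. Each block
// size below runs a zone-1 kernel along `left` (or into a scratch buffer for
// the 64-pixel-wide/tall cases) and then transposes the result into dst, so
// most of this section is 8-bit transpose helpers built from interleaves
// (vzip) at successively wider element types: 8 -> 16 -> 32 -> 64 bits.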
1695static AOM_FORCE_INLINE void transpose4x16_neon(uint8x16_t *x,
1696 uint16x8x2_t *d) {
1697 uint8x16x2_t w0, w1;
1698
1699 w0 = vzipq_u8(x[0], x[1]);
1700 w1 = vzipq_u8(x[2], x[3]);
1701
1702 d[0] = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
1703 vreinterpretq_u16_u8(w1.val[0]));
1704 d[1] = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
1705 vreinterpretq_u16_u8(w1.val[1]));
1706}
1707
1708static AOM_FORCE_INLINE void transpose4x8_8x4_low_neon(uint8x8_t *x,
1709 uint16x4x2_t *d) {
1710 uint8x8x2_t w0, w1;
1711
1712 w0 = vzip_u8(x[0], x[1]);
1713 w1 = vzip_u8(x[2], x[3]);
1714
1715 *d = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1716}
1717
1718static AOM_FORCE_INLINE void transpose4x8_8x4_neon(uint8x8_t *x,
1719 uint16x4x2_t *d) {
1720 uint8x8x2_t w0, w1;
1721
1722 w0 = vzip_u8(x[0], x[1]);
1723 w1 = vzip_u8(x[2], x[3]);
1724
1725 d[0] =
1726 vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1727 d[1] =
1728 vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
1729}
1730
1731static AOM_FORCE_INLINE void transpose8x8_low_neon(uint8x8_t *x,
1732 uint32x2x2_t *d) {
1733 uint8x8x2_t w0, w1, w2, w3;
1734 uint16x4x2_t w4, w5;
1735
1736 w0 = vzip_u8(x[0], x[1]);
1737 w1 = vzip_u8(x[2], x[3]);
1738 w2 = vzip_u8(x[4], x[5]);
1739 w3 = vzip_u8(x[6], x[7]);
1740
1741 w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1742 w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
1743
1744 d[0] = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
1745 vreinterpret_u32_u16(w5.val[0]));
1746 d[1] = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
1747 vreinterpret_u32_u16(w5.val[1]));
1748}
1749
1750static AOM_FORCE_INLINE void transpose8x8_neon(uint8x8_t *x, uint32x2x2_t *d) {
1751 uint8x8x2_t w0, w1, w2, w3;
1752 uint16x4x2_t w4, w5, w6, w7;
1753
1754 w0 = vzip_u8(x[0], x[1]);
1755 w1 = vzip_u8(x[2], x[3]);
1756 w2 = vzip_u8(x[4], x[5]);
1757 w3 = vzip_u8(x[6], x[7]);
1758
1759 w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1760 w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
1761
1762 d[0] = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
1763 vreinterpret_u32_u16(w5.val[0]));
1764 d[1] = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
1765 vreinterpret_u32_u16(w5.val[1]));
1766
1767 w6 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
1768 w7 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
1769
1770 d[2] = vzip_u32(vreinterpret_u32_u16(w6.val[0]),
1771 vreinterpret_u32_u16(w7.val[0]));
1772 d[3] = vzip_u32(vreinterpret_u32_u16(w6.val[1]),
1773 vreinterpret_u32_u16(w7.val[1]));
1774}
1775
1776static AOM_FORCE_INLINE void transpose16x8_8x16_neon(uint8x8_t *x,
1777 uint64x2_t *d) {
1778 uint8x8x2_t w0, w1, w2, w3, w8, w9, w10, w11;
1779 uint16x4x2_t w4, w5, w12, w13;
1780 uint32x2x2_t w6, w7, w14, w15;
1781
1782 w0 = vzip_u8(x[0], x[1]);
1783 w1 = vzip_u8(x[2], x[3]);
1784 w2 = vzip_u8(x[4], x[5]);
1785 w3 = vzip_u8(x[6], x[7]);
1786
1787 w8 = vzip_u8(x[8], x[9]);
1788 w9 = vzip_u8(x[10], x[11]);
1789 w10 = vzip_u8(x[12], x[13]);
1790 w11 = vzip_u8(x[14], x[15]);
1791
1792 w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1793 w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
1794 w12 =
1795 vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0]));
1796 w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]),
1797 vreinterpret_u16_u8(w11.val[0]));
1798
1799 w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
1800 vreinterpret_u32_u16(w5.val[0]));
1801 w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
1802 vreinterpret_u32_u16(w5.val[1]));
1803 w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
1804 vreinterpret_u32_u16(w13.val[0]));
1805 w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
1806 vreinterpret_u32_u16(w13.val[1]));
1807
1808 // Store first 4-line result
1809 d[0] = vcombine_u64(vreinterpret_u64_u32(w6.val[0]),
1810 vreinterpret_u64_u32(w14.val[0]));
1811 d[1] = vcombine_u64(vreinterpret_u64_u32(w6.val[1]),
1812 vreinterpret_u64_u32(w14.val[1]));
1813 d[2] = vcombine_u64(vreinterpret_u64_u32(w7.val[0]),
1814 vreinterpret_u64_u32(w15.val[0]));
1815 d[3] = vcombine_u64(vreinterpret_u64_u32(w7.val[1]),
1816 vreinterpret_u64_u32(w15.val[1]));
1817
1818 w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
1819 w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
1820 w12 =
1821 vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1]));
1822 w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]),
1823 vreinterpret_u16_u8(w11.val[1]));
1824
1825 w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
1826 vreinterpret_u32_u16(w5.val[0]));
1827 w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
1828 vreinterpret_u32_u16(w5.val[1]));
1829 w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
1830 vreinterpret_u32_u16(w13.val[0]));
1831 w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
1832 vreinterpret_u32_u16(w13.val[1]));
1833
1834 // Store second 4-line result
1835 d[4] = vcombine_u64(vreinterpret_u64_u32(w6.val[0]),
1836 vreinterpret_u64_u32(w14.val[0]));
1837 d[5] = vcombine_u64(vreinterpret_u64_u32(w6.val[1]),
1838 vreinterpret_u64_u32(w14.val[1]));
1839 d[6] = vcombine_u64(vreinterpret_u64_u32(w7.val[0]),
1840 vreinterpret_u64_u32(w15.val[0]));
1841 d[7] = vcombine_u64(vreinterpret_u64_u32(w7.val[1]),
1842 vreinterpret_u64_u32(w15.val[1]));
1843}
1844
1845static AOM_FORCE_INLINE void transpose8x16_16x8_neon(uint8x16_t *x,
1846 uint64x2_t *d) {
1847 uint8x16x2_t w0, w1, w2, w3;
1848 uint16x8x2_t w4, w5, w6, w7;
1849 uint32x4x2_t w8, w9, w10, w11;
1850
1851 w0 = vzipq_u8(x[0], x[1]);
1852 w1 = vzipq_u8(x[2], x[3]);
1853 w2 = vzipq_u8(x[4], x[5]);
1854 w3 = vzipq_u8(x[6], x[7]);
1855
1856 w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
1857 vreinterpretq_u16_u8(w1.val[0]));
1858 w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
1859 vreinterpretq_u16_u8(w3.val[0]));
1860 w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
1861 vreinterpretq_u16_u8(w1.val[1]));
1862 w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
1863 vreinterpretq_u16_u8(w3.val[1]));
1864
1865 w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
1866 vreinterpretq_u32_u16(w5.val[0]));
1867 w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]),
1868 vreinterpretq_u32_u16(w7.val[0]));
1869 w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
1870 vreinterpretq_u32_u16(w5.val[1]));
1871 w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
1872 vreinterpretq_u32_u16(w7.val[1]));
1873
1874#if defined(__aarch64__)
1875 d[0] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[0]),
1876 vreinterpretq_u64_u32(w9.val[0]));
1877 d[1] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[0]),
1878 vreinterpretq_u64_u32(w9.val[0]));
1879 d[2] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[1]),
1880 vreinterpretq_u64_u32(w9.val[1]));
1881 d[3] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[1]),
1882 vreinterpretq_u64_u32(w9.val[1]));
1883 d[4] = vzip1q_u64(vreinterpretq_u64_u32(w10.val[0]),
1884 vreinterpretq_u64_u32(w11.val[0]));
1885 d[5] = vzip2q_u64(vreinterpretq_u64_u32(w10.val[0]),
1886 vreinterpretq_u64_u32(w11.val[0]));
1887 d[6] = vzip1q_u64(vreinterpretq_u64_u32(w10.val[1]),
1888 vreinterpretq_u64_u32(w11.val[1]));
1889 d[7] = vzip2q_u64(vreinterpretq_u64_u32(w10.val[1]),
1890 vreinterpretq_u64_u32(w11.val[1]));
1891#else
1892 d[0] = vreinterpretq_u64_u32(
1893 vcombine_u32(vget_low_u32(w8.val[0]), vget_low_u32(w9.val[0])));
1894 d[1] = vreinterpretq_u64_u32(
1895 vcombine_u32(vget_high_u32(w8.val[0]), vget_high_u32(w9.val[0])));
1896 d[2] = vreinterpretq_u64_u32(
1897 vcombine_u32(vget_low_u32(w8.val[1]), vget_low_u32(w9.val[1])));
1898 d[3] = vreinterpretq_u64_u32(
1899 vcombine_u32(vget_high_u32(w8.val[1]), vget_high_u32(w9.val[1])));
1900 d[4] = vreinterpretq_u64_u32(
1901 vcombine_u32(vget_low_u32(w10.val[0]), vget_low_u32(w11.val[0])));
1902 d[5] = vreinterpretq_u64_u32(
1903 vcombine_u32(vget_high_u32(w10.val[0]), vget_high_u32(w11.val[0])));
1904 d[6] = vreinterpretq_u64_u32(
1905 vcombine_u32(vget_low_u32(w10.val[1]), vget_low_u32(w11.val[1])));
1906 d[7] = vreinterpretq_u64_u32(
1907 vcombine_u32(vget_high_u32(w10.val[1]), vget_high_u32(w11.val[1])));
1908#endif
1909}
1910
1911static AOM_FORCE_INLINE void transpose16x16_neon(uint8x16_t *x, uint64x2_t *d) {
1912 uint8x16x2_t w0, w1, w2, w3, w4, w5, w6, w7;
1913 uint16x8x2_t w8, w9, w10, w11;
1914 uint32x4x2_t w12, w13, w14, w15;
1915
1916 w0 = vzipq_u8(x[0], x[1]);
1917 w1 = vzipq_u8(x[2], x[3]);
1918 w2 = vzipq_u8(x[4], x[5]);
1919 w3 = vzipq_u8(x[6], x[7]);
1920
1921 w4 = vzipq_u8(x[8], x[9]);
1922 w5 = vzipq_u8(x[10], x[11]);
1923 w6 = vzipq_u8(x[12], x[13]);
1924 w7 = vzipq_u8(x[14], x[15]);
1925
1926 w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
1927 vreinterpretq_u16_u8(w1.val[0]));
1928 w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
1929 vreinterpretq_u16_u8(w3.val[0]));
1930 w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]),
1931 vreinterpretq_u16_u8(w5.val[0]));
1932 w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]),
1933 vreinterpretq_u16_u8(w7.val[0]));
1934
1935 w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
1936 vreinterpretq_u32_u16(w9.val[0]));
1937 w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
1938 vreinterpretq_u32_u16(w11.val[0]));
1939 w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
1940 vreinterpretq_u32_u16(w9.val[1]));
1941 w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
1942 vreinterpretq_u32_u16(w11.val[1]));
1943
1944#if defined(__aarch64__)
1945 d[0] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]),
1946 vreinterpretq_u64_u32(w13.val[0]));
1947 d[1] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]),
1948 vreinterpretq_u64_u32(w13.val[0]));
1949 d[2] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[1]),
1950 vreinterpretq_u64_u32(w13.val[1]));
1951 d[3] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[1]),
1952 vreinterpretq_u64_u32(w13.val[1]));
1953 d[4] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[0]),
1954 vreinterpretq_u64_u32(w15.val[0]));
1955 d[5] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[0]),
1956 vreinterpretq_u64_u32(w15.val[0]));
1957 d[6] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[1]),
1958 vreinterpretq_u64_u32(w15.val[1]));
1959 d[7] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[1]),
1960 vreinterpretq_u64_u32(w15.val[1]));
1961#else
1962 d[0] = vreinterpretq_u64_u32(
1963 vcombine_u32(vget_low_u32(w12.val[0]), vget_low_u32(w13.val[0])));
1964 d[1] = vreinterpretq_u64_u32(
1965 vcombine_u32(vget_high_u32(w12.val[0]), vget_high_u32(w13.val[0])));
1966 d[2] = vreinterpretq_u64_u32(
1967 vcombine_u32(vget_low_u32(w12.val[1]), vget_low_u32(w13.val[1])));
1968 d[3] = vreinterpretq_u64_u32(
1969 vcombine_u32(vget_high_u32(w12.val[1]), vget_high_u32(w13.val[1])));
1970 d[4] = vreinterpretq_u64_u32(
1971 vcombine_u32(vget_low_u32(w14.val[0]), vget_low_u32(w15.val[0])));
1972 d[5] = vreinterpretq_u64_u32(
1973 vcombine_u32(vget_high_u32(w14.val[0]), vget_high_u32(w15.val[0])));
1974 d[6] = vreinterpretq_u64_u32(
1975 vcombine_u32(vget_low_u32(w14.val[1]), vget_low_u32(w15.val[1])));
1976 d[7] = vreinterpretq_u64_u32(
1977 vcombine_u32(vget_high_u32(w14.val[1]), vget_high_u32(w15.val[1])));
1978#endif
1979
1980 // upper half
1981 w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
1982 vreinterpretq_u16_u8(w1.val[1]));
1983 w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
1984 vreinterpretq_u16_u8(w3.val[1]));
1985 w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]),
1986 vreinterpretq_u16_u8(w5.val[1]));
1987 w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]),
1988 vreinterpretq_u16_u8(w7.val[1]));
1989
1990 w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
1991 vreinterpretq_u32_u16(w9.val[0]));
1992 w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
1993 vreinterpretq_u32_u16(w11.val[0]));
1994 w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
1995 vreinterpretq_u32_u16(w9.val[1]));
1996 w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
1997 vreinterpretq_u32_u16(w11.val[1]));
1998
1999#if defined(__aarch64__)
2000 d[8] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]),
2001 vreinterpretq_u64_u32(w13.val[0]));
2002 d[9] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]),
2003 vreinterpretq_u64_u32(w13.val[0]));
2004 d[10] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[1]),
2005 vreinterpretq_u64_u32(w13.val[1]));
2006 d[11] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[1]),
2007 vreinterpretq_u64_u32(w13.val[1]));
2008 d[12] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[0]),
2009 vreinterpretq_u64_u32(w15.val[0]));
2010 d[13] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[0]),
2011 vreinterpretq_u64_u32(w15.val[0]));
2012 d[14] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[1]),
2013 vreinterpretq_u64_u32(w15.val[1]));
2014 d[15] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[1]),
2015 vreinterpretq_u64_u32(w15.val[1]));
2016#else
2017 d[8] = vreinterpretq_u64_u32(
2018 vcombine_u32(vget_low_u32(w12.val[0]), vget_low_u32(w13.val[0])));
2019 d[9] = vreinterpretq_u64_u32(
2020 vcombine_u32(vget_high_u32(w12.val[0]), vget_high_u32(w13.val[0])));
2021 d[10] = vreinterpretq_u64_u32(
2022 vcombine_u32(vget_low_u32(w12.val[1]), vget_low_u32(w13.val[1])));
2023 d[11] = vreinterpretq_u64_u32(
2024 vcombine_u32(vget_high_u32(w12.val[1]), vget_high_u32(w13.val[1])));
2025 d[12] = vreinterpretq_u64_u32(
2026 vcombine_u32(vget_low_u32(w14.val[0]), vget_low_u32(w15.val[0])));
2027 d[13] = vreinterpretq_u64_u32(
2028 vcombine_u32(vget_high_u32(w14.val[0]), vget_high_u32(w15.val[0])));
2029 d[14] = vreinterpretq_u64_u32(
2030 vcombine_u32(vget_low_u32(w14.val[1]), vget_low_u32(w15.val[1])));
2031 d[15] = vreinterpretq_u64_u32(
2032 vcombine_u32(vget_high_u32(w14.val[1]), vget_high_u32(w15.val[1])));
2033#endif
2034}
2035
2036static AOM_FORCE_INLINE void transpose16x32_neon(uint8x16x2_t *x,
2037 uint64x2x2_t *d) {
2038 uint8x16x2_t w0, w1, w2, w3, w8, w9, w10, w11;
2039 uint16x8x2_t w4, w5, w12, w13;
2040 uint32x4x2_t w6, w7, w14, w15;
2041
2042 w0 = vzipq_u8(x[0].val[0], x[1].val[0]);
2043 w1 = vzipq_u8(x[2].val[0], x[3].val[0]);
2044 w2 = vzipq_u8(x[4].val[0], x[5].val[0]);
2045 w3 = vzipq_u8(x[6].val[0], x[7].val[0]);
2046
2047 w8 = vzipq_u8(x[8].val[0], x[9].val[0]);
2048 w9 = vzipq_u8(x[10].val[0], x[11].val[0]);
2049 w10 = vzipq_u8(x[12].val[0], x[13].val[0]);
2050 w11 = vzipq_u8(x[14].val[0], x[15].val[0]);
2051
2052 w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
2053 vreinterpretq_u16_u8(w1.val[0]));
2054 w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
2055 vreinterpretq_u16_u8(w3.val[0]));
2056 w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[0]),
2057 vreinterpretq_u16_u8(w9.val[0]));
2058 w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[0]),
2059 vreinterpretq_u16_u8(w11.val[0]));
2060
2061 w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
2062 vreinterpretq_u32_u16(w5.val[0]));
2063 w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
2064 vreinterpretq_u32_u16(w5.val[1]));
2065 w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
2066 vreinterpretq_u32_u16(w13.val[0]));
2067 w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
2068 vreinterpretq_u32_u16(w13.val[1]));
2069
2070 // Store first 4-line result
2071
2072#if defined(__aarch64__)
2073 d[0].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
2074 vreinterpretq_u64_u32(w14.val[0]));
2075 d[0].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
2076 vreinterpretq_u64_u32(w14.val[0]));
2077 d[1].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
2078 vreinterpretq_u64_u32(w14.val[1]));
2079 d[1].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
2080 vreinterpretq_u64_u32(w14.val[1]));
2081 d[2].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
2082 vreinterpretq_u64_u32(w15.val[0]));
2083 d[2].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
2084 vreinterpretq_u64_u32(w15.val[0]));
2085 d[3].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
2086 vreinterpretq_u64_u32(w15.val[1]));
2087 d[3].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
2088 vreinterpretq_u64_u32(w15.val[1]));
2089#else
2090 d[0].val[0] = vreinterpretq_u64_u32(
2091 vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
2092 d[0].val[1] = vreinterpretq_u64_u32(
2093 vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
2094 d[1].val[0] = vreinterpretq_u64_u32(
2095 vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
2096 d[1].val[1] = vreinterpretq_u64_u32(
2097 vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
2098 d[2].val[0] = vreinterpretq_u64_u32(
2099 vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
2100 d[2].val[1] = vreinterpretq_u64_u32(
2101 vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
2102 d[3].val[0] = vreinterpretq_u64_u32(
2103 vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
2104 d[3].val[1] = vreinterpretq_u64_u32(
2105 vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
2106#endif
2107
2108 w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
2109 vreinterpretq_u16_u8(w1.val[1]));
2110 w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
2111 vreinterpretq_u16_u8(w3.val[1]));
2112 w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[1]),
2113 vreinterpretq_u16_u8(w9.val[1]));
2114 w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[1]),
2115 vreinterpretq_u16_u8(w11.val[1]));
2116
2117 w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
2118 vreinterpretq_u32_u16(w5.val[0]));
2119 w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
2120 vreinterpretq_u32_u16(w5.val[1]));
2121 w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
2122 vreinterpretq_u32_u16(w13.val[0]));
2123 w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
2124 vreinterpretq_u32_u16(w13.val[1]));
2125
2126 // Store second 4-line result
2127
2128#if defined(__aarch64__)
2129 d[4].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
2130 vreinterpretq_u64_u32(w14.val[0]));
2131 d[4].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
2132 vreinterpretq_u64_u32(w14.val[0]));
2133 d[5].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
2134 vreinterpretq_u64_u32(w14.val[1]));
2135 d[5].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
2136 vreinterpretq_u64_u32(w14.val[1]));
2137 d[6].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
2138 vreinterpretq_u64_u32(w15.val[0]));
2139 d[6].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
2140 vreinterpretq_u64_u32(w15.val[0]));
2141 d[7].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
2142 vreinterpretq_u64_u32(w15.val[1]));
2143 d[7].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
2144 vreinterpretq_u64_u32(w15.val[1]));
2145#else
2146 d[4].val[0] = vreinterpretq_u64_u32(
2147 vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
2148 d[4].val[1] = vreinterpretq_u64_u32(
2149 vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
2150 d[5].val[0] = vreinterpretq_u64_u32(
2151 vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
2152 d[5].val[1] = vreinterpretq_u64_u32(
2153 vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
2154 d[6].val[0] = vreinterpretq_u64_u32(
2155 vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
2156 d[6].val[1] = vreinterpretq_u64_u32(
2157 vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
2158 d[7].val[0] = vreinterpretq_u64_u32(
2159 vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
2160 d[7].val[1] = vreinterpretq_u64_u32(
2161 vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
2162#endif
2163
2164 // upper half
2165 w0 = vzipq_u8(x[0].val[1], x[1].val[1]);
2166 w1 = vzipq_u8(x[2].val[1], x[3].val[1]);
2167 w2 = vzipq_u8(x[4].val[1], x[5].val[1]);
2168 w3 = vzipq_u8(x[6].val[1], x[7].val[1]);
2169
2170 w8 = vzipq_u8(x[8].val[1], x[9].val[1]);
2171 w9 = vzipq_u8(x[10].val[1], x[11].val[1]);
2172 w10 = vzipq_u8(x[12].val[1], x[13].val[1]);
2173 w11 = vzipq_u8(x[14].val[1], x[15].val[1]);
2174
2175 w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
2176 vreinterpretq_u16_u8(w1.val[0]));
2177 w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
2178 vreinterpretq_u16_u8(w3.val[0]));
2179 w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[0]),
2180 vreinterpretq_u16_u8(w9.val[0]));
2181 w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[0]),
2182 vreinterpretq_u16_u8(w11.val[0]));
2183
2184 w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
2185 vreinterpretq_u32_u16(w5.val[0]));
2186 w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
2187 vreinterpretq_u32_u16(w5.val[1]));
2188 w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
2189 vreinterpretq_u32_u16(w13.val[0]));
2190 w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
2191 vreinterpretq_u32_u16(w13.val[1]));
2192
2193 // Store first 4-line result
2194
2195#if defined(__aarch64__)
2196 d[8].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
2197 vreinterpretq_u64_u32(w14.val[0]));
2198 d[8].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
2199 vreinterpretq_u64_u32(w14.val[0]));
2200 d[9].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
2201 vreinterpretq_u64_u32(w14.val[1]));
2202 d[9].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
2203 vreinterpretq_u64_u32(w14.val[1]));
2204 d[10].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
2205 vreinterpretq_u64_u32(w15.val[0]));
2206 d[10].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
2207 vreinterpretq_u64_u32(w15.val[0]));
2208 d[11].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
2209 vreinterpretq_u64_u32(w15.val[1]));
2210 d[11].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
2211 vreinterpretq_u64_u32(w15.val[1]));
2212#else
2213 d[8].val[0] = vreinterpretq_u64_u32(
2214 vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
2215 d[8].val[1] = vreinterpretq_u64_u32(
2216 vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
2217 d[9].val[0] = vreinterpretq_u64_u32(
2218 vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
2219 d[9].val[1] = vreinterpretq_u64_u32(
2220 vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
2221 d[10].val[0] = vreinterpretq_u64_u32(
2222 vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
2223 d[10].val[1] = vreinterpretq_u64_u32(
2224 vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
2225 d[11].val[0] = vreinterpretq_u64_u32(
2226 vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
2227 d[11].val[1] = vreinterpretq_u64_u32(
2228 vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
2229#endif
2230
2231 w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
2232 vreinterpretq_u16_u8(w1.val[1]));
2233 w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
2234 vreinterpretq_u16_u8(w3.val[1]));
2235 w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[1]),
2236 vreinterpretq_u16_u8(w9.val[1]));
2237 w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[1]),
2238 vreinterpretq_u16_u8(w11.val[1]));
2239
2240 w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
2241 vreinterpretq_u32_u16(w5.val[0]));
2242 w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
2243 vreinterpretq_u32_u16(w5.val[1]));
2244 w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
2245 vreinterpretq_u32_u16(w13.val[0]));
2246 w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
2247 vreinterpretq_u32_u16(w13.val[1]));
2248
2249 // Store second 4-line result
2250
2251#if defined(__aarch64__)
2252 d[12].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
2253 vreinterpretq_u64_u32(w14.val[0]));
2254 d[12].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
2255 vreinterpretq_u64_u32(w14.val[0]));
2256 d[13].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
2257 vreinterpretq_u64_u32(w14.val[1]));
2258 d[13].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
2259 vreinterpretq_u64_u32(w14.val[1]));
2260 d[14].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
2261 vreinterpretq_u64_u32(w15.val[0]));
2262 d[14].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
2263 vreinterpretq_u64_u32(w15.val[0]));
2264 d[15].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
2265 vreinterpretq_u64_u32(w15.val[1]));
2266 d[15].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
2267 vreinterpretq_u64_u32(w15.val[1]));
2268#else
2269 d[12].val[0] = vreinterpretq_u64_u32(
2270 vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
2271 d[12].val[1] = vreinterpretq_u64_u32(
2272 vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
2273 d[13].val[0] = vreinterpretq_u64_u32(
2274 vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
2275 d[13].val[1] = vreinterpretq_u64_u32(
2276 vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
2277 d[14].val[0] = vreinterpretq_u64_u32(
2278 vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
2279 d[14].val[1] = vreinterpretq_u64_u32(
2280 vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
2281 d[15].val[0] = vreinterpretq_u64_u32(
2282 vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
2283 d[15].val[1] = vreinterpretq_u64_u32(
2284 vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
2285#endif
2286}
2287
2288static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
2289 uint8_t *dst, ptrdiff_t pitchDst) {
2290 uint8x16_t r[16];
2291 uint64x2_t d[16];
2292 for (int i = 0; i < 16; i++) {
2293 r[i] = vld1q_u8(src + i * pitchSrc);
2294 }
2295 transpose16x16_neon(r, d);
2296 for (int i = 0; i < 16; i++) {
2297 vst1q_u8(dst + i * pitchDst, vreinterpretq_u8_u64(d[i]));
2298 }
2299}
2300
2301static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
2302 ptrdiff_t pitchDst, int width, int height) {
2303 for (int j = 0; j < height; j += 16) {
2304 for (int i = 0; i < width; i += 16) {
2305 transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
2306 dst + j * pitchDst + i, pitchDst);
2307 }
2308 }
2309}
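// transpose() copies in 16x16 tiles, so it is only used for the block shapes
// whose width and height are both multiples of 16 (the 64x64, 32x64, 64x32
// and 16x64 paths below, which predict into a scratch buffer first).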
2310
2311static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride,
2312 const uint8_t *left, int upsample_left,
2313 int dy) {
2314 uint8x8_t dstvec[4];
2315 uint16x4x2_t dest;
2316
2317 dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy);
2318 transpose4x8_8x4_low_neon(dstvec, &dest);
2319 vst1_lane_u32((uint32_t *)(dst + stride * 0),
2320 vreinterpret_u32_u16(dest.val[0]), 0);
2321 vst1_lane_u32((uint32_t *)(dst + stride * 1),
2322 vreinterpret_u32_u16(dest.val[0]), 1);
2323 vst1_lane_u32((uint32_t *)(dst + stride * 2),
2324 vreinterpret_u32_u16(dest.val[1]), 0);
2325 vst1_lane_u32((uint32_t *)(dst + stride * 3),
2326 vreinterpret_u32_u16(dest.val[1]), 1);
2327}
2328
2329static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
2330 const uint8_t *left, int upsample_left,
2331 int dy) {
2332 uint8x8_t dstvec[8];
2333 uint32x2x2_t d[4];
2334
2335 dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy);
2336 transpose8x8_neon(dstvec, d);
2337 vst1_u32((uint32_t *)(dst + 0 * stride), d[0].val[0]);
2338 vst1_u32((uint32_t *)(dst + 1 * stride), d[0].val[1]);
2339 vst1_u32((uint32_t *)(dst + 2 * stride), d[1].val[0]);
2340 vst1_u32((uint32_t *)(dst + 3 * stride), d[1].val[1]);
2341 vst1_u32((uint32_t *)(dst + 4 * stride), d[2].val[0]);
2342 vst1_u32((uint32_t *)(dst + 5 * stride), d[2].val[1]);
2343 vst1_u32((uint32_t *)(dst + 6 * stride), d[3].val[0]);
2344 vst1_u32((uint32_t *)(dst + 7 * stride), d[3].val[1]);
2345}
2346
2347static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride,
2348 const uint8_t *left, int upsample_left,
2349 int dy) {
2350 uint8x8_t dstvec[4];
2351 uint16x4x2_t d[2];
2352
2353 dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy);
2354 transpose4x8_8x4_neon(dstvec, d);
2355 vst1_lane_u32((uint32_t *)(dst + stride * 0),
2356 vreinterpret_u32_u16(d[0].val[0]), 0);
2357 vst1_lane_u32((uint32_t *)(dst + stride * 1),
2358 vreinterpret_u32_u16(d[0].val[0]), 1);
2359 vst1_lane_u32((uint32_t *)(dst + stride * 2),
2360 vreinterpret_u32_u16(d[0].val[1]), 0);
2361 vst1_lane_u32((uint32_t *)(dst + stride * 3),
2362 vreinterpret_u32_u16(d[0].val[1]), 1);
2363 vst1_lane_u32((uint32_t *)(dst + stride * 4),
2364 vreinterpret_u32_u16(d[1].val[0]), 0);
2365 vst1_lane_u32((uint32_t *)(dst + stride * 5),
2366 vreinterpret_u32_u16(d[1].val[0]), 1);
2367 vst1_lane_u32((uint32_t *)(dst + stride * 6),
2368 vreinterpret_u32_u16(d[1].val[1]), 0);
2369 vst1_lane_u32((uint32_t *)(dst + stride * 7),
2370 vreinterpret_u32_u16(d[1].val[1]), 1);
2371}
2372
2373static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
2374 const uint8_t *left, int upsample_left,
2375 int dy) {
2376 uint8x8_t dstvec[8];
2377 uint32x2x2_t d[2];
2378
2379 dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy);
2380 transpose8x8_low_neon(dstvec, d);
2381 vst1_u32((uint32_t *)(dst + 0 * stride), d[0].val[0]);
2382 vst1_u32((uint32_t *)(dst + 1 * stride), d[0].val[1]);
2383 vst1_u32((uint32_t *)(dst + 2 * stride), d[1].val[0]);
2384 vst1_u32((uint32_t *)(dst + 3 * stride), d[1].val[1]);
2385}
2386
2387static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride,
2388 const uint8_t *left, int upsample_left,
2389 int dy) {
2390 uint8x16_t dstvec[8];
2391 uint64x2_t d[8];
2392
2393 dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy);
2394 transpose8x16_16x8_neon(dstvec, d);
2395 for (int i = 0; i < 8; i++) {
2396 vst1_u8(dst + i * stride, vreinterpret_u8_u64(vget_low_u64(d[i])));
2397 vst1_u8(dst + (i + 8) * stride, vreinterpret_u8_u64(vget_high_u64(d[i])));
2398 }
2399}
2400
2401static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride,
2402 const uint8_t *left, int upsample_left,
2403 int dy) {
2404 uint8x8_t dstvec[16];
2405 uint64x2_t d[8];
2406
2407 dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy);
2408 transpose16x8_8x16_neon(dstvec, d);
2409 for (int i = 0; i < 8; i++) {
2410 vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
2411 }
2412}
2413
2414static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride,
2415 const uint8_t *left, int upsample_left,
2416 int dy) {
2417 uint8x16_t dstvec[4];
2418 uint16x8x2_t d[2];
2419
2420 dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy);
2421 transpose4x16_neon(dstvec, d);
2422 vst1q_lane_u32((uint32_t *)(dst + stride * 0),
2423 vreinterpretq_u32_u16(d[0].val[0]), 0);
2424 vst1q_lane_u32((uint32_t *)(dst + stride * 1),
2425 vreinterpretq_u32_u16(d[0].val[0]), 1);
2426 vst1q_lane_u32((uint32_t *)(dst + stride * 2),
2427 vreinterpretq_u32_u16(d[0].val[0]), 2);
2428 vst1q_lane_u32((uint32_t *)(dst + stride * 3),
2429 vreinterpretq_u32_u16(d[0].val[0]), 3);
2430
2431 vst1q_lane_u32((uint32_t *)(dst + stride * 4),
2432 vreinterpretq_u32_u16(d[0].val[1]), 0);
2433 vst1q_lane_u32((uint32_t *)(dst + stride * 5),
2434 vreinterpretq_u32_u16(d[0].val[1]), 1);
2435 vst1q_lane_u32((uint32_t *)(dst + stride * 6),
2436 vreinterpretq_u32_u16(d[0].val[1]), 2);
2437 vst1q_lane_u32((uint32_t *)(dst + stride * 7),
2438 vreinterpretq_u32_u16(d[0].val[1]), 3);
2439
2440 vst1q_lane_u32((uint32_t *)(dst + stride * 8),
2441 vreinterpretq_u32_u16(d[1].val[0]), 0);
2442 vst1q_lane_u32((uint32_t *)(dst + stride * 9),
2443 vreinterpretq_u32_u16(d[1].val[0]), 1);
2444 vst1q_lane_u32((uint32_t *)(dst + stride * 10),
2445 vreinterpretq_u32_u16(d[1].val[0]), 2);
2446 vst1q_lane_u32((uint32_t *)(dst + stride * 11),
2447 vreinterpretq_u32_u16(d[1].val[0]), 3);
2448
2449 vst1q_lane_u32((uint32_t *)(dst + stride * 12),
2450 vreinterpretq_u32_u16(d[1].val[1]), 0);
2451 vst1q_lane_u32((uint32_t *)(dst + stride * 13),
2452 vreinterpretq_u32_u16(d[1].val[1]), 1);
2453 vst1q_lane_u32((uint32_t *)(dst + stride * 14),
2454 vreinterpretq_u32_u16(d[1].val[1]), 2);
2455 vst1q_lane_u32((uint32_t *)(dst + stride * 15),
2456 vreinterpretq_u32_u16(d[1].val[1]), 3);
2457}
2458
2459static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
2460 const uint8_t *left, int upsample_left,
2461 int dy) {
2462 uint8x8_t dstvec[16];
2463 uint64x2_t d[8];
2464
2465 dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy);
2466 transpose16x8_8x16_neon(dstvec, d);
2467 for (int i = 0; i < 4; i++) {
2468 vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
2469 }
2470}
2471
2472static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride,
2473 const uint8_t *left, int upsample_left,
2474 int dy) {
2475 uint8x16x2_t dstvec[16];
2476 uint64x2x2_t d[16];
2477 uint8x16_t v_zero = vdupq_n_u8(0);
2478
2479 dr_prediction_z1_32xN_internal_neon(8, dstvec, left, upsample_left, dy);
2480 for (int i = 8; i < 16; i++) {
2481 dstvec[i].val[0] = v_zero;
2482 dstvec[i].val[1] = v_zero;
2483 }
2484 transpose16x32_neon(dstvec, d);
2485 for (int i = 0; i < 16; i++) {
2486 vst1_u8(dst + 2 * i * stride,
2487 vreinterpret_u8_u64(vget_low_u64(d[i].val[0])));
2488 vst1_u8(dst + (2 * i + 1) * stride,
2489 vreinterpret_u8_u64(vget_low_u64(d[i].val[1])));
2490 }
2491}
2492
2493static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride,
2494 const uint8_t *left, int upsample_left,
2495 int dy) {
2496 uint8x8_t dstvec[32];
2497 uint64x2_t d[16];
2498
2499 dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy);
2500 transpose16x8_8x16_neon(dstvec, d);
2501 transpose16x8_8x16_neon(dstvec + 16, d + 8);
2502 for (int i = 0; i < 8; i++) {
2503 vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
2504 vst1q_u8(dst + i * stride + 16, vreinterpretq_u8_u64(d[i + 8]));
2505 }
2506}
2507
2508static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride,
2509 const uint8_t *left, int upsample_left,
2510 int dy) {
2511 uint8x16_t dstvec[16];
2512 uint64x2_t d[16];
2513
2514 dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy);
2515 transpose16x16_neon(dstvec, d);
2516 for (int i = 0; i < 16; i++) {
2517 vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
2518 }
2519}
2520
2521static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride,
2522 const uint8_t *left, int upsample_left,
2523 int dy) {
2524 uint8x16x2_t dstvec[32];
2525 uint64x2x2_t d[32];
2526
2527 dr_prediction_z1_32xN_internal_neon(32, dstvec, left, upsample_left, dy);
2528 transpose16x32_neon(dstvec, d);
2529 transpose16x32_neon(dstvec + 16, d + 16);
2530 for (int i = 0; i < 16; i++) {
2531 vst1q_u8(dst + 2 * i * stride, vreinterpretq_u8_u64(d[i].val[0]));
2532 vst1q_u8(dst + 2 * i * stride + 16, vreinterpretq_u8_u64(d[i + 16].val[0]));
2533 vst1q_u8(dst + (2 * i + 1) * stride, vreinterpretq_u8_u64(d[i].val[1]));
2534 vst1q_u8(dst + (2 * i + 1) * stride + 16,
2535 vreinterpretq_u8_u64(d[i + 16].val[1]));
2536 }
2537}
2538
2539static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride,
2540 const uint8_t *left, int upsample_left,
2541 int dy) {
2542 DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
2543
2544 dr_prediction_z1_64xN_neon(64, dstT, 64, left, upsample_left, dy);
2545 transpose(dstT, 64, dst, stride, 64, 64);
2546}
2547
2548static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride,
2549 const uint8_t *left, int upsample_left,
2550 int dy) {
2551 uint8x16x2_t dstvec[16];
2552 uint64x2x2_t d[16];
2553
2554 dr_prediction_z1_32xN_internal_neon(16, dstvec, left, upsample_left, dy);
2555 transpose16x32_neon(dstvec, d);
2556 for (int i = 0; i < 16; i++) {
2557 vst1q_u8(dst + 2 * i * stride, vreinterpretq_u8_u64(d[i].val[0]));
2558 vst1q_u8(dst + (2 * i + 1) * stride, vreinterpretq_u8_u64(d[i].val[1]));
2559 }
2560}
2561
2562static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride,
2563 const uint8_t *left, int upsample_left,
2564 int dy) {
2565 uint8x16_t dstvec[32];
2566 uint64x2_t d[16];
2567
2568 dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy);
2569 for (int i = 0; i < 32; i += 16) {
2570 transpose16x16_neon((dstvec + i), d);
2571 for (int j = 0; j < 16; j++) {
2572 vst1q_u8(dst + j * stride + i, vreinterpretq_u8_u64(d[j]));
2573 }
2574 }
2575}
2576
2577static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride,
2578 const uint8_t *left, int upsample_left,
2579 int dy) {
2580 uint8_t dstT[64 * 32];
2581
2582 dr_prediction_z1_64xN_neon(32, dstT, 64, left, upsample_left, dy);
2583 transpose(dstT, 64, dst, stride, 32, 64);
2584}
2585
2586static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride,
2587 const uint8_t *left, int upsample_left,
2588 int dy) {
2589 uint8_t dstT[32 * 64];
2590
2591 dr_prediction_z1_32xN_neon(64, dstT, 32, left, upsample_left, dy);
2592 transpose(dstT, 32, dst, stride, 64, 32);
2593}
2594
2595static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride,
2596 const uint8_t *left, int upsample_left,
2597 int dy) {
2598 uint8_t dstT[64 * 16];
2599
2600 dr_prediction_z1_64xN_neon(16, dstT, 64, left, upsample_left, dy);
2601 transpose(dstT, 64, dst, stride, 16, 64);
2602}
2603
2604static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride,
2605 const uint8_t *left, int upsample_left,
2606 int dy) {
2607 uint8x16_t dstvec[64];
2608 uint64x2_t d[16];
2609
2610 dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy);
2611 for (int i = 0; i < 64; i += 16) {
2612 transpose16x16_neon((dstvec + i), d);
2613 for (int j = 0; j < 16; j++) {
2614 vst1q_u8(dst + j * stride + i, vreinterpretq_u8_u64(d[j]));
2615 }
2616 }
2617}
2618
2619void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
2620 const uint8_t *above, const uint8_t *left,
2621 int upsample_left, int dx, int dy) {
2622 (void)above;
2623 (void)dx;
2624 assert(dx == 1);
2625 assert(dy > 0);
2626
2627 if (bw == bh) {
2628 switch (bw) {
2629 case 4:
2630 dr_prediction_z3_4x4_neon(dst, stride, left, upsample_left, dy);
2631 break;
2632 case 8:
2633 dr_prediction_z3_8x8_neon(dst, stride, left, upsample_left, dy);
2634 break;
2635 case 16:
2636 dr_prediction_z3_16x16_neon(dst, stride, left, upsample_left, dy);
2637 break;
2638 case 32:
2639 dr_prediction_z3_32x32_neon(dst, stride, left, upsample_left, dy);
2640 break;
2641 case 64:
2642 dr_prediction_z3_64x64_neon(dst, stride, left, upsample_left, dy);
2643 break;
2644 }
2645 } else {
2646 if (bw < bh) {
2647 if (bw + bw == bh) {
2648 switch (bw) {
2649 case 4:
2650 dr_prediction_z3_4x8_neon(dst, stride, left, upsample_left, dy);
2651 break;
2652 case 8:
2653 dr_prediction_z3_8x16_neon(dst, stride, left, upsample_left, dy);
2654 break;
2655 case 16:
2656 dr_prediction_z3_16x32_neon(dst, stride, left, upsample_left, dy);
2657 break;
2658 case 32:
2659 dr_prediction_z3_32x64_neon(dst, stride, left, upsample_left, dy);
2660 break;
2661 }
2662 } else {
2663 switch (bw) {
2664 case 4:
2665 dr_prediction_z3_4x16_neon(dst, stride, left, upsample_left, dy);
2666 break;
2667 case 8:
2668 dr_prediction_z3_8x32_neon(dst, stride, left, upsample_left, dy);
2669 break;
2670 case 16:
2671 dr_prediction_z3_16x64_neon(dst, stride, left, upsample_left, dy);
2672 break;
2673 }
2674 }
2675 } else {
2676 if (bh + bh == bw) {
2677 switch (bh) {
2678 case 4:
2679 dr_prediction_z3_8x4_neon(dst, stride, left, upsample_left, dy);
2680 break;
2681 case 8:
2682 dr_prediction_z3_16x8_neon(dst, stride, left, upsample_left, dy);
2683 break;
2684 case 16:
2685 dr_prediction_z3_32x16_neon(dst, stride, left, upsample_left, dy);
2686 break;
2687 case 32:
2688 dr_prediction_z3_64x32_neon(dst, stride, left, upsample_left, dy);
2689 break;
2690 }
2691 } else {
2692 switch (bh) {
2693 case 4:
2694 dr_prediction_z3_16x4_neon(dst, stride, left, upsample_left, dy);
2695 break;
2696 case 8:
2697 dr_prediction_z3_32x8_neon(dst, stride, left, upsample_left, dy);
2698 break;
2699 case 16:
2700 dr_prediction_z3_64x16_neon(dst, stride, left, upsample_left, dy);
2701 break;
2702 }
2703 }
2704 }
2705 }
2706}
2707static const int sm_weight_log2_scale = 8;
2708
2709// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
2710#define MAX_BLOCK_DIM 64
2711
2712/* clang-format off */
2713static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
2714 // Unused, because we always offset by bs, which is at least 2.
2715 0, 0,
2716 // bs = 2
2717 255, 128,
2718 // bs = 4
2719 255, 149, 85, 64,
2720 // bs = 8
2721 255, 197, 146, 105, 73, 50, 37, 32,
2722 // bs = 16
2723 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
2724 // bs = 32
2725 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
2726 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
2727 // bs = 64
2728 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
2729 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
2730 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
2731 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
2732};
2733/* clang-format on */
2734
2735// -----------------------------------------------------------------------------
2736// SMOOTH_PRED
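// The predictors below compute, per pixel, the fixed-point blend
//   pred(r, c) = (w[r] * above[c] + (256 - w[r]) * left[H - 1] +
//                 w[c] * left[r]  + (256 - w[c]) * above[W - 1] + 256) >> 9
// where w[] comes from sm_weight_arrays at the offset for the block dimension
// (scale = 1 << sm_weight_log2_scale = 256; the rounding shift is 9 because
// the vertical and horizontal interpolations are summed before averaging).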
2737
2738// pixels[0]: above and below_pred interleave vector
2739// pixels[1]: left vector
2740// pixels[2]: right_pred vector
2741static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
2742 int height, uint8x16_t *pixels) {
2743 uint32x4_t zero = vdupq_n_u32(0);
2744 const uint8x8_t d = vcreate_u8(((const uint32_t *)above)[0]);
2745 if (height == 4)
2746 pixels[1] =
2747 vreinterpretq_u8_u32(vld1q_lane_u32((const uint32_t *)left, zero, 0));
2748 else if (height == 8) {
2749 pixels[1] = vreinterpretq_u8_u64(vsetq_lane_u64(
2750 ((const uint64_t *)left)[0], vreinterpretq_u64_u32(zero), 0));
2751 } else {
2752 pixels[1] = vld1q_u8(left);
2753 }
2754
2755 pixels[2] = vreinterpretq_u8_u16(vdupq_n_u16(above[3]));
2756
2757 const uint16x8_t bp = vdupq_n_u16(left[height - 1]);
2758#if defined(__aarch64__)
2759 pixels[0] = vreinterpretq_u8_u16(vzip1q_u16(vmovl_u8(d), bp));
2760#else
2761 pixels[0] = vreinterpretq_u8_u16(vzipq_u16(vmovl_u8(d), bp).val[0]);
2762#endif // (__aarch64__)
2763}
2764
2765// weight_h[0]: weight_h vector
2766// weight_h[1]: scale - weight_h vector
2767// weight_h[2]: same as [0], second half for height = 16 only
2768// weight_h[3]: same as [1], second half for height = 16 only
2769// weight_w[0]: weights_w and scale - weights_w interleave vector
2770static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
2771 uint16x8_t *weight_h, uint16x8_t *weight_w) {
2772 const uint16x8_t d = vdupq_n_u16((uint16_t)(1 << sm_weight_log2_scale));
2773 const uint8x8_t t = vcreate_u8(((const uint32_t *)(weight_array))[1]);
2774 weight_h[0] = vmovl_u8(t);
2775 weight_h[1] = vsubw_u8(d, t);
2776#if defined(__aarch64__)
2777 weight_w[0] = vzip1q_u16(weight_h[0], weight_h[1]);
2778#else
2779 weight_w[0] = vzipq_u16(weight_h[0], weight_h[1]).val[0];
2780#endif // (__aarch64__)
2781
2782 if (height == 8) {
2783 const uint8x8_t weight = vld1_u8(&weight_array[8]);
2784 weight_h[0] = vmovl_u8(weight);
2785 weight_h[1] = vsubw_u8(d, weight);
2786 } else if (height == 16) {
2787 const uint8x16_t zero = vdupq_n_u8(0);
2788 const uint8x16_t weight = vld1q_u8(&weight_array[16]);
2789 const uint8x16x2_t weight_h_02 = vzipq_u8(weight, zero);
2790 weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
2791 weight_h[1] = vsubq_u16(d, vreinterpretq_u16_u8(weight_h_02.val[0]));
2792 weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
2793 weight_h[3] = vsubq_u16(d, vreinterpretq_u16_u8(weight_h_02.val[1]));
2794 }
2795}
2796
2797static INLINE void smooth_pred_4xh(const uint8x16_t *pixel,
2798 const uint16x8_t *wh, const uint16x8_t *ww,
2799 int h, uint8_t *dst, ptrdiff_t stride,
2800 int second_half) {
2801 const uint16x4_t one = vdup_n_u16(1);
2802 const uint16x4_t inc = vdup_n_u16(0x202);
2803 uint16x4_t rep =
2804 second_half ? vdup_n_u16((uint16_t)0x8008) : vdup_n_u16((uint16_t)0x8000);
2805 uint16x4_t d = vdup_n_u16(0x100);
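  // d and rep are byte-index vectors for the table lookups below: d starts as
  // bytes {0, 1} in every lane and advances by {2, 2} per row, selecting the
  // r-th uint16 weight from wh[]; rep starts as {0, 0x80} (or {8, 0x80} for
  // the second half) and advances by 1, so the lookup pulls left[r] into the
  // low byte of each lane (indices >= 16 return 0 in vtbl/vqtbl).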
2806 const uint16x4_t v_pixel_0_lo = vmovn_u32(vreinterpretq_u32_u8(pixel[0]));
2807 const uint16x4_t v_pixel_0_hi =
2808 vmovn_u32(vreinterpretq_u32_u8(vextq_u8(pixel[0], pixel[0], 2)));
2809 const uint16x4_t v_pixel_2 = vget_low_u16(vreinterpretq_u16_u8(pixel[2]));
2810 const uint16x4_t ww_0_lo = vmovn_u32(vreinterpretq_u32_u16(ww[0]));
2811 const uint16x4_t ww_0_hi =
2812 vmovn_u32(vreinterpretq_u32_u16(vextq_u16(ww[0], ww[0], 1)));
2813 const uint8x8_t save_mask = vcreate_u8(0 + (2 << 8) + (4 << 16) + (6 << 24));
2814
2815#if !defined(__aarch64__)
2816 const uint8x8x2_t v_split1 = { { vget_low_u8(vreinterpretq_u8_u16(wh[0])),
2817 vget_high_u8(
2818 vreinterpretq_u8_u16(wh[0])) } };
2819 const uint8x8x2_t v_split2 = { { vget_low_u8(vreinterpretq_u8_u16(wh[1])),
2820 vget_high_u8(
2821 vreinterpretq_u8_u16(wh[1])) } };
2822 const uint8x8x2_t v_split3 = { { vget_low_u8(pixel[1]),
2823 vget_high_u8(pixel[1]) } };
2824#endif // (__aarch64__)
2825
2826 for (int i = 0; i < h; ++i) {
2827#if defined(__aarch64__)
2828 const uint8x8_t wg =
2829 vqtbl1_u8(vreinterpretq_u8_u16(wh[0]), vreinterpret_u8_u16(d));
2830 const uint8x8_t sc =
2831 vqtbl1_u8(vreinterpretq_u8_u16(wh[1]), vreinterpret_u8_u16(d));
2832#else
2833 const uint8x8_t wg = vtbl2_u8(v_split1, vreinterpret_u8_u16(d));
2834 const uint8x8_t sc = vtbl2_u8(v_split2, vreinterpret_u8_u16(d));
2835#endif // (__aarch64__)
2836
2837 uint32x4_t sum = vmull_u16(v_pixel_0_lo, vreinterpret_u16_u8(wg));
2838 sum = vmlal_u16(sum, v_pixel_0_hi, vreinterpret_u16_u8(sc));
2839
2840#if defined(__aarch64__)
2841 uint8x8_t b = vqtbl1_u8(pixel[1], vreinterpret_u8_u16(rep));
2842#else
2843 uint8x8_t b = vtbl2_u8(v_split3, vreinterpret_u8_u16(rep));
2844#endif // (__aarch64__)
2845
2846 sum = vmlal_u16(sum, vreinterpret_u16_u8(b), ww_0_lo);
2847 sum = vmlal_u16(sum, v_pixel_2, ww_0_hi);
2848 uint8x8_t sum_l = vreinterpret_u8_u16(vqrshrn_n_u32(sum, 9));
2849 uint32x2_t predsh = vreinterpret_u32_u8(vtbl1_u8(sum_l, save_mask));
2850 vst1_lane_u32((uint32_t *)dst, predsh, 0);
2851
2852 dst += stride;
2853
2854 rep = vadd_u16(rep, one);
2855 d = vadd_u16(d, inc);
2856 }
2857}
2858
2859void aom_smooth_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
2860 const uint8_t *above, const uint8_t *left) {
2861 uint8x16_t pixels[3];
2862 load_pixel_w4(above, left, 4, pixels);
2863
2864 uint16x8_t wh[4], ww[2];
2865 load_weight_w4(sm_weight_arrays, 4, wh, ww);
2866
2867 smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
2868}
2869
2870void aom_smooth_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
2871 const uint8_t *above, const uint8_t *left) {
2872 uint8x16_t pixels[3];
2873 load_pixel_w4(above, left, 8, pixels);
2874
2875 uint16x8_t wh[4], ww[2];
2876 load_weight_w4(sm_weight_arrays, 8, wh, ww);
2877
2878 smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
2879}
2880
2881void aom_smooth_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
2882 const uint8_t *above, const uint8_t *left) {
2883 uint8x16_t pixels[3];
2884 load_pixel_w4(above, left, 16, pixels);
2885
2886 uint16x8_t wh[4], ww[2];
2887 load_weight_w4(sm_weight_arrays, 16, wh, ww);
2888
2889 smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
2890 dst += stride << 3;
2891 smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
2892}
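// 4x16 is done in two 8-row passes: wh[0..1] hold the weights for rows 0-7
// and wh[2..3] those for rows 8-15, while second_half makes smooth_pred_4xh
// start its left-pixel lookup at left[8].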
2893
2894// pixels[0]: above and below_pred interleave vector, first half
2895// pixels[1]: above and below_pred interleave vector, second half
2896// pixels[2]: left vector
2897// pixels[3]: right_pred vector
2898// pixels[4]: above and below_pred interleave vector, first half
2899// pixels[5]: above and below_pred interleave vector, second half
2900// pixels[6]: left vector + 16
2901// pixels[7]: right_pred vector
2902static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
2903 int height, uint8x16_t *pixels) {
2904 pixels[0] = vreinterpretq_u8_u16(vmovl_u8(vld1_u8(above)));
2905 pixels[1] = vreinterpretq_u8_u16(vdupq_n_u16((uint16_t)left[height - 1]));
2906 pixels[3] = vreinterpretq_u8_u16(vdupq_n_u16((uint16_t)above[7]));
2907
2908 if (height == 4) {
2909 const uint32x4_t zero32 = vdupq_n_u32(0);
2910 pixels[2] =
2911 vreinterpretq_u8_u32(vld1q_lane_u32((const uint32_t *)left, zero32, 0));
2912 } else if (height == 8) {
2913 const uint64x2_t zero64 = vdupq_n_u64(0);
2914 pixels[2] = vreinterpretq_u8_u64(
2915 vsetq_lane_u64(((const uint64_t *)left)[0], zero64, 0));
2916 } else if (height == 16) {
2917 pixels[2] = vld1q_u8(left);
2918 } else {
2919 pixels[2] = vld1q_u8(left);
2920 pixels[4] = pixels[0];
2921 pixels[5] = pixels[1];
2922 pixels[6] = vld1q_u8(left + 16);
2923 pixels[7] = pixels[3];
2924 }
2925}
2926
// weight_h[0]: height weights, zero-extended to 16 bits
// weight_h[1]: scale (256) - weight_h[0]
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
// weight_w[0]: width (8) weights, zero-extended to 16 bits
// weight_w[1]: scale (256) - weight_w[0]
2937static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
2938 uint16x8_t *weight_h, uint16x8_t *weight_w) {
2939 const uint8x16_t zero = vdupq_n_u8(0);
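  // Weights for an edge of length N start at offset N in sm_weight_arrays, so
  // the height-4 weights sit at offset 4 and the width-8 (and height-8)
  // weights at offset 8.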
2940 const int we_offset = height < 8 ? 4 : 8;
2941 uint8x16_t we = vld1q_u8(&weight_array[we_offset]);
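  // Interleaving the 8-bit weights with zero zero-extends them to 16 bits.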
2942#if defined(__aarch64__)
2943 weight_h[0] = vreinterpretq_u16_u8(vzip1q_u8(we, zero));
2944#else
2945 weight_h[0] = vreinterpretq_u16_u8(vzipq_u8(we, zero).val[0]);
2946#endif // (__aarch64__)
2947 const uint16x8_t d = vdupq_n_u16(256);
2948 weight_h[1] = vsubq_u16(d, weight_h[0]);
2949
2950 if (height == 4) {
2951 we = vextq_u8(we, zero, 4);
2952#if defined(__aarch64__)
2953 weight_w[0] = vreinterpretq_u16_u8(vzip1q_u8(we, zero));
2954#else
2955 weight_w[0] = vmovl_u8(vget_low_u8(we));
2956#endif // (__aarch64__)
2957 weight_w[1] = vsubq_u16(d, weight_w[0]);
2958 } else {
2959 weight_w[0] = weight_h[0];
2960 weight_w[1] = weight_h[1];
2961 }
2962
2963 if (height == 16) {
2964 we = vld1q_u8(&weight_array[16]);
2965 const uint8x16x2_t weight_h_02 = vzipq_u8(we, zero);
2966 weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
2967 weight_h[1] = vsubq_u16(d, weight_h[0]);
2968 weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
2969 weight_h[3] = vsubq_u16(d, weight_h[2]);
2970 } else if (height == 32) {
2971 const uint8x16_t weight_lo = vld1q_u8(&weight_array[32]);
2972 const uint8x16x2_t weight_h_02 = vzipq_u8(weight_lo, zero);
2973 weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
2974 weight_h[1] = vsubq_u16(d, weight_h[0]);
2975 weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
2976 weight_h[3] = vsubq_u16(d, weight_h[2]);
2977 const uint8x16_t weight_hi = vld1q_u8(&weight_array[32 + 16]);
2978 const uint8x16x2_t weight_h_46 = vzipq_u8(weight_hi, zero);
2979 weight_h[4] = vreinterpretq_u16_u8(weight_h_46.val[0]);
2980 weight_h[5] = vsubq_u16(d, weight_h[4]);
2981 weight_h[6] = vreinterpretq_u16_u8(weight_h_46.val[1]);
2982 weight_h[7] = vsubq_u16(d, weight_h[6]);
2983 }
2984}
2985
2986static INLINE void smooth_pred_8xh(const uint8x16_t *pixels,
2987 const uint16x8_t *wh, const uint16x8_t *ww,
2988 int h, uint8_t *dst, ptrdiff_t stride,
2989 int second_half) {
2990 const uint16x8_t one = vdupq_n_u16(1);
2991 const uint16x8_t inc = vdupq_n_u16(0x202);
2992 uint16x8_t rep = second_half ? vdupq_n_u16((uint16_t)0x8008)
2993 : vdupq_n_u16((uint16_t)0x8000);
2994 uint16x8_t d = vdupq_n_u16(0x100);
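  // d holds byte-pair indices {0x00, 0x01} that gather row i's 16-bit height
  // weight into every lane; inc (0x0202) steps to the next weight each row.
  // rep does the same for the left pixel: its low byte indexes left[i] and its
  // high byte (0x80) is out of range, so the lookup returns zero and the pixel
  // comes back zero-extended to 16 bits.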
2995
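  // AArch32 vtbl2_u8 takes its 16-byte table as a pair of 8-byte halves, so
  // split the q registers up front.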
2996#if !defined(__aarch64__)
2997 const uint8x8x2_t v_split1 = { { vget_low_u8(vreinterpretq_u8_u16(wh[0])),
2998 vget_high_u8(
2999 vreinterpretq_u8_u16(wh[0])) } };
3000 const uint8x8x2_t v_split2 = { { vget_low_u8(vreinterpretq_u8_u16(wh[1])),
3001 vget_high_u8(
3002 vreinterpretq_u8_u16(wh[1])) } };
3003 const uint8x8x2_t v_split3 = { { vget_low_u8(pixels[2]),
3004 vget_high_u8(pixels[2]) } };
3005#endif
3006
3007 for (int i = 0; i < h; ++i) {
3008#if defined(__aarch64__)
3009 const uint8x16_t wg_wg =
3010 vqtbl1q_u8(vreinterpretq_u8_u16(wh[0]), vreinterpretq_u8_u16(d));
3011 const uint8x16_t sc_sc =
3012 vqtbl1q_u8(vreinterpretq_u8_u16(wh[1]), vreinterpretq_u8_u16(d));
3013#else
3014 const uint8x8_t v_d_lo = vreinterpret_u8_u16(vget_low_u16(d));
3015 const uint8x8_t v_d_hi = vreinterpret_u8_u16(vget_high_u16(d));
3016 const uint8x16_t wg_wg =
3017 vcombine_u8(vtbl2_u8(v_split1, v_d_lo), vtbl2_u8(v_split1, v_d_hi));
3018 const uint8x16_t sc_sc =
3019 vcombine_u8(vtbl2_u8(v_split2, v_d_lo), vtbl2_u8(v_split2, v_d_hi));
3020#endif // (__aarch64__)
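    // Vertical component: w_h[i] * above + (256 - w_h[i]) * below_pred.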
3021 uint16x8_t s01 =
3022 vmulq_u16(vreinterpretq_u16_u8(pixels[0]), vreinterpretq_u16_u8(wg_wg));
3023 s01 = vmlaq_u16(s01, vreinterpretq_u16_u8(pixels[1]),
3024 vreinterpretq_u16_u8(sc_sc));
3025#if defined(__aarch64__)
3026 const uint8x16_t b = vqtbl1q_u8(pixels[2], vreinterpretq_u8_u16(rep));
3027#else
3028 const uint8x16_t b = vcombine_u8(
3029 vtbl2_u8(v_split3, vget_low_u8(vreinterpretq_u8_u16(rep))),
3030 vtbl2_u8(v_split3, vget_high_u8(vreinterpretq_u8_u16(rep))));
3031#endif // (__aarch64__)
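    // Horizontal component: w_w * left[i] + (256 - w_w) * right_pred.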
3032 uint16x8_t sum0 = vmulq_u16(vreinterpretq_u16_u8(b), ww[0]);
3033 sum0 = vmlaq_u16(sum0, vreinterpretq_u16_u8(pixels[3]), ww[1]);
3034
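    // Each component can reach 255 * 256, so widen to 32 bits before adding
    // them; their sum does not fit in 16 bits.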
3035 uint32x4_t s0 = vaddl_u16(vget_low_u16(s01), vget_low_u16(sum0));
3036#if defined(__aarch64__)
3037 uint32x4_t s1 = vaddl_high_u16(s01, sum0);
3038#else
3039 uint32x4_t s1 = vaddl_u16(vget_high_u16(s01), vget_high_u16(sum0));
3040#endif // (__aarch64__)
3041
3042 sum0 = vcombine_u16(vqrshrn_n_u32(s0, 9), vqrshrn_n_u32(s1, 9));
3043 uint8x8_t predsh = vqmovn_u16(sum0);
3044 vst1_u8(dst, predsh);
3045
3046 dst += stride;
3047 rep = vaddq_u16(rep, one);
3048 d = vaddq_u16(d, inc);
3049 }
3050}
3051
3052void aom_smooth_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
3053 const uint8_t *above, const uint8_t *left) {
3054 uint8x16_t pixels[4];
3055 load_pixel_w8(above, left, 4, pixels);
3056
3057 uint16x8_t wh[4], ww[2];
3058 load_weight_w8(sm_weight_arrays, 4, wh, ww);
3059
3060 smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
3061}
3062
3063void aom_smooth_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
3064 const uint8_t *above, const uint8_t *left) {
3065 uint8x16_t pixels[4];
3066 load_pixel_w8(above, left, 8, pixels);
3067
3068 uint16x8_t wh[4], ww[2];
3069 load_weight_w8(sm_weight_arrays, 8, wh, ww);
3070
3071 smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
3072}
3073
3074void aom_smooth_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
3075 const uint8_t *above, const uint8_t *left) {
3076 uint8x16_t pixels[4];
3077 load_pixel_w8(above, left, 16, pixels);
3078
3079 uint16x8_t wh[4], ww[2];
3080 load_weight_w8(sm_weight_arrays, 16, wh, ww);
3081
3082 smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
3083 dst += stride << 3;
3084 smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
3085}
3086
3087void aom_smooth_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
3088 const uint8_t *above, const uint8_t *left) {
3089 uint8x16_t pixels[8];
3090 load_pixel_w8(above, left, 32, pixels);
3091
3092 uint16x8_t wh[8], ww[2];
3093 load_weight_w8(sm_weight_arrays, 32, wh, ww);
3094
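  // 32 rows are produced as four 8-row passes; pixels[4..7] and wh[4..7]
  // cover rows 16..31.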
3095 smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
3096 dst += stride << 3;
3097 smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
3098 dst += stride << 3;
3099 smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
3100 dst += stride << 3;
3101 smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
3102}
3103
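// Generic path for the 16-, 32- and 64-wide blocks: evaluates the SMOOTH
// formula directly, eight output pixels per iteration, keeping the
// intermediate sums in 32 bits.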
3104static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
3105 const uint8_t *above,
3106 const uint8_t *left, uint32_t bw,
3107 uint32_t bh) {
3108 const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
3109 const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
3110 const uint16x8_t scale_value = vdupq_n_u16(256);
3111
3112 for (uint32_t y = 0; y < bh; ++y) {
3113 const uint8x8_t left_y = vdup_n_u8(left[y]);
3114 const uint8x8_t weights_y_dup = vdup_n_u8(sm_weights_h[y]);
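    // Row constant: the below_pred term plus 256, the rounding bias for the
    // final truncating shift by 9.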
3115 const uint32x4_t pred_scaled_bl =
3116 vdupq_n_u32(256 + (256 - sm_weights_h[y]) * left[bh - 1]);
3117
3118 for (uint32_t x = 0; x < bw; x += 8) {
3119 const uint8x8_t weights_x = vld1_u8(sm_weights_w + x);
3120 const uint8x8_t top_x = vld1_u8(above + x);
3121
3122 uint16x8_t pred_m1, pred_m2;
3123 uint32x4_t pred_lo, pred_hi;
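      // pred_m1: vertical term w_h[y] * above[x];
      // pred_m2: horizontal term w_w[x] * left[y].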
3124 pred_m1 = vmull_u8(top_x, weights_y_dup);
3125 pred_m2 = vmull_u8(weights_x, left_y);
3126
3127 pred_lo = vaddl_u16(vget_low_u16(pred_m1), vget_low_u16(pred_m2));
3128#if defined(__aarch64__)
3129 pred_hi = vaddl_high_u16(pred_m1, pred_m2);
3130#else
3131 pred_hi = vaddl_u16(vget_high_u16(pred_m1), vget_high_u16(pred_m2));
3132#endif // (__aarch64__)
3133
3134 const uint16x8_t scale_m_weights_x = vsubw_u8(scale_value, weights_x);
3135
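      // right_pred term: (256 - w_w[x]) * above[bw - 1].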
3136 const uint16x8_t swxtr = vmulq_n_u16(scale_m_weights_x, above[bw - 1]);
3137
3138 pred_lo = vaddq_u32(pred_lo, pred_scaled_bl);
3139 pred_hi = vaddq_u32(pred_hi, pred_scaled_bl);
3140
3141 pred_lo = vaddw_u16(pred_lo, vget_low_u16(swxtr));
3142#if defined(__aarch64__)
3143 pred_hi = vaddw_high_u16(pred_hi, swxtr);
3144#else
3145 pred_hi = vaddw_u16(pred_hi, vget_high_u16(swxtr));
3146#endif // (__aarch64__)
3147
3148 uint16x8_t pred =
3149 vcombine_u16(vshrn_n_u32(pred_lo, 9), vshrn_n_u32(pred_hi, 9));
3150
3151 uint8x8_t predsh = vqmovn_u16(pred);
3152
3153 vst1_u8(dst + x, predsh);
3154 }
3155
3156 dst += stride;
3157 }
3158}
3159
3160void aom_smooth_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
3161 const uint8_t *above, const uint8_t *left) {
3162 smooth_predictor_wxh(dst, stride, above, left, 16, 4);
3163}
3164
3165void aom_smooth_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
3166 const uint8_t *above, const uint8_t *left) {
3167 smooth_predictor_wxh(dst, stride, above, left, 16, 8);
3168}
3169
3170void aom_smooth_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
3171 const uint8_t *above,
3172 const uint8_t *left) {
3173 smooth_predictor_wxh(dst, stride, above, left, 16, 16);
3174}
3175
3176void aom_smooth_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
3177 const uint8_t *above,
3178 const uint8_t *left) {
3179 smooth_predictor_wxh(dst, stride, above, left, 16, 32);
3180}
3181
3182void aom_smooth_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
3183 const uint8_t *above, const uint8_t *left) {
3184 smooth_predictor_wxh(dst, stride, above, left, 32, 8);
3185}
3186
3187void aom_smooth_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
3188 const uint8_t *above,
3189 const uint8_t *left) {
3190 smooth_predictor_wxh(dst, stride, above, left, 32, 16);
3191}
3192
3193void aom_smooth_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
3194 const uint8_t *above,
3195 const uint8_t *left) {
3196 smooth_predictor_wxh(dst, stride, above, left, 32, 32);
3197}
3198
3199void aom_smooth_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
3200 const uint8_t *above,
3201 const uint8_t *left) {
3202 smooth_predictor_wxh(dst, stride, above, left, 32, 64);
3203}
3204
3205void aom_smooth_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
3206 const uint8_t *above,
3207 const uint8_t *left) {
3208 smooth_predictor_wxh(dst, stride, above, left, 64, 64);
3209}
3210
3211void aom_smooth_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
3212 const uint8_t *above,
3213 const uint8_t *left) {
3214 smooth_predictor_wxh(dst, stride, above, left, 64, 32);
3215}
3216
3217void aom_smooth_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
3218 const uint8_t *above,
3219 const uint8_t *left) {
3220 smooth_predictor_wxh(dst, stride, above, left, 64, 16);
3221}
3222
3223void aom_smooth_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
3224 const uint8_t *above,
3225 const uint8_t *left) {
3226 smooth_predictor_wxh(dst, stride, above, left, 16, 64);
3227}