blob: 69f408456c3ee7dbaaa373e853682c3c8f596f78 [file] [log] [blame]
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
12#include <arm_neon.h>
13
Tom Finegan60e653d2018-05-22 11:34:58 -070014#include "config/aom_config.h"
Tom Finegan44702c82018-05-22 13:00:39 -070015#include "config/aom_dsp_rtcd.h"
Tom Finegan60e653d2018-05-22 11:34:58 -070016
Yaowu Xuf883b422016-08-30 14:01:10 -070017#include "aom/aom_integer.h"
Jonathan Wright93fe5262023-01-31 15:50:20 +000018#include "aom_dsp/arm/mem_neon.h"
James Zern81a0c432022-05-18 13:47:48 -070019#include "aom_dsp/arm/sum_neon.h"
Yaowu Xuc27fc142016-08-22 16:08:15 -070020
Jonathan Wright0a770ff2023-04-24 15:27:07 +010021static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
22 uint16x8_t *const sad_sum) {
23 uint8x16_t abs_diff = vabdq_u8(src, ref);
24 *sad_sum = vpadalq_u8(*sad_sum, abs_diff);
25}
26
27static INLINE void sadwxhx3d_large_neon(const uint8_t *src, int src_stride,
28 const uint8_t *const ref[3],
29 int ref_stride, uint32_t res[3], int w,
30 int h, int h_overflow) {
31 uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
32 int h_limit = h > h_overflow ? h_overflow : h;
33
34 int ref_offset = 0;
35 int i = 0;
36 do {
37 uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
38 uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
39
40 do {
41 int j = 0;
42 do {
43 const uint8x16_t s0 = vld1q_u8(src + j);
44 sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
45 sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
46 sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
47
48 const uint8x16_t s1 = vld1q_u8(src + j + 16);
49 sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
50 sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
51 sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
52
53 j += 32;
54 } while (j < w);
55
56 src += src_stride;
57 ref_offset += ref_stride;
58 } while (++i < h_limit);
59
60 sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
61 sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
62 sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
63 sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
64 sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
65 sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
66
67 h_limit += h_overflow;
68 } while (i < h);
69
70 res[0] = horizontal_add_u32x4(sum[0]);
71 res[1] = horizontal_add_u32x4(sum[1]);
72 res[2] = horizontal_add_u32x4(sum[2]);
73}
74
75static INLINE void sad128xhx3d_neon(const uint8_t *src, int src_stride,
76 const uint8_t *const ref[3], int ref_stride,
77 uint32_t res[3], int h) {
78 sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32);
79}
80
81static INLINE void sad64xhx3d_neon(const uint8_t *src, int src_stride,
82 const uint8_t *const ref[3], int ref_stride,
83 uint32_t res[3], int h) {
84 sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64);
85}
86
87static INLINE void sad32xhx3d_neon(const uint8_t *src, int src_stride,
88 const uint8_t *const ref[3], int ref_stride,
89 uint32_t res[3], int h) {
90 uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
91 uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
92
93 int ref_offset = 0;
94 int i = h;
95 do {
96 const uint8x16_t s0 = vld1q_u8(src);
97 sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]);
98 sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]);
99 sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]);
100
101 const uint8x16_t s1 = vld1q_u8(src + 16);
102 sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]);
103 sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]);
104 sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]);
105
106 src += src_stride;
107 ref_offset += ref_stride;
108 } while (--i != 0);
109
110 res[0] = horizontal_long_add_u16x8(sum_lo[0], sum_hi[0]);
111 res[1] = horizontal_long_add_u16x8(sum_lo[1], sum_hi[1]);
112 res[2] = horizontal_long_add_u16x8(sum_lo[2], sum_hi[2]);
113}
114
115static INLINE void sad16xhx3d_neon(const uint8_t *src, int src_stride,
116 const uint8_t *const ref[3], int ref_stride,
117 uint32_t res[3], int h) {
118 uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
119
120 int ref_offset = 0;
121 int i = h;
122 do {
123 const uint8x16_t s = vld1q_u8(src);
124 sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
125 sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
126 sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
127
128 src += src_stride;
129 ref_offset += ref_stride;
130 } while (--i != 0);
131
132 res[0] = horizontal_add_u16x8(sum[0]);
133 res[1] = horizontal_add_u16x8(sum[1]);
134 res[2] = horizontal_add_u16x8(sum[2]);
135}
136
Jonathan Wright0a770ff2023-04-24 15:27:07 +0100137static INLINE void sad8xhx3d_neon(const uint8_t *src, int src_stride,
138 const uint8_t *const ref[3], int ref_stride,
139 uint32_t res[3], int h) {
140 uint16x8_t sum[3];
141
142 uint8x8_t s = vld1_u8(src);
143 sum[0] = vabdl_u8(s, vld1_u8(ref[0]));
144 sum[1] = vabdl_u8(s, vld1_u8(ref[1]));
145 sum[2] = vabdl_u8(s, vld1_u8(ref[2]));
146
147 src += src_stride;
148 int ref_offset = ref_stride;
149 int i = h - 1;
150 do {
151 s = vld1_u8(src);
152 sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset));
153 sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset));
154 sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset));
155
156 src += src_stride;
157 ref_offset += ref_stride;
158 } while (--i != 0);
159
160 res[0] = horizontal_add_u16x8(sum[0]);
161 res[1] = horizontal_add_u16x8(sum[1]);
162 res[2] = horizontal_add_u16x8(sum[2]);
163}
164
165static INLINE void sad4xhx3d_neon(const uint8_t *src, int src_stride,
166 const uint8_t *const ref[3], int ref_stride,
167 uint32_t res[3], int h) {
168 assert(h % 2 == 0);
169 uint16x8_t sum[3];
170
171 uint8x8_t s = load_unaligned_u8(src, src_stride);
172 uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride);
173 uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride);
174 uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride);
175
176 sum[0] = vabdl_u8(s, r0);
177 sum[1] = vabdl_u8(s, r1);
178 sum[2] = vabdl_u8(s, r2);
179
180 src += 2 * src_stride;
181 int ref_offset = 2 * ref_stride;
182 int i = (h / 2) - 1;
183 do {
184 s = load_unaligned_u8(src, src_stride);
185 r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
186 r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
187 r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
188
189 sum[0] = vabal_u8(sum[0], s, r0);
190 sum[1] = vabal_u8(sum[1], s, r1);
191 sum[2] = vabal_u8(sum[2], s, r2);
192
193 src += 2 * src_stride;
194 ref_offset += 2 * ref_stride;
195 } while (--i != 0);
196
197 res[0] = horizontal_add_u16x8(sum[0]);
198 res[1] = horizontal_add_u16x8(sum[1]);
199 res[2] = horizontal_add_u16x8(sum[2]);
200}
201
// Define the public aom_sad<W>x<H>x3d_neon() entry point for one block size by
// forwarding to the width-specific helper with the height as a constant. The
// ref and res parameters are declared with four elements, but the helpers only
// read ref[0..2] and only write res[0..2].
#define SAD_WXH_3D_NEON(width, height)                                       \
  void aom_sad##width##x##height##x3d_neon(                                  \
      const uint8_t *src, int src_stride, const uint8_t *const ref[4],       \
      int ref_stride, uint32_t res[4]) {                                     \
    sad##width##xhx3d_neon(src, src_stride, ref, ref_stride, res, (height)); \
  }

SAD_WXH_3D_NEON(4, 4)
SAD_WXH_3D_NEON(4, 8)

SAD_WXH_3D_NEON(8, 4)
SAD_WXH_3D_NEON(8, 8)
SAD_WXH_3D_NEON(8, 16)

SAD_WXH_3D_NEON(16, 8)
SAD_WXH_3D_NEON(16, 16)
SAD_WXH_3D_NEON(16, 32)

SAD_WXH_3D_NEON(32, 16)
SAD_WXH_3D_NEON(32, 32)
SAD_WXH_3D_NEON(32, 64)

SAD_WXH_3D_NEON(64, 32)
SAD_WXH_3D_NEON(64, 64)
SAD_WXH_3D_NEON(64, 128)

SAD_WXH_3D_NEON(128, 64)
SAD_WXH_3D_NEON(128, 128)

// Block sizes that are only built when realtime-only mode is disabled.
#if !CONFIG_REALTIME_ONLY
SAD_WXH_3D_NEON(4, 16)
SAD_WXH_3D_NEON(8, 32)
SAD_WXH_3D_NEON(16, 4)
SAD_WXH_3D_NEON(16, 64)
SAD_WXH_3D_NEON(32, 8)
SAD_WXH_3D_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_3D_NEON
241
Jonathan Wrightd93da792023-04-18 10:56:55 +0100242static INLINE void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
243 const uint8_t *const ref[4],
244 int ref_stride, uint32_t res[4], int w,
245 int h, int h_overflow) {
Jonathan Wright8f8b66d2023-04-13 15:02:00 +0100246 uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
247 vdupq_n_u32(0) };
Jonathan Wrightd93da792023-04-18 10:56:55 +0100248 int h_limit = h > h_overflow ? h_overflow : h;
Yaowu Xuc27fc142016-08-22 16:08:15 -0700249
Jonathan Wrightd93da792023-04-18 10:56:55 +0100250 int ref_offset = 0;
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100251 int i = 0;
252 do {
253 uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
254 vdupq_n_u16(0) };
255 uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
256 vdupq_n_u16(0) };
257
258 do {
Jonathan Wrightd93da792023-04-18 10:56:55 +0100259 int j = 0;
260 do {
261 const uint8x16_t s0 = vld1q_u8(src + j);
262 sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
263 sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
264 sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
265 sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]);
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100266
Jonathan Wrightd93da792023-04-18 10:56:55 +0100267 const uint8x16_t s1 = vld1q_u8(src + j + 16);
268 sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
269 sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
270 sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
271 sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]);
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100272
Jonathan Wrightd93da792023-04-18 10:56:55 +0100273 j += 32;
274 } while (j < w);
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100275
Jonathan Wrightd93da792023-04-18 10:56:55 +0100276 src += src_stride;
277 ref_offset += ref_stride;
278 } while (++i < h_limit);
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100279
Jonathan Wright8f8b66d2023-04-13 15:02:00 +0100280 sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
281 sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
282 sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
283 sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
284 sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
285 sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
286 sum[3] = vpadalq_u16(sum[3], sum_lo[3]);
287 sum[3] = vpadalq_u16(sum[3], sum_hi[3]);
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100288
Jonathan Wrightd93da792023-04-18 10:56:55 +0100289 h_limit += h_overflow;
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100290 } while (i < h);
Jonathan Wright8f8b66d2023-04-13 15:02:00 +0100291
292 vst1q_u32(res, horizontal_add_4d_u32x4(sum));
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100293}
294
Jonathan Wrightd93da792023-04-18 10:56:55 +0100295static INLINE void sad128xhx4d_neon(const uint8_t *src, int src_stride,
296 const uint8_t *const ref[4], int ref_stride,
297 uint32_t res[4], int h) {
298 sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32);
299}
300
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100301static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
302 const uint8_t *const ref[4], int ref_stride,
303 uint32_t res[4], int h) {
Jonathan Wrightd93da792023-04-18 10:56:55 +0100304 sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64);
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100305}
306
307static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
308 const uint8_t *const ref[4], int ref_stride,
309 uint32_t res[4], int h) {
310 uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
311 vdupq_n_u16(0) };
312 uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
313 vdupq_n_u16(0) };
314
Jonathan Wrightd93da792023-04-18 10:56:55 +0100315 int ref_offset = 0;
316 int i = h;
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100317 do {
Jonathan Wrightd93da792023-04-18 10:56:55 +0100318 const uint8x16_t s0 = vld1q_u8(src);
319 sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]);
320 sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]);
321 sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]);
322 sad16_neon(s0, vld1q_u8(ref[3] + ref_offset), &sum_lo[3]);
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100323
Jonathan Wrightd93da792023-04-18 10:56:55 +0100324 const uint8x16_t s1 = vld1q_u8(src + 16);
325 sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]);
326 sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]);
327 sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]);
328 sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + 16), &sum_hi[3]);
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100329
Jonathan Wrightd93da792023-04-18 10:56:55 +0100330 src += src_stride;
331 ref_offset += ref_stride;
332 } while (--i != 0);
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100333
Jonathan Wright8f8b66d2023-04-13 15:02:00 +0100334 vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi));
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100335}
336
337static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
338 const uint8_t *const ref[4], int ref_stride,
339 uint32_t res[4], int h) {
Jonathan Wright8f8b66d2023-04-13 15:02:00 +0100340 uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
341 vdupq_n_u16(0) };
342 uint32x4_t sum_u32[4];
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100343
Jonathan Wrightd93da792023-04-18 10:56:55 +0100344 int ref_offset = 0;
345 int i = h;
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100346 do {
Jonathan Wrightd93da792023-04-18 10:56:55 +0100347 const uint8x16_t s = vld1q_u8(src);
348 sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum_u16[0]);
349 sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum_u16[1]);
350 sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum_u16[2]);
351 sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum_u16[3]);
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100352
Jonathan Wrightd93da792023-04-18 10:56:55 +0100353 src += src_stride;
354 ref_offset += ref_stride;
355 } while (--i != 0);
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100356
Jonathan Wright8f8b66d2023-04-13 15:02:00 +0100357 sum_u32[0] = vpaddlq_u16(sum_u16[0]);
358 sum_u32[1] = vpaddlq_u16(sum_u16[1]);
359 sum_u32[2] = vpaddlq_u16(sum_u16[2]);
360 sum_u32[3] = vpaddlq_u16(sum_u16[3]);
361
362 vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100363}
364
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100365static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
366 const uint8_t *const ref[4], int ref_stride,
367 uint32_t res[4], int h) {
Jonathan Wrightd93da792023-04-18 10:56:55 +0100368 uint16x8_t sum[4];
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100369
Jonathan Wrightd93da792023-04-18 10:56:55 +0100370 uint8x8_t s = vld1_u8(src);
371 sum[0] = vabdl_u8(s, vld1_u8(ref[0]));
372 sum[1] = vabdl_u8(s, vld1_u8(ref[1]));
373 sum[2] = vabdl_u8(s, vld1_u8(ref[2]));
374 sum[3] = vabdl_u8(s, vld1_u8(ref[3]));
375
376 src += src_stride;
377 int ref_offset = ref_stride;
378 int i = h - 1;
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100379 do {
Jonathan Wrightd93da792023-04-18 10:56:55 +0100380 s = vld1_u8(src);
381 sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset));
382 sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset));
383 sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset));
384 sum[3] = vabal_u8(sum[3], s, vld1_u8(ref[3] + ref_offset));
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100385
Jonathan Wrightd93da792023-04-18 10:56:55 +0100386 src += src_stride;
387 ref_offset += ref_stride;
388 } while (--i != 0);
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100389
Jonathan Wright3e1b9c52023-04-06 00:38:48 +0100390 vst1q_u32(res, horizontal_add_4d_u16x8(sum));
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100391}
392
393static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
394 const uint8_t *const ref[4], int ref_stride,
395 uint32_t res[4], int h) {
Jonathan Wrightd93da792023-04-18 10:56:55 +0100396 uint16x8_t sum[4];
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100397
Jonathan Wrightd93da792023-04-18 10:56:55 +0100398 uint8x8_t s = load_unaligned_u8(src, src_stride);
399 uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride);
400 uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride);
401 uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride);
402 uint8x8_t r3 = load_unaligned_u8(ref[3], ref_stride);
403
404 sum[0] = vabdl_u8(s, r0);
405 sum[1] = vabdl_u8(s, r1);
406 sum[2] = vabdl_u8(s, r2);
407 sum[3] = vabdl_u8(s, r3);
408
409 src += 2 * src_stride;
410 int ref_offset = 2 * ref_stride;
Mark Horvath6c744f62023-04-18 14:47:21 +0200411 int i = h / 2;
412 while (--i != 0) {
Jonathan Wrightd93da792023-04-18 10:56:55 +0100413 s = load_unaligned_u8(src, src_stride);
414 r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
415 r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
416 r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
417 r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride);
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100418
Jonathan Wrightd93da792023-04-18 10:56:55 +0100419 sum[0] = vabal_u8(sum[0], s, r0);
420 sum[1] = vabal_u8(sum[1], s, r1);
421 sum[2] = vabal_u8(sum[2], s, r2);
422 sum[3] = vabal_u8(sum[3], s, r3);
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100423
Jonathan Wrightd93da792023-04-18 10:56:55 +0100424 src += 2 * src_stride;
425 ref_offset += 2 * ref_stride;
Mark Horvath6c744f62023-04-18 14:47:21 +0200426 }
Jonathan Wright7c3e5172022-06-29 14:56:06 +0100427
Jonathan Wright3e1b9c52023-04-06 00:38:48 +0100428 vst1q_u32(res, horizontal_add_4d_u16x8(sum));
Yaowu Xuc27fc142016-08-22 16:08:15 -0700429}
430
// Define the public aom_sad<W>x<H>x4d_neon() entry point for one block size by
// forwarding to the width-specific helper with the height as a constant.
#define SAD_WXH_4D_NEON(width, height)                                       \
  void aom_sad##width##x##height##x4d_neon(                                  \
      const uint8_t *src, int src_stride, const uint8_t *const ref[4],       \
      int ref_stride, uint32_t res[4]) {                                     \
    sad##width##xhx4d_neon(src, src_stride, ref, ref_stride, res, (height)); \
  }

SAD_WXH_4D_NEON(4, 4)
SAD_WXH_4D_NEON(4, 8)

SAD_WXH_4D_NEON(8, 4)
SAD_WXH_4D_NEON(8, 8)
SAD_WXH_4D_NEON(8, 16)

SAD_WXH_4D_NEON(16, 8)
SAD_WXH_4D_NEON(16, 16)
SAD_WXH_4D_NEON(16, 32)

SAD_WXH_4D_NEON(32, 16)
SAD_WXH_4D_NEON(32, 32)
SAD_WXH_4D_NEON(32, 64)

SAD_WXH_4D_NEON(64, 32)
SAD_WXH_4D_NEON(64, 64)
SAD_WXH_4D_NEON(64, 128)

SAD_WXH_4D_NEON(128, 64)
SAD_WXH_4D_NEON(128, 128)

// Block sizes that are only built when realtime-only mode is disabled.
#if !CONFIG_REALTIME_ONLY
SAD_WXH_4D_NEON(4, 16)
SAD_WXH_4D_NEON(8, 32)
SAD_WXH_4D_NEON(16, 4)
SAD_WXH_4D_NEON(16, 64)
SAD_WXH_4D_NEON(32, 8)
SAD_WXH_4D_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_4D_NEON
Krishna Malladifb78faa2020-08-21 10:25:23 -0700470
// Define the public aom_sad_skip_<W>x<H>x4d_neon() entry point for one block
// size. Only every other row is measured: both strides are doubled and half
// the height is passed to the helper, then each of the four results is doubled.
#define SAD_SKIP_WXH_4D_NEON(width, height)                               \
  void aom_sad_skip_##width##x##height##x4d_neon(                         \
      const uint8_t *src, int src_stride, const uint8_t *const ref[4],    \
      int ref_stride, uint32_t res[4]) {                                  \
    sad##width##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res, \
                           ((height) >> 1));                              \
    for (int k = 0; k < 4; k++) {                                         \
      res[k] <<= 1;                                                       \
    }                                                                     \
  }

SAD_SKIP_WXH_4D_NEON(4, 4)
SAD_SKIP_WXH_4D_NEON(4, 8)

SAD_SKIP_WXH_4D_NEON(8, 4)
SAD_SKIP_WXH_4D_NEON(8, 8)
SAD_SKIP_WXH_4D_NEON(8, 16)

SAD_SKIP_WXH_4D_NEON(16, 8)
SAD_SKIP_WXH_4D_NEON(16, 16)
SAD_SKIP_WXH_4D_NEON(16, 32)

SAD_SKIP_WXH_4D_NEON(32, 16)
SAD_SKIP_WXH_4D_NEON(32, 32)
SAD_SKIP_WXH_4D_NEON(32, 64)

SAD_SKIP_WXH_4D_NEON(64, 32)
SAD_SKIP_WXH_4D_NEON(64, 64)
SAD_SKIP_WXH_4D_NEON(64, 128)

SAD_SKIP_WXH_4D_NEON(128, 64)
SAD_SKIP_WXH_4D_NEON(128, 128)

// Block sizes that are only built when realtime-only mode is disabled.
#if !CONFIG_REALTIME_ONLY
SAD_SKIP_WXH_4D_NEON(4, 16)
SAD_SKIP_WXH_4D_NEON(8, 32)
SAD_SKIP_WXH_4D_NEON(16, 4)
SAD_SKIP_WXH_4D_NEON(16, 64)
SAD_SKIP_WXH_4D_NEON(32, 8)
SAD_SKIP_WXH_4D_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_SKIP_WXH_4D_NEON