Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 1 | /* |
James Zern | b7c05bd | 2024-06-11 19:15:10 -0700 | [diff] [blame] | 2 | * Copyright (c) 2016, Alliance for Open Media. All rights reserved. |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 3 | * |
Yaowu Xu | 9c01aa1 | 2016-09-01 14:32:49 -0700 | [diff] [blame] | 4 | * This source code is subject to the terms of the BSD 2 Clause License and |
| 5 | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 6 | * was not distributed with this source code in the LICENSE file, you can |
| 7 | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 8 | * Media Patent License 1.0 was not distributed with this source code in the |
| 9 | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 10 | */ |
| 11 | |
#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 20 | |
// Accumulate the SAD of one 16-pixel row: take the absolute differences of
// src and ref and add them pairwise into the widened accumulator *sad_sum.
static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
                              uint16x8_t *const sad_sum) {
  *sad_sum = vpadalq_u8(*sad_sum, vabdq_u8(src, ref));
}
| 26 | |
// Computes the SAD between one w x h source block and the co-located block in
// each of three reference frames, for large widths (w a multiple of 32).
// Absolute differences are gathered in 16-bit lanes and flushed into 32-bit
// accumulators every 'h_overflow' rows; h_overflow is chosen by the caller as
// the largest row count for which the 16-bit lanes cannot overflow at width w.
static INLINE void sadwxhx3d_large_neon(const uint8_t *src, int src_stride,
                                        const uint8_t *const ref[3],
                                        int ref_stride, uint32_t res[3], int w,
                                        int h, int h_overflow) {
  uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
  // Process rows in chunks of at most h_overflow so the u16 lanes stay
  // below 65535 before being widened.
  int h_limit = h > h_overflow ? h_overflow : h;

  int ref_offset = 0;
  int i = 0;
  do {
    // Per-chunk 16-bit accumulators for the even/odd 16-byte halves of each
    // 32-byte step, one set per reference.
    uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
    uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };

    do {
      int j = 0;
      do {
        // Each iteration covers 32 pixels of the row: two 16-byte loads.
        const uint8x16_t s0 = vld1q_u8(src + j);
        sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
        sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
        sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);

        const uint8x16_t s1 = vld1q_u8(src + j + 16);
        sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
        sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
        sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);

        j += 32;
      } while (j < w);

      src += src_stride;
      ref_offset += ref_stride;
    } while (++i < h_limit);

    // Widen the chunk totals into the 32-bit accumulators.
    sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
    sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
    sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
    sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
    sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
    sum[2] = vpadalq_u16(sum[2], sum_hi[2]);

    h_limit += h_overflow;
  } while (i < h);

  // Reduce each 32-bit vector accumulator to a scalar SAD per reference.
  res[0] = horizontal_add_u32x4(sum[0]);
  res[1] = horizontal_add_u32x4(sum[1]);
  res[2] = horizontal_add_u32x4(sum[2]);
}
| 74 | |
// 128 pixels wide: the inner loop makes four pairwise-accumulate passes per
// row (up to 4 * 510 = 2040 per u16 lane), so flush to 32 bits every 32 rows.
static INLINE void sad128xhx3d_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *const ref[3], int ref_stride,
                                    uint32_t res[3], int h) {
  sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32);
}
| 80 | |
// 64 pixels wide: two pairwise-accumulate passes per row (up to 1020 per u16
// lane), so flushing every 64 rows stays below the 65535 lane limit.
static INLINE void sad64xhx3d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[3], int ref_stride,
                                   uint32_t res[3], int h) {
  sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64);
}
| 86 | |
// Computes the SAD between one 32 x h source block and the co-located block
// in each of three reference frames.
static INLINE void sad32xhx3d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[3], int ref_stride,
                                   uint32_t res[3], int h) {
  // 16-bit SAD accumulators for the left/right 16-pixel halves of each row,
  // one pair per candidate reference.
  uint16x8_t acc_l[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
  uint16x8_t acc_r[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };

  int offset = 0;
  int rows_left = h;
  do {
    const uint8x16_t src_l = vld1q_u8(src);
    const uint8x16_t src_r = vld1q_u8(src + 16);

    sad16_neon(src_l, vld1q_u8(ref[0] + offset), &acc_l[0]);
    sad16_neon(src_r, vld1q_u8(ref[0] + offset + 16), &acc_r[0]);
    sad16_neon(src_l, vld1q_u8(ref[1] + offset), &acc_l[1]);
    sad16_neon(src_r, vld1q_u8(ref[1] + offset + 16), &acc_r[1]);
    sad16_neon(src_l, vld1q_u8(ref[2] + offset), &acc_l[2]);
    sad16_neon(src_r, vld1q_u8(ref[2] + offset + 16), &acc_r[2]);

    src += src_stride;
    offset += ref_stride;
  } while (--rows_left != 0);

  // Widen and reduce each half-pair to a scalar SAD.
  res[0] = horizontal_long_add_u16x8(acc_l[0], acc_r[0]);
  res[1] = horizontal_long_add_u16x8(acc_l[1], acc_r[1]);
  res[2] = horizontal_long_add_u16x8(acc_l[2], acc_r[2]);
}
| 114 | |
// Computes the SAD between one 16 x h source block and the co-located block
// in each of three reference frames.
static INLINE void sad16xhx3d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[3], int ref_stride,
                                   uint32_t res[3], int h) {
  // One widened SAD accumulator per candidate reference.
  uint16x8_t acc[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };

  int offset = 0;
  for (int row = 0; row < h; row++) {
    const uint8x16_t s = vld1q_u8(src);
    sad16_neon(s, vld1q_u8(ref[0] + offset), &acc[0]);
    sad16_neon(s, vld1q_u8(ref[1] + offset), &acc[1]);
    sad16_neon(s, vld1q_u8(ref[2] + offset), &acc[2]);

    src += src_stride;
    offset += ref_stride;
  }

  res[0] = horizontal_add_u16x8(acc[0]);
  res[1] = horizontal_add_u16x8(acc[1]);
  res[2] = horizontal_add_u16x8(acc[2]);
}
| 136 | |
// Computes the SAD between one 8 x h source block and the co-located block in
// each of three reference frames.
static INLINE void sad8xhx3d_neon(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref[3], int ref_stride,
                                  uint32_t res[3], int h) {
  uint16x8_t sum[3];

  // The first row initializes the accumulators directly (vabdl), avoiding a
  // separate zeroing pass; remaining rows accumulate with vabal.
  uint8x8_t s = vld1_u8(src);
  sum[0] = vabdl_u8(s, vld1_u8(ref[0]));
  sum[1] = vabdl_u8(s, vld1_u8(ref[1]));
  sum[2] = vabdl_u8(s, vld1_u8(ref[2]));

  src += src_stride;
  int ref_offset = ref_stride;
  int i = h - 1;
  do {
    s = vld1_u8(src);
    sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset));
    sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset));
    sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset));

    src += src_stride;
    ref_offset += ref_stride;
  } while (--i != 0);

  res[0] = horizontal_add_u16x8(sum[0]);
  res[1] = horizontal_add_u16x8(sum[1]);
  res[2] = horizontal_add_u16x8(sum[2]);
}
| 164 | |
// Computes the SAD between one 4 x h source block and the co-located block in
// each of three reference frames. Two rows are handled per iteration:
// load_unaligned_u8() packs two 4-byte rows (one stride apart) into a single
// 8-byte vector, hence the requirement that h be even.
static INLINE void sad4xhx3d_neon(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref[3], int ref_stride,
                                  uint32_t res[3], int h) {
  assert(h % 2 == 0);
  uint16x8_t sum[3];

  // The first row pair initializes the accumulators directly (vabdl).
  uint8x8_t s = load_unaligned_u8(src, src_stride);
  uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride);
  uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride);
  uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride);

  sum[0] = vabdl_u8(s, r0);
  sum[1] = vabdl_u8(s, r1);
  sum[2] = vabdl_u8(s, r2);

  src += 2 * src_stride;
  int ref_offset = 2 * ref_stride;
  int i = (h / 2) - 1;
  do {
    s = load_unaligned_u8(src, src_stride);
    r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
    r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
    r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);

    sum[0] = vabal_u8(sum[0], s, r0);
    sum[1] = vabal_u8(sum[1], s, r1);
    sum[2] = vabal_u8(sum[2], s, r2);

    src += 2 * src_stride;
    ref_offset += 2 * ref_stride;
  } while (--i != 0);

  res[0] = horizontal_add_u16x8(sum[0]);
  res[1] = horizontal_add_u16x8(sum[1]);
  res[2] = horizontal_add_u16x8(sum[2]);
}
| 201 | |
// Generates aom_sad<w>x<h>x3d_neon(). Note the ref/res arrays are declared
// with four entries to match the x4d prototype used by the RTCD dispatch
// tables, but only the first three entries are read/written here.
#define SAD_WXH_3D_NEON(w, h)                                            \
  void aom_sad##w##x##h##x3d_neon(const uint8_t *src, int src_stride,    \
                                  const uint8_t *const ref[4], int ref_stride, \
                                  uint32_t res[4]) {                     \
    sad##w##xhx3d_neon(src, src_stride, ref, ref_stride, res, (h));      \
  }

SAD_WXH_3D_NEON(4, 4)
SAD_WXH_3D_NEON(4, 8)

SAD_WXH_3D_NEON(8, 4)
SAD_WXH_3D_NEON(8, 8)
SAD_WXH_3D_NEON(8, 16)

SAD_WXH_3D_NEON(16, 8)
SAD_WXH_3D_NEON(16, 16)
SAD_WXH_3D_NEON(16, 32)

SAD_WXH_3D_NEON(32, 16)
SAD_WXH_3D_NEON(32, 32)
SAD_WXH_3D_NEON(32, 64)

SAD_WXH_3D_NEON(64, 32)
SAD_WXH_3D_NEON(64, 64)
SAD_WXH_3D_NEON(64, 128)

SAD_WXH_3D_NEON(128, 64)
SAD_WXH_3D_NEON(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_WXH_3D_NEON(4, 16)
SAD_WXH_3D_NEON(8, 32)
SAD_WXH_3D_NEON(16, 4)
SAD_WXH_3D_NEON(16, 64)
SAD_WXH_3D_NEON(32, 8)
SAD_WXH_3D_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_3D_NEON
| 241 | |
// Computes the SAD between one w x h source block and the co-located block in
// each of four reference frames, for large widths (w a multiple of 32).
// Absolute differences are gathered in 16-bit lanes and flushed into 32-bit
// accumulators every 'h_overflow' rows; h_overflow is chosen by the caller as
// the largest row count for which the 16-bit lanes cannot overflow at width w.
static INLINE void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
                                        const uint8_t *const ref[4],
                                        int ref_stride, uint32_t res[4], int w,
                                        int h, int h_overflow) {
  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
                        vdupq_n_u32(0) };
  // Process rows in chunks of at most h_overflow so the u16 lanes stay
  // below 65535 before being widened.
  int h_limit = h > h_overflow ? h_overflow : h;

  int ref_offset = 0;
  int i = 0;
  do {
    // Per-chunk 16-bit accumulators for the even/odd 16-byte halves of each
    // 32-byte step, one set per reference.
    uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                             vdupq_n_u16(0) };
    uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                             vdupq_n_u16(0) };

    do {
      int j = 0;
      do {
        // Each iteration covers 32 pixels of the row: two 16-byte loads.
        const uint8x16_t s0 = vld1q_u8(src + j);
        sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
        sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
        sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
        sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]);

        const uint8x16_t s1 = vld1q_u8(src + j + 16);
        sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
        sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
        sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
        sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]);

        j += 32;
      } while (j < w);

      src += src_stride;
      ref_offset += ref_stride;
    } while (++i < h_limit);

    // Widen the chunk totals into the 32-bit accumulators.
    sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
    sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
    sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
    sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
    sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
    sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
    sum[3] = vpadalq_u16(sum[3], sum_lo[3]);
    sum[3] = vpadalq_u16(sum[3], sum_hi[3]);

    h_limit += h_overflow;
  } while (i < h);

  // Reduce the four vector accumulators to four scalar SADs in one step.
  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
}
| 294 | |
// 128 pixels wide: the inner loop makes four pairwise-accumulate passes per
// row (up to 4 * 510 = 2040 per u16 lane), so flush to 32 bits every 32 rows.
static INLINE void sad128xhx4d_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *const ref[4], int ref_stride,
                                    uint32_t res[4], int h) {
  sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32);
}
| 300 | |
// 64 pixels wide: two pairwise-accumulate passes per row (up to 1020 per u16
// lane), so flushing every 64 rows stays below the 65535 lane limit.
static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4], int ref_stride,
                                   uint32_t res[4], int h) {
  sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64);
}
| 306 | |
// Computes the SAD between one 32 x h source block and the co-located block
// in each of four reference frames.
static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4], int ref_stride,
                                   uint32_t res[4], int h) {
  // 16-bit SAD accumulators for the left/right 16-pixel halves of each row,
  // one pair per candidate reference.
  uint16x8_t acc_l[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                          vdupq_n_u16(0) };
  uint16x8_t acc_r[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                          vdupq_n_u16(0) };

  int offset = 0;
  int rows_left = h;
  do {
    const uint8x16_t src_l = vld1q_u8(src);
    const uint8x16_t src_r = vld1q_u8(src + 16);

    sad16_neon(src_l, vld1q_u8(ref[0] + offset), &acc_l[0]);
    sad16_neon(src_r, vld1q_u8(ref[0] + offset + 16), &acc_r[0]);
    sad16_neon(src_l, vld1q_u8(ref[1] + offset), &acc_l[1]);
    sad16_neon(src_r, vld1q_u8(ref[1] + offset + 16), &acc_r[1]);
    sad16_neon(src_l, vld1q_u8(ref[2] + offset), &acc_l[2]);
    sad16_neon(src_r, vld1q_u8(ref[2] + offset + 16), &acc_r[2]);
    sad16_neon(src_l, vld1q_u8(ref[3] + offset), &acc_l[3]);
    sad16_neon(src_r, vld1q_u8(ref[3] + offset + 16), &acc_r[3]);

    src += src_stride;
    offset += ref_stride;
  } while (--rows_left != 0);

  // Widen, sum the half-pairs and reduce to four scalar SADs in one step.
  vst1q_u32(res, horizontal_long_add_4d_u16x8(acc_l, acc_r));
}
| 336 | |
// Computes the SAD between one 16 x h source block and the co-located block
// in each of four reference frames.
static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4], int ref_stride,
                                   uint32_t res[4], int h) {
  // One widened SAD accumulator per candidate reference.
  uint16x8_t acc[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0) };

  int offset = 0;
  for (int row = 0; row < h; row++) {
    const uint8x16_t s = vld1q_u8(src);
    sad16_neon(s, vld1q_u8(ref[0] + offset), &acc[0]);
    sad16_neon(s, vld1q_u8(ref[1] + offset), &acc[1]);
    sad16_neon(s, vld1q_u8(ref[2] + offset), &acc[2]);
    sad16_neon(s, vld1q_u8(ref[3] + offset), &acc[3]);

    src += src_stride;
    offset += ref_stride;
  }

  // Widen to 32 bits, then reduce all four accumulators in one step.
  uint32x4_t acc_u32[4] = { vpaddlq_u16(acc[0]), vpaddlq_u16(acc[1]),
                            vpaddlq_u16(acc[2]), vpaddlq_u16(acc[3]) };
  vst1q_u32(res, horizontal_add_4d_u32x4(acc_u32));
}
| 364 | |
// Computes the SAD between one 8 x h source block and the co-located block in
// each of four reference frames.
static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref[4], int ref_stride,
                                  uint32_t res[4], int h) {
  uint16x8_t sum[4];

  // The first row initializes the accumulators directly (vabdl), avoiding a
  // separate zeroing pass; remaining rows accumulate with vabal.
  uint8x8_t s = vld1_u8(src);
  sum[0] = vabdl_u8(s, vld1_u8(ref[0]));
  sum[1] = vabdl_u8(s, vld1_u8(ref[1]));
  sum[2] = vabdl_u8(s, vld1_u8(ref[2]));
  sum[3] = vabdl_u8(s, vld1_u8(ref[3]));

  src += src_stride;
  int ref_offset = ref_stride;
  int i = h - 1;
  do {
    s = vld1_u8(src);
    sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset));
    sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset));
    sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset));
    sum[3] = vabal_u8(sum[3], s, vld1_u8(ref[3] + ref_offset));

    src += src_stride;
    ref_offset += ref_stride;
  } while (--i != 0);

  vst1q_u32(res, horizontal_add_4d_u16x8(sum));
}
| 392 | |
// Computes the SAD between one 4 x h source block and the co-located block in
// each of four reference frames. Two rows are handled per iteration:
// load_unaligned_u8() packs two 4-byte rows (one stride apart) into a single
// 8-byte vector, hence the requirement that h be even.
static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref[4], int ref_stride,
                                  uint32_t res[4], int h) {
  // Consistency fix: sad4xhx3d_neon() asserts this precondition, but this
  // function silently assumed it. Odd heights would drop the final row.
  assert(h % 2 == 0);
  uint16x8_t sum[4];

  // The first row pair initializes the accumulators directly (vabdl).
  uint8x8_t s = load_unaligned_u8(src, src_stride);
  uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride);
  uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride);
  uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride);
  uint8x8_t r3 = load_unaligned_u8(ref[3], ref_stride);

  sum[0] = vabdl_u8(s, r0);
  sum[1] = vabdl_u8(s, r1);
  sum[2] = vabdl_u8(s, r2);
  sum[3] = vabdl_u8(s, r3);

  src += 2 * src_stride;
  int ref_offset = 2 * ref_stride;
  int i = h / 2;
  while (--i != 0) {
    s = load_unaligned_u8(src, src_stride);
    r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
    r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
    r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
    r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride);

    sum[0] = vabal_u8(sum[0], s, r0);
    sum[1] = vabal_u8(sum[1], s, r1);
    sum[2] = vabal_u8(sum[2], s, r2);
    sum[3] = vabal_u8(sum[3], s, r3);

    src += 2 * src_stride;
    ref_offset += 2 * ref_stride;
  }

  vst1q_u32(res, horizontal_add_4d_u16x8(sum));
}
| 430 | |
// Generates aom_sad<w>x<h>x4d_neon(): the SAD of one source block against
// four candidate reference blocks, written to res[0..3].
#define SAD_WXH_4D_NEON(w, h)                                            \
  void aom_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride,    \
                                  const uint8_t *const ref[4], int ref_stride, \
                                  uint32_t res[4]) {                     \
    sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h));      \
  }

SAD_WXH_4D_NEON(4, 4)
SAD_WXH_4D_NEON(4, 8)

SAD_WXH_4D_NEON(8, 4)
SAD_WXH_4D_NEON(8, 8)
SAD_WXH_4D_NEON(8, 16)

SAD_WXH_4D_NEON(16, 8)
SAD_WXH_4D_NEON(16, 16)
SAD_WXH_4D_NEON(16, 32)

SAD_WXH_4D_NEON(32, 16)
SAD_WXH_4D_NEON(32, 32)
SAD_WXH_4D_NEON(32, 64)

SAD_WXH_4D_NEON(64, 32)
SAD_WXH_4D_NEON(64, 64)
SAD_WXH_4D_NEON(64, 128)

SAD_WXH_4D_NEON(128, 64)
SAD_WXH_4D_NEON(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_WXH_4D_NEON(4, 16)
SAD_WXH_4D_NEON(8, 32)
SAD_WXH_4D_NEON(16, 4)
SAD_WXH_4D_NEON(16, 64)
SAD_WXH_4D_NEON(32, 8)
SAD_WXH_4D_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_4D_NEON
Krishna Malladi | fb78faa | 2020-08-21 10:25:23 -0700 | [diff] [blame] | 470 | |
// Generates aom_sad_skip_<w>x<h>x4d_neon(): a faster approximate SAD that
// only reads the even rows (double stride, half height) and then doubles the
// result to estimate the full-block SAD.
#define SAD_SKIP_WXH_4D_NEON(w, h)                                          \
  void aom_sad_skip_##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \
                                        const uint8_t *const ref[4],        \
                                        int ref_stride, uint32_t res[4]) {  \
    sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res,       \
                       ((h) >> 1));                                         \
    res[0] <<= 1;                                                           \
    res[1] <<= 1;                                                           \
    res[2] <<= 1;                                                           \
    res[3] <<= 1;                                                           \
  }

SAD_SKIP_WXH_4D_NEON(4, 4)
SAD_SKIP_WXH_4D_NEON(4, 8)

SAD_SKIP_WXH_4D_NEON(8, 4)
SAD_SKIP_WXH_4D_NEON(8, 8)
SAD_SKIP_WXH_4D_NEON(8, 16)

SAD_SKIP_WXH_4D_NEON(16, 8)
SAD_SKIP_WXH_4D_NEON(16, 16)
SAD_SKIP_WXH_4D_NEON(16, 32)

SAD_SKIP_WXH_4D_NEON(32, 16)
SAD_SKIP_WXH_4D_NEON(32, 32)
SAD_SKIP_WXH_4D_NEON(32, 64)

SAD_SKIP_WXH_4D_NEON(64, 32)
SAD_SKIP_WXH_4D_NEON(64, 64)
SAD_SKIP_WXH_4D_NEON(64, 128)

SAD_SKIP_WXH_4D_NEON(128, 64)
SAD_SKIP_WXH_4D_NEON(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_SKIP_WXH_4D_NEON(4, 16)
SAD_SKIP_WXH_4D_NEON(8, 32)
SAD_SKIP_WXH_4D_NEON(16, 4)
SAD_SKIP_WXH_4D_NEON(16, 64)
SAD_SKIP_WXH_4D_NEON(32, 8)
SAD_SKIP_WXH_4D_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_SKIP_WXH_4D_NEON