/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
| 11 | |
| 12 | #include <arm_neon.h> |
Vitalii Dziumenko | 18de235 | 2020-05-14 11:49:25 +0300 | [diff] [blame] | 13 | |
Krishna Malladi | a8c08dd | 2020-02-21 14:08:06 -0800 | [diff] [blame] | 14 | #include "config/aom_dsp_rtcd.h" |
Yaowu Xu | 01d4a32 | 2021-07-15 07:46:13 -0700 | [diff] [blame] | 15 | #include "aom_dsp/arm/mem_neon.h" |
Vitalii Dziumenko | 18de235 | 2020-05-14 11:49:25 +0300 | [diff] [blame] | 16 | #include "aom_dsp/arm/sum_neon.h" |
Krishna Malladi | a8c08dd | 2020-02-21 14:08:06 -0800 | [diff] [blame] | 17 | |
// Adds the squared differences of 16 consecutive bytes of `src` and `ref`
// into the four 32-bit lanes of `*sse`.
//
// Each byte difference is at most 255, so its square (<= 65025) fits in the
// 16-bit lanes produced by the widening multiply; the pairwise
// add-accumulate then folds adjacent 16-bit squares into the 32-bit lanes.
static inline void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
                                 uint32x4_t *sse) {
  const uint8x16_t delta = vabdq_u8(vld1q_u8(src), vld1q_u8(ref));
  const uint8x8_t delta_lo = vget_low_u8(delta);
  const uint8x8_t delta_hi = vget_high_u8(delta);

  const uint16x8_t squares_lo = vmull_u8(delta_lo, delta_lo);
  const uint16x8_t squares_hi = vmull_u8(delta_hi, delta_hi);

  uint32x4_t acc = *sse;
  acc = vpadalq_u16(acc, squares_lo);
  acc = vpadalq_u16(acc, squares_hi);
  *sse = acc;
}
Jonathan Wright | d212966 | 2022-06-01 20:56:49 +0100 | [diff] [blame] | 30 | |
// Adds the squared differences of 8 consecutive bytes of `src` and `ref`
// into the four 32-bit lanes of `*sse`.
//
// The absolute byte differences are squared with a widening multiply
// (8-bit -> 16-bit) and then pairwise add-accumulated into 32-bit lanes.
static inline void sse_8x1_neon(const uint8_t *src, const uint8_t *ref,
                                uint32x4_t *sse) {
  const uint8x8_t delta = vabd_u8(vld1_u8(src), vld1_u8(ref));
  const uint16x8_t squares = vmull_u8(delta, delta);

  *sse = vpadalq_u16(*sse, squares);
}
Jonathan Wright | d212966 | 2022-06-01 20:56:49 +0100 | [diff] [blame] | 40 | |
// Adds the squared differences of a 4-wide, 2-row patch of `src` and `ref`
// into the four 32-bit lanes of `*sse`.
//
// NOTE(review): load_unaligned_u8() is defined outside this file; it
// presumably packs 4 bytes from the row at `buf` and 4 bytes from the row at
// `buf + stride` into one 8-byte vector - confirm against mem_neon.h.
static inline void sse_4x2_neon(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                uint32x4_t *sse) {
  const uint8x8_t src_rows = load_unaligned_u8(src, src_stride);
  const uint8x8_t ref_rows = load_unaligned_u8(ref, ref_stride);

  const uint8x8_t delta = vabd_u8(src_rows, ref_rows);
  const uint16x8_t squares = vmull_u8(delta, delta);

  *sse = vpadalq_u16(*sse, squares);
}
Jonathan Wright | d212966 | 2022-06-01 20:56:49 +0100 | [diff] [blame] | 51 | |
// Sum of squared differences for a `width` x `height` block whose width has
// no dedicated kernel (this is the default case of aom_sse_neon()).
//
// Two strategies are used, selected by `width & 7`:
//  - 0, 5, 6 or 7: every row is covered with 8-byte steps until the column
//    index reaches `width`.
//  - 1..4: rows are handled in pairs; 8-byte steps cover the leading columns
//    and a single 4x2 patch covers the tail.
//
// NOTE(review): both inner loops are do-while loops, so at least one 8-byte
// step is always executed, and a width that is not a multiple of 4 would
// include bytes beyond `width`. The paired-row path also decrements the row
// counter by 2. This looks like it relies on callers passing a width that is
// a multiple of 4 (>= 12 for the paired path) and an even, positive height
// there - confirm against callers.
// NOTE(review): the total is accumulated and returned in 32 bits; presumably
// block sizes are small enough that it cannot wrap - confirm.
static inline uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int width, int height) {
  uint32x4_t acc = vdupq_n_u32(0);
  const int width_mod8 = width & 0x07;
  int rows_left = height;

  if (width_mod8 == 0 || width_mod8 >= 5) {
    // One row per iteration, 8 bytes per step.
    do {
      int col = 0;
      do {
        sse_8x1_neon(src + col, ref + col, &acc);
        col += 8;
      } while (col < width);

      src += src_stride;
      ref += ref_stride;
      rows_left--;
    } while (rows_left != 0);
  } else {
    // Two rows per iteration: 8-byte steps followed by one 4x2 tail patch.
    do {
      int col = 0;
      do {
        sse_8x1_neon(src + col, ref + col, &acc);
        sse_8x1_neon(src + col + src_stride, ref + col + ref_stride, &acc);
        col += 8;
      } while (col + 4 < width);

      sse_4x2_neon(src + col, src_stride, ref + col, ref_stride, &acc);

      src += 2 * src_stride;
      ref += 2 * ref_stride;
      rows_left -= 2;
    } while (rows_left != 0);
  }

  return horizontal_add_u32x4(acc);
}
| 87 | |
// Sum of squared differences for a block that is 128 bytes wide and `height`
// rows tall.
//
// Each row is consumed as eight 16-byte groups; consecutive groups alternate
// between two independent accumulators, which are added together at the end.
// The row loop is a do-while, so at least one row is always processed.
// NOTE(review): the total is returned in 32 bits; presumably `height` is
// bounded so it cannot wrap - confirm.
static inline uint32_t sse_128xh_neon(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      int height) {
  uint32x4_t acc_even = vdupq_n_u32(0);
  uint32x4_t acc_odd = vdupq_n_u32(0);
  int rows_left = height;

  do {
    for (int col = 0; col < 128; col += 32) {
      sse_16x1_neon(src + col, ref + col, &acc_even);
      sse_16x1_neon(src + col + 16, ref + col + 16, &acc_odd);
    }

    src += src_stride;
    ref += ref_stride;
    rows_left--;
  } while (rows_left != 0);

  return horizontal_add_u32x4(vaddq_u32(acc_even, acc_odd));
}
| 110 | |
// Sum of squared differences for a block that is 64 bytes wide and `height`
// rows tall.
//
// Each row is consumed as four 16-byte groups; consecutive groups alternate
// between two independent accumulators, which are added together at the end.
// The row loop is a do-while, so at least one row is always processed.
static inline uint32_t sse_64xh_neon(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     int height) {
  uint32x4_t acc_even = vdupq_n_u32(0);
  uint32x4_t acc_odd = vdupq_n_u32(0);
  int rows_left = height;

  do {
    for (int col = 0; col < 64; col += 32) {
      sse_16x1_neon(src + col, ref + col, &acc_even);
      sse_16x1_neon(src + col + 16, ref + col + 16, &acc_odd);
    }

    src += src_stride;
    ref += ref_stride;
    rows_left--;
  } while (rows_left != 0);

  return horizontal_add_u32x4(vaddq_u32(acc_even, acc_odd));
}
| 129 | |
// Sum of squared differences for a block that is 32 bytes wide and `height`
// rows tall.
//
// The left and right 16-byte halves of every row feed two independent
// accumulators, which are added together at the end. The row loop is a
// do-while, so at least one row is always processed.
static inline uint32_t sse_32xh_neon(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     int height) {
  uint32x4_t acc_left = vdupq_n_u32(0);
  uint32x4_t acc_right = vdupq_n_u32(0);
  int rows_left = height;

  do {
    sse_16x1_neon(src, ref, &acc_left);
    sse_16x1_neon(src + 16, ref + 16, &acc_right);

    src += src_stride;
    ref += ref_stride;
    rows_left--;
  } while (rows_left != 0);

  return horizontal_add_u32x4(vaddq_u32(acc_left, acc_right));
}
| 146 | |
// Sum of squared differences for a block that is 16 bytes wide and `height`
// rows tall.
//
// Rows are processed in pairs: the first row of each pair feeds one
// accumulator and the second row feeds the other; the two are added together
// at the end.
// NOTE(review): the row counter drops by 2 per iteration and the loop ends
// only when it reaches exactly 0, so `height` is presumably always even and
// positive - confirm against callers.
static inline uint32_t sse_16xh_neon(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     int height) {
  uint32x4_t acc_first = vdupq_n_u32(0);
  uint32x4_t acc_second = vdupq_n_u32(0);
  int rows_left = height;

  do {
    // First row of the pair.
    sse_16x1_neon(src, ref, &acc_first);
    src += src_stride;
    ref += ref_stride;

    // Second row of the pair.
    sse_16x1_neon(src, ref, &acc_second);
    src += src_stride;
    ref += ref_stride;

    rows_left -= 2;
  } while (rows_left != 0);

  return horizontal_add_u32x4(vaddq_u32(acc_first, acc_second));
}
| 165 | |
// Sum of squared differences for a block that is 8 bytes wide and `height`
// rows tall, one row per iteration. The row loop is a do-while, so at least
// one row is always processed.
static inline uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int height) {
  uint32x4_t acc = vdupq_n_u32(0);
  int rows_left = height;

  do {
    sse_8x1_neon(src, ref, &acc);

    src += src_stride;
    ref += ref_stride;
    rows_left--;
  } while (rows_left != 0);

  return horizontal_add_u32x4(acc);
}
| 181 | |
// Sum of squared differences for a block that is 4 bytes wide and `height`
// rows tall, handled as a sequence of 4x2 patches.
// NOTE(review): the row counter drops by 2 per iteration and the loop ends
// only when it reaches exactly 0, so `height` is presumably always even and
// positive - confirm against callers.
static inline uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int height) {
  uint32x4_t acc = vdupq_n_u32(0);
  int rows_left = height;

  do {
    sse_4x2_neon(src, src_stride, ref, ref_stride, &acc);

    src += 2 * src_stride;
    ref += 2 * ref_stride;
    rows_left -= 2;
  } while (rows_left != 0);

  return horizontal_add_u32x4(acc);
}
| 198 | |
// Returns the sum of squared differences between the `width` x `height`
// blocks at `src` and `ref` (row pitches `src_stride` and `ref_stride`).
//
// Widths of 4, 8, 16, 32, 64 and 128 are routed to dedicated kernels; every
// other width goes through the generic sse_wxh_neon() path. The helpers
// return an unsigned 32-bit total, which is widened to int64_t on return.
int64_t aom_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref,
                     int ref_stride, int width, int height) {
  if (width == 4) {
    return sse_4xh_neon(src, src_stride, ref, ref_stride, height);
  }
  if (width == 8) {
    return sse_8xh_neon(src, src_stride, ref, ref_stride, height);
  }
  if (width == 16) {
    return sse_16xh_neon(src, src_stride, ref, ref_stride, height);
  }
  if (width == 32) {
    return sse_32xh_neon(src, src_stride, ref, ref_stride, height);
  }
  if (width == 64) {
    return sse_64xh_neon(src, src_stride, ref, ref_stride, height);
  }
  if (width == 128) {
    return sse_128xh_neon(src, src_stride, ref, ref_stride, height);
  }
  return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height);
}