/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"

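// Process one row of 8 pixels: compute |wsrc - mask * ref| for each element,
// apply a rounding shift right by 12 and accumulate the result into *sum. The
// 32-bit mask values fit in 16 bits, so they are narrowed before the widening
// multiply with the reference.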
static INLINE void obmc_sad_8x1_s16_neon(int16x8_t ref_s16, const int32_t *mask,
                                         const int32_t *wsrc, uint32x4_t *sum) {
  int32x4_t wsrc_lo = vld1q_s32(wsrc);
  int32x4_t wsrc_hi = vld1q_s32(wsrc + 4);

  int32x4_t mask_lo = vld1q_s32(mask);
  int32x4_t mask_hi = vld1q_s32(mask + 4);

  int16x8_t mask_s16 =
      vuzpq_s16(vreinterpretq_s16_s32(mask_lo), vreinterpretq_s16_s32(mask_hi))
          .val[0];

  int32x4_t pre_lo = vmull_s16(vget_low_s16(ref_s16), vget_low_s16(mask_s16));
  int32x4_t pre_hi = vmull_s16(vget_high_s16(ref_s16), vget_high_s16(mask_s16));

  uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo));
  uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi));

  *sum = vrsraq_n_u32(*sum, abs_lo, 12);
  *sum = vrsraq_n_u32(*sum, abs_hi, 12);
}

#if AOM_ARCH_AARCH64

// Use tbl for doing a double-width zero extension from 8->32 bits since we can
// do this in one instruction rather than two (indices out of range (255 here)
// are set to zero by tbl).
DECLARE_ALIGNED(16, static const uint8_t, obmc_variance_permute_idx[]) = {
  0,  255, 255, 255, 1,  255, 255, 255, 2,  255, 255, 255, 3,  255, 255, 255,
  4,  255, 255, 255, 5,  255, 255, 255, 6,  255, 255, 255, 7,  255, 255, 255,
  8,  255, 255, 255, 9,  255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255,
  12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255
};

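// Process one row of 8 pixels where the reference samples have already been
// zero-extended to 32 bits: compute |wsrc - mask * ref|, apply a rounding
// shift right by 12 and accumulate into the two sum accumulators.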
static INLINE void obmc_sad_8x1_s32_neon(uint32x4_t ref_u32_lo,
                                         uint32x4_t ref_u32_hi,
                                         const int32_t *mask,
                                         const int32_t *wsrc,
                                         uint32x4_t sum[2]) {
  int32x4_t wsrc_lo = vld1q_s32(wsrc);
  int32x4_t wsrc_hi = vld1q_s32(wsrc + 4);
  int32x4_t mask_lo = vld1q_s32(mask);
  int32x4_t mask_hi = vld1q_s32(mask + 4);

  int32x4_t pre_lo = vmulq_s32(vreinterpretq_s32_u32(ref_u32_lo), mask_lo);
  int32x4_t pre_hi = vmulq_s32(vreinterpretq_s32_u32(ref_u32_hi), mask_hi);

  uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo));
  uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi));

  sum[0] = vrsraq_n_u32(sum[0], abs_lo, 12);
  sum[1] = vrsraq_n_u32(sum[1], abs_hi, 12);
}

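// Compute the OBMC SAD for blocks whose width is a multiple of 16, consuming
// 16 reference pixels per inner-loop iteration.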
static INLINE unsigned int obmc_sad_large_neon(const uint8_t *ref,
                                               int ref_stride,
                                               const int32_t *wsrc,
                                               const int32_t *mask, int width,
                                               int height) {
  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

  // Use tbl for doing a double-width zero extension from 8->32 bits since we
  // can do this in one instruction rather than two.
  uint8x16_t pre_idx0 = vld1q_u8(&obmc_variance_permute_idx[0]);
  uint8x16_t pre_idx1 = vld1q_u8(&obmc_variance_permute_idx[16]);
  uint8x16_t pre_idx2 = vld1q_u8(&obmc_variance_permute_idx[32]);
  uint8x16_t pre_idx3 = vld1q_u8(&obmc_variance_permute_idx[48]);

  int h = height;
  do {
    int w = width;
    const uint8_t *ref_ptr = ref;
    do {
      uint8x16_t r = vld1q_u8(ref_ptr);

      uint32x4_t ref_u32_lo = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx0));
      uint32x4_t ref_u32_hi = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx1));
      obmc_sad_8x1_s32_neon(ref_u32_lo, ref_u32_hi, mask, wsrc, sum);

      ref_u32_lo = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx2));
      ref_u32_hi = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx3));
      obmc_sad_8x1_s32_neon(ref_u32_lo, ref_u32_hi, mask + 8, wsrc + 8, sum);

      ref_ptr += 16;
      wsrc += 16;
      mask += 16;
      w -= 16;
    } while (w != 0);

    ref += ref_stride;
  } while (--h != 0);

  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
}

#else  // !AOM_ARCH_AARCH64

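// Armv7-compatible path: widen each half of the reference vector to 16 bits
// and use widening multiplies rather than the tbl-based 32-bit path above.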
static INLINE unsigned int obmc_sad_large_neon(const uint8_t *ref,
                                               int ref_stride,
                                               const int32_t *wsrc,
                                               const int32_t *mask, int width,
                                               int height) {
  uint32x4_t sum = vdupq_n_u32(0);

  int h = height;
  do {
    int w = width;
    const uint8_t *ref_ptr = ref;
    do {
      uint8x16_t r = vld1q_u8(ref_ptr);

      int16x8_t ref_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(r)));
      obmc_sad_8x1_s16_neon(ref_s16, mask, wsrc, &sum);

      ref_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(r)));
      obmc_sad_8x1_s16_neon(ref_s16, mask + 8, wsrc + 8, &sum);

      ref_ptr += 16;
      wsrc += 16;
      mask += 16;
      w -= 16;
    } while (w != 0);

    ref += ref_stride;
  } while (--h != 0);

  return horizontal_add_u32x4(sum);
}

#endif  // AOM_ARCH_AARCH64

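// Thin per-width wrappers over obmc_sad_large_neon for block widths of 16 and
// above.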
static INLINE unsigned int obmc_sad_128xh_neon(const uint8_t *ref,
                                               int ref_stride,
                                               const int32_t *wsrc,
                                               const int32_t *mask, int h) {
  return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 128, h);
}

static INLINE unsigned int obmc_sad_64xh_neon(const uint8_t *ref,
                                              int ref_stride,
                                              const int32_t *wsrc,
                                              const int32_t *mask, int h) {
  return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h);
}

static INLINE unsigned int obmc_sad_32xh_neon(const uint8_t *ref,
                                              int ref_stride,
                                              const int32_t *wsrc,
                                              const int32_t *mask, int h) {
  return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 32, h);
}

static INLINE unsigned int obmc_sad_16xh_neon(const uint8_t *ref,
                                              int ref_stride,
                                              const int32_t *wsrc,
                                              const int32_t *mask, int h) {
  return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h);
}

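// 8-wide blocks: process one row of 8 pixels per iteration.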
static INLINE unsigned int obmc_sad_8xh_neon(const uint8_t *ref, int ref_stride,
                                             const int32_t *wsrc,
                                             const int32_t *mask, int height) {
  uint32x4_t sum = vdupq_n_u32(0);

  int h = height;
  do {
    uint8x8_t r = vld1_u8(ref);

    int16x8_t ref_s16 = vreinterpretq_s16_u16(vmovl_u8(r));
    obmc_sad_8x1_s16_neon(ref_s16, mask, wsrc, &sum);

    ref += ref_stride;
    wsrc += 8;
    mask += 8;
  } while (--h != 0);

  return horizontal_add_u32x4(sum);
}

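// 4-wide blocks: load two rows (2x4 pixels) per iteration so each call to the
// 8x1 helper operates on a full 8-element vector.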
static INLINE unsigned int obmc_sad_4xh_neon(const uint8_t *ref, int ref_stride,
                                             const int32_t *wsrc,
                                             const int32_t *mask, int height) {
  uint32x4_t sum = vdupq_n_u32(0);

  int h = height / 2;
  do {
    uint8x8_t r = load_unaligned_u8(ref, ref_stride);

    int16x8_t ref_s16 = vreinterpretq_s16_u16(vmovl_u8(r));
    obmc_sad_8x1_s16_neon(ref_s16, mask, wsrc, &sum);

    ref += 2 * ref_stride;
    wsrc += 8;
    mask += 8;
  } while (--h != 0);

  return horizontal_add_u32x4(sum);
}

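// Generate aom_obmc_sad<w>x<h>_neon() for each supported block size.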
#define OBMC_SAD_WXH_NEON(w, h)                                   \
  unsigned int aom_obmc_sad##w##x##h##_neon(                      \
      const uint8_t *ref, int ref_stride, const int32_t *wsrc,    \
      const int32_t *mask) {                                      \
    return obmc_sad_##w##xh_neon(ref, ref_stride, wsrc, mask, h); \
  }

OBMC_SAD_WXH_NEON(4, 4)
OBMC_SAD_WXH_NEON(4, 8)
OBMC_SAD_WXH_NEON(4, 16)

OBMC_SAD_WXH_NEON(8, 4)
OBMC_SAD_WXH_NEON(8, 8)
OBMC_SAD_WXH_NEON(8, 16)
OBMC_SAD_WXH_NEON(8, 32)

OBMC_SAD_WXH_NEON(16, 4)
OBMC_SAD_WXH_NEON(16, 8)
OBMC_SAD_WXH_NEON(16, 16)
OBMC_SAD_WXH_NEON(16, 32)
OBMC_SAD_WXH_NEON(16, 64)

OBMC_SAD_WXH_NEON(32, 8)
OBMC_SAD_WXH_NEON(32, 16)
OBMC_SAD_WXH_NEON(32, 32)
OBMC_SAD_WXH_NEON(32, 64)

OBMC_SAD_WXH_NEON(64, 16)
OBMC_SAD_WXH_NEON(64, 32)
OBMC_SAD_WXH_NEON(64, 64)
OBMC_SAD_WXH_NEON(64, 128)

OBMC_SAD_WXH_NEON(128, 64)
OBMC_SAD_WXH_NEON(128, 128)