Steinar Midtskogen | a5f8ea1 | 2016-08-03 13:17:33 +0200 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
| 3 | * |
| 4 | * This source code is subject to the terms of the BSD 2 Clause License and |
| 5 | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 6 | * was not distributed with this source code in the LICENSE file, you can |
| 7 | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 8 | * Media Patent License 1.0 was not distributed with this source code in the |
| 9 | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| 10 | */ |
| 11 | |
| 12 | #ifndef _V64_INTRINSICS_H |
| 13 | #define _V64_INTRINSICS_H |
| 14 | |
| 15 | #include <arm_neon.h> |
| 16 | #include "./v64_intrinsics_arm.h" |
Steinar Midtskogen | 7b7624e | 2016-09-01 19:45:29 +0200 | [diff] [blame] | 17 | #include "aom_ports/arm.h" |
Steinar Midtskogen | a5f8ea1 | 2016-08-03 13:17:33 +0200 | [diff] [blame] | 18 | |
Steinar Midtskogen | 7b7624e | 2016-09-01 19:45:29 +0200 | [diff] [blame] | 19 | #ifdef AOM_INCOMPATIBLE_GCC |
| 20 | #error Incompatible gcc |
Steinar Midtskogen | a5f8ea1 | 2016-08-03 13:17:33 +0200 | [diff] [blame] | 21 | #endif |
| 22 | |
/* 64-bit SIMD vector, held as a single signed 64-bit NEON lane. */
typedef int64x1_t v64;

/* Returns 32-bit lane 0 of a as an unsigned integer. */
SIMD_INLINE uint32_t v64_low_u32(v64 a) {
  const uint32x2_t lanes = vreinterpret_u32_s64(a);
  return vget_lane_u32(lanes, 0);
}
| 28 | |
/* Returns 32-bit lane 1 of a as an unsigned integer. */
SIMD_INLINE uint32_t v64_high_u32(v64 a) {
  const uint32x2_t lanes = vreinterpret_u32_s64(a);
  return vget_lane_u32(lanes, 1);
}
| 32 | |
/* Returns 32-bit lane 0 of a as a signed integer. */
SIMD_INLINE int32_t v64_low_s32(v64 a) {
  const int32x2_t lanes = vreinterpret_s32_s64(a);
  return vget_lane_s32(lanes, 0);
}
| 36 | |
/* Returns 32-bit lane 1 of a as a signed integer. */
SIMD_INLINE int32_t v64_high_s32(v64 a) {
  const int32x2_t lanes = vreinterpret_s32_s64(a);
  return vget_lane_s32(lanes, 1);
}
| 40 | |
/* Builds a vector from four 16-bit values: a lands in bits 63..48, b in
 * bits 47..32, c in bits 31..16 and d in bits 15..0. */
SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
  uint64_t bits = (uint64_t)a << 48;
  bits |= (uint64_t)b << 32;
  bits |= (uint64_t)c << 16;
  bits |= d;
  return vcreate_s64(bits);
}
| 45 | |
/* Builds a vector with x in bits 63..32 and y in bits 31..0. */
SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
  const uint64_t upper = (uint64_t)x << 32;
  return vcreate_s64(upper | y);
}
| 49 | |
/* Builds a vector whose 64 bits are the bits of x. */
SIMD_INLINE v64 v64_from_64(uint64_t x) {
  const v64 result = vcreate_s64(x);
  return result;
}
| 51 | |
/* Returns the vector converted to a plain 64-bit unsigned integer via a
 * vector-to-scalar cast. */
SIMD_INLINE uint64_t v64_u64(v64 x) {
  const uint64_t bits = (uint64_t)x;
  return bits;
}
Steinar Midtskogen | a5f8ea1 | 2016-08-03 13:17:33 +0200 | [diff] [blame] | 53 | |
| 54 | SIMD_INLINE uint32_t u32_load_aligned(const void *p) { |
| 55 | return *((uint32_t *)p); |
| 56 | } |
| 57 | |
/* Reads a 32-bit value from p using a byte-wise vector load, then returns
 * 32-bit lane 0 of what was loaded.
 * NOTE(review): vld1_u8 loads 8 bytes starting at p even though only the
 * first 4 are returned -- confirm callers always have 8 readable bytes. */
SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
  const uint8x8_t bytes = vld1_u8((const uint8_t *)p);
  return vget_lane_u32(vreinterpret_u32_u8(bytes), 0);
}
| 61 | |
| 62 | SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { |
| 63 | *((uint32_t *)p) = a; |
| 64 | } |
| 65 | |
/* Stores the 32-bit value a at p without requiring 4-byte alignment.
 * The store is spelled differently per compiler:
 *   - clang and unknown compilers: single-lane NEON store of lane 0;
 *   - ARM Compiler (__CC_ARM): store through a __packed pointer;
 *   - GCC: store through a packed-attributed pointer. */
SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
#if defined(__clang__)
  /* The scalar is cast to an unsigned 64-bit vector, so reinterpret from u64
   * rather than relying on an implicit u64 -> s64 vector conversion. */
  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u64((uint64x1_t)(uint64_t)a),
                0);
#elif defined(__CC_ARM)
  /* Parentheses balanced here; the previous spelling had a stray ')' and did
   * not compile under this branch. */
  *((__packed uint32_t *)p) = a;
#elif defined(__GNUC__)
  /* NOTE(review): confirm that GCC honours the packed attribute on a scalar
   * pointee here; if it is ignored this is an ordinary aligned store. */
  *((__attribute((packed)) uint32_t *)p) = a;
#else
  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u64((uint64x1_t)(uint64_t)a),
                0);
#endif
}
| 79 | |
/* Loads 8 bytes from p into a vector using a byte-wise NEON load. */
SIMD_INLINE v64 v64_load_aligned(const void *p) {
  const uint8x8_t bytes = vld1_u8((const uint8_t *)p);
  return vreinterpret_s64_u8(bytes);
}
| 83 | |
/* Loads 8 bytes from p; forwards to v64_load_aligned(), so both entry points
 * share one implementation. */
SIMD_INLINE v64 v64_load_unaligned(const void *p) {
  const v64 loaded = v64_load_aligned(p);
  return loaded;
}
| 87 | |
/* Stores the 8 bytes of r at p using a byte-wise NEON store. */
SIMD_INLINE void v64_store_aligned(void *p, v64 r) {
  const uint8x8_t bytes = vreinterpret_u8_s64(r);
  vst1_u8((uint8_t *)p, bytes);
}
| 91 | |
/* Stores the 8 bytes of r at p using a byte-wise NEON store (same operation
 * as the aligned variant). */
SIMD_INLINE void v64_store_unaligned(void *p, v64 r) {
  const uint8x8_t bytes = vreinterpret_u8_s64(r);
  vst1_u8((uint8_t *)p, bytes);
}
| 95 | |
// The following function requires an immediate.
// Some compilers will check this if it's optimising, others won't.
//
// Returns b unchanged when c is 0. Otherwise returns 8 bytes taken from the
// 16-byte concatenation a:b (a in the upper half), starting c bytes above the
// bottom of b.
SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
  /* c is passed straight to vext_s8 so it can be resolved as an immediate. */
  const int8x8_t lower = vreinterpret_s8_s64(b);
  const int8x8_t upper = vreinterpret_s8_s64(a);
  return c ? vreinterpret_s64_s8(vext_s8(lower, upper, c)) : b;
#else
  /* Scalar fallback: shift b down and a up, then merge. */
  const uint64_t lower = (uint64_t)b;
  const uint64_t upper = (uint64_t)a;
  return c ? v64_from_64((lower >> c * 8) | (upper << (8 - c) * 8)) : b;
#endif
}
| 108 | |
/* Returns a vector with all 64 bits cleared.
 * Declared with (void) so the definition is a proper prototype in C. */
SIMD_INLINE v64 v64_zero(void) { return vreinterpret_s64_u8(vdup_n_u8(0)); }
| 110 | |
/* Returns a vector with x replicated into all eight 8-bit lanes. */
SIMD_INLINE v64 v64_dup_8(uint8_t x) {
  const uint8x8_t lanes = vdup_n_u8(x);
  return vreinterpret_s64_u8(lanes);
}
| 114 | |
/* Returns a vector with x replicated into all four 16-bit lanes. */
SIMD_INLINE v64 v64_dup_16(uint16_t x) {
  const uint16x4_t lanes = vdup_n_u16(x);
  return vreinterpret_s64_u16(lanes);
}
| 118 | |
/* Returns a vector with x replicated into both 32-bit lanes. */
SIMD_INLINE v64 v64_dup_32(uint32_t x) {
  const uint32x2_t lanes = vdup_n_u32(x);
  return vreinterpret_s64_u32(lanes);
}
| 122 | |
/* Dot product of eight byte lanes: x is widened as signed bytes, y as
 * unsigned bytes; the 16-bit products are summed into one 64-bit value. */
SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
  const int16x8_t xs = vmovl_s8(vreinterpret_s8_s64(x));
  const int16x8_t yu = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y)));
  const int64x2_t sums = vpaddlq_s32(vpaddlq_s16(vmulq_s16(xs, yu)));
  return (int64_t)vadd_s64(vget_high_s64(sums), vget_low_s64(sums));
}
| 129 | |
/* Dot product of four signed 16-bit lanes, accumulated to a 64-bit value. */
SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
  const int32x4_t products =
      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y));
  const int64x2_t sums = vpaddlq_s32(products);
  return (int64_t)(vget_high_s64(sums) + vget_low_s64(sums));
}
| 135 | |
/* Sums the eight unsigned byte lanes of x using successive pairwise widening
 * additions (8 -> 16 -> 32 -> 64 bits). */
SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
  const uint16x4_t sum16 = vpaddl_u8(vreinterpret_u8_s64(x));
  const uint32x2_t sum32 = vpaddl_u16(sum16);
  return (uint64_t)vpaddl_u32(sum32);
}
| 139 | |
/* Sums the four signed 16-bit lanes of a using pairwise widening additions
 * (16 -> 32 -> 64 bits). */
SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
  const int32x2_t sum32 = vpaddl_s16(vreinterpret_s16_s64(a));
  return (int64_t)vpaddl_s32(sum32);
}
| 143 | |
/* Accumulator for v64_sad_u8(): eight unsigned 16-bit partial sums. */
typedef uint16x8_t sad64_internal;

/* Returns a zeroed SAD accumulator.
 * Declared with (void) so the definition is a proper prototype in C. */
SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return vdupq_n_u16(0); }
| 147 | |
/* Implementation dependent return value.  Result must be finalised with
   v64_sad_u8_sum().
   The result for more than 32 v64_sad_u8() calls is undefined.
   Adds the per-byte absolute differences of a and b to the 16-bit lanes
   of s. */
SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
  const uint8x8_t lhs = vreinterpret_u8_s64(a);
  const uint8x8_t rhs = vreinterpret_u8_s64(b);
  return vabal_u8(s, lhs, rhs);
}
| 154 | |
/* Reduces the eight 16-bit partial sums in s to a single total and returns
 * its low 32 bits. */
SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
  const uint32x4_t sum32 = vpaddlq_u16(s);
  const uint64x2_t sum64 = vpaddlq_u32(sum32);
  return (uint32_t)(uint64_t)(vget_high_u64(sum64) + vget_low_u64(sum64));
}
| 159 | |
/* Accumulator for v64_ssd_u8(): a single 64-bit running total. */
typedef int64x1_t ssd64_internal;

/* Returns a zeroed SSD accumulator.
 * Declared with (void) so the definition is a proper prototype in C. */
SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) {
  return (ssd64_internal)(uint64_t)0;
}
Steinar Midtskogen | a5f8ea1 | 2016-08-03 13:17:33 +0200 | [diff] [blame] | 165 | |
/* Implementation dependent return value.  Result must be finalised with
 * v64_ssd_u8_sum().
 * Adds the sum of squared per-byte differences of a and b to s.
 * The accumulator type is a signed 64-bit vector while the arithmetic is done
 * on unsigned vectors, so the conversions are spelled out with explicit
 * reinterprets instead of relying on lax vector conversions. */
SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
  const uint8x8_t diff =
      vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
  /* Square each difference (widening to 16 bits), then reduce pairwise. */
  const uint64x2_t halves = vpaddlq_u32(vpaddlq_u16(vmull_u8(diff, diff)));
  const uint64x1_t total = vadd_u64(vget_high_u64(halves), vget_low_u64(halves));
  return vreinterpret_s64_u64(vadd_u64(vreinterpret_u64_s64(s), total));
}
| 173 | |
/* Returns the low 32 bits of the accumulated SSD total in s. */
SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
  const uint64_t total = (uint64_t)s;
  return (uint32_t)total;
}
Steinar Midtskogen | a5f8ea1 | 2016-08-03 13:17:33 +0200 | [diff] [blame] | 177 | |
/* Bitwise OR of x and y. */
SIMD_INLINE v64 v64_or(v64 x, v64 y) {
  const v64 result = vorr_s64(x, y);
  return result;
}
| 179 | |
/* Bitwise exclusive OR of x and y. */
SIMD_INLINE v64 v64_xor(v64 x, v64 y) {
  const v64 result = veor_s64(x, y);
  return result;
}
| 181 | |
/* Bitwise AND of x and y. */
SIMD_INLINE v64 v64_and(v64 x, v64 y) {
  const v64 result = vand_s64(x, y);
  return result;
}
| 183 | |
/* Bit clear: x AND (NOT y), implemented with the NEON vbic operation. */
SIMD_INLINE v64 v64_andn(v64 x, v64 y) {
  const v64 result = vbic_s64(x, y);
  return result;
}
| 185 | |
/* Lane-wise addition of the eight 8-bit lanes of x and y. */
SIMD_INLINE v64 v64_add_8(v64 x, v64 y) {
  const uint8x8_t lhs = vreinterpret_u8_s64(x);
  const uint8x8_t rhs = vreinterpret_u8_s64(y);
  return vreinterpret_s64_u8(vadd_u8(lhs, rhs));
}
| 190 | |
/* Lane-wise addition of the four 16-bit lanes of x and y. */
SIMD_INLINE v64 v64_add_16(v64 x, v64 y) {
  const int16x4_t lhs = vreinterpret_s16_s64(x);
  const int16x4_t rhs = vreinterpret_s16_s64(y);
  return vreinterpret_s64_s16(vadd_s16(lhs, rhs));
}
| 195 | |
/* Lane-wise saturating addition of the four signed 16-bit lanes. */
SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) {
  const int16x4_t lhs = vreinterpret_s16_s64(x);
  const int16x4_t rhs = vreinterpret_s16_s64(y);
  return vreinterpret_s64_s16(vqadd_s16(lhs, rhs));
}
| 200 | |
/* Lane-wise addition of the two 32-bit lanes of x and y. */
SIMD_INLINE v64 v64_add_32(v64 x, v64 y) {
  const uint32x2_t lhs = vreinterpret_u32_s64(x);
  const uint32x2_t rhs = vreinterpret_u32_s64(y);
  return vreinterpret_s64_u32(vadd_u32(lhs, rhs));
}
| 205 | |
/* Lane-wise subtraction (x - y) over the eight 8-bit lanes. */
SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) {
  const uint8x8_t lhs = vreinterpret_u8_s64(x);
  const uint8x8_t rhs = vreinterpret_u8_s64(y);
  return vreinterpret_s64_u8(vsub_u8(lhs, rhs));
}
| 210 | |
/* Lane-wise subtraction (x - y) over the four 16-bit lanes. */
SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) {
  const int16x4_t lhs = vreinterpret_s16_s64(x);
  const int16x4_t rhs = vreinterpret_s16_s64(y);
  return vreinterpret_s64_s16(vsub_s16(lhs, rhs));
}
| 215 | |
/* Lane-wise saturating subtraction (x - y) of signed 16-bit lanes. */
SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) {
  const int16x4_t lhs = vreinterpret_s16_s64(x);
  const int16x4_t rhs = vreinterpret_s16_s64(y);
  return vreinterpret_s64_s16(vqsub_s16(lhs, rhs));
}
| 220 | |
/* Lane-wise saturating subtraction (x - y) of unsigned 16-bit lanes. */
SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) {
  const uint16x4_t lhs = vreinterpret_u16_s64(x);
  const uint16x4_t rhs = vreinterpret_u16_s64(y);
  return vreinterpret_s64_u16(vqsub_u16(lhs, rhs));
}
| 225 | |
/* Lane-wise saturating subtraction (x - y) of unsigned 8-bit lanes. */
SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) {
  const uint8x8_t lhs = vreinterpret_u8_s64(x);
  const uint8x8_t rhs = vreinterpret_u8_s64(y);
  return vreinterpret_s64_u8(vqsub_u8(lhs, rhs));
}
| 230 | |
/* Lane-wise saturating subtraction (x - y) of signed 8-bit lanes. */
SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) {
  const int8x8_t lhs = vreinterpret_s8_s64(x);
  const int8x8_t rhs = vreinterpret_s8_s64(y);
  return vreinterpret_s64_s8(vqsub_s8(lhs, rhs));
}
| 235 | |
/* Lane-wise subtraction (x - y) over the two 32-bit lanes. */
SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) {
  const int32x2_t lhs = vreinterpret_s32_s64(x);
  const int32x2_t rhs = vreinterpret_s32_s64(y);
  return vreinterpret_s64_s32(vsub_s32(lhs, rhs));
}
| 240 | |
/* Lane-wise absolute value of the four signed 16-bit lanes. */
SIMD_INLINE v64 v64_abs_s16(v64 x) {
  const int16x4_t lanes = vreinterpret_s16_s64(x);
  return vreinterpret_s64_s16(vabs_s16(lanes));
}
| 244 | |
/* Lane-wise absolute value of the eight signed 8-bit lanes. */
SIMD_INLINE v64 v64_abs_s8(v64 x) {
  const int8x8_t lanes = vreinterpret_s8_s64(x);
  return vreinterpret_s64_s8(vabs_s8(lanes));
}
| 248 | |
/* Lane-wise 16-bit multiply, keeping the low 16 bits of each product. */
SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) {
  const int16x4_t lhs = vreinterpret_s16_s64(x);
  const int16x4_t rhs = vreinterpret_s16_s64(y);
  return vreinterpret_s64_s16(vmul_s16(lhs, rhs));
}
| 253 | |
/* Lane-wise signed 16-bit multiply, keeping the high 16 bits of each 32-bit
 * product (widen, shift right by 16, narrow). */
SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) {
  const int32x4_t products =
      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y));
  const int32x4_t upper = vshrq_n_s32(products, 16);
  return vreinterpret_s64_s16(vmovn_s32(upper));
}
| 258 | |
/* Lane-wise 32-bit multiply, keeping the low 32 bits of each product. */
SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) {
  const int32x2_t lhs = vreinterpret_s32_s64(x);
  const int32x2_t rhs = vreinterpret_s32_s64(y);
  return vreinterpret_s64_s32(vmul_s32(lhs, rhs));
}
| 263 | |
/* Multiplies the signed 16-bit lanes of x and y into 32-bit products, then
 * adds adjacent products pairwise, giving two 32-bit sums. */
SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) {
  const int32x4_t products =
      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y));
  const int64x2_t as64 = vreinterpretq_s64_s32(products);
  const int32x2_t low = vreinterpret_s32_s64(vget_low_s64(as64));
  const int32x2_t high = vreinterpret_s32_s64(vget_high_s64(as64));
  return vreinterpret_s64_s32(vpadd_s32(low, high));
}
| 270 | |
/* Per-byte multiply of x and y followed by pairwise addition of adjacent
 * products, saturated to signed 16 bits.
 * Each product is formed as (x - 128) * y + (y << 7) using signed multiplies;
 * presumably this is so x can be treated as unsigned and y as signed. */
SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) {
  const int8x8_t ys = vreinterpret_s8_s64(y);
  const int8x8_t x_biased = vadd_s8(vreinterpret_s8_s64(x), vdup_n_s8(-128));
  const int16x8_t biased_products = vmull_s8(x_biased, ys);
  const int16x8_t bias_fixup = vshlq_n_s16(vmovl_s8(ys), 7);
  const int32x4_t pair_sums = vpaddlq_s16(vaddq_s16(biased_products, bias_fixup));
  return vreinterpret_s64_s16(vqmovn_s32(pair_sums));
}
| 277 | |
/* Lane-wise rounding average of unsigned 8-bit lanes (rounding halving
 * add). */
SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) {
  const uint8x8_t lhs = vreinterpret_u8_s64(x);
  const uint8x8_t rhs = vreinterpret_u8_s64(y);
  return vreinterpret_s64_u8(vrhadd_u8(lhs, rhs));
}
| 282 | |
/* Lane-wise round-down average of unsigned 8-bit lanes (halving add without
 * rounding). */
SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) {
  const uint8x8_t lhs = vreinterpret_u8_s64(x);
  const uint8x8_t rhs = vreinterpret_u8_s64(y);
  return vreinterpret_s64_u8(vhadd_u8(lhs, rhs));
}
| 287 | |
/* Lane-wise rounding average of unsigned 16-bit lanes (rounding halving
 * add). */
SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) {
  const uint16x4_t lhs = vreinterpret_u16_s64(x);
  const uint16x4_t rhs = vreinterpret_u16_s64(y);
  return vreinterpret_s64_u16(vrhadd_u16(lhs, rhs));
}
| 292 | |
/* Lane-wise maximum of unsigned 8-bit lanes. */
SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) {
  const uint8x8_t lhs = vreinterpret_u8_s64(x);
  const uint8x8_t rhs = vreinterpret_u8_s64(y);
  return vreinterpret_s64_u8(vmax_u8(lhs, rhs));
}
| 297 | |
/* Lane-wise minimum of unsigned 8-bit lanes. */
SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) {
  const uint8x8_t lhs = vreinterpret_u8_s64(x);
  const uint8x8_t rhs = vreinterpret_u8_s64(y);
  return vreinterpret_s64_u8(vmin_u8(lhs, rhs));
}
| 302 | |
/* Lane-wise maximum of signed 8-bit lanes. */
SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) {
  const int8x8_t lhs = vreinterpret_s8_s64(x);
  const int8x8_t rhs = vreinterpret_s8_s64(y);
  return vreinterpret_s64_s8(vmax_s8(lhs, rhs));
}
| 307 | |
/* Lane-wise minimum of signed 8-bit lanes. */
SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) {
  const int8x8_t lhs = vreinterpret_s8_s64(x);
  const int8x8_t rhs = vreinterpret_s8_s64(y);
  return vreinterpret_s64_s8(vmin_s8(lhs, rhs));
}
| 312 | |
/* Lane-wise maximum of signed 16-bit lanes. */
SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) {
  const int16x4_t lhs = vreinterpret_s16_s64(x);
  const int16x4_t rhs = vreinterpret_s16_s64(y);
  return vreinterpret_s64_s16(vmax_s16(lhs, rhs));
}
| 317 | |
/* Lane-wise minimum of signed 16-bit lanes. */
SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) {
  const int16x4_t lhs = vreinterpret_s16_s64(x);
  const int16x4_t rhs = vreinterpret_s16_s64(y);
  return vreinterpret_s64_s16(vmin_s16(lhs, rhs));
}
| 322 | |
/* Interleaves the 8-bit lanes of y and x (y first) and returns the first
 * half of the zipped result. */
SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) {
  const uint8x8_t first = vreinterpret_u8_s64(y);
  const uint8x8_t second = vreinterpret_u8_s64(x);
  const uint8x8x2_t zipped = vzip_u8(first, second);
  return vreinterpret_s64_u8(zipped.val[0]);
}
| 327 | |
/* Interleaves the 8-bit lanes of y and x (y first) and returns the second
 * half of the zipped result. */
SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) {
  const uint8x8_t first = vreinterpret_u8_s64(y);
  const uint8x8_t second = vreinterpret_u8_s64(x);
  const uint8x8x2_t zipped = vzip_u8(first, second);
  return vreinterpret_s64_u8(zipped.val[1]);
}
| 332 | |
/* Interleaves the 16-bit lanes of y and x (y first) and returns the first
 * half of the zipped result. */
SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) {
  const int16x4_t first = vreinterpret_s16_s64(y);
  const int16x4_t second = vreinterpret_s16_s64(x);
  const int16x4x2_t zipped = vzip_s16(first, second);
  return vreinterpret_s64_s16(zipped.val[0]);
}
| 337 | |
/* Interleaves the 16-bit lanes of y and x (y first) and returns the second
 * half of the zipped result. */
SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) {
  const int16x4_t first = vreinterpret_s16_s64(y);
  const int16x4_t second = vreinterpret_s16_s64(x);
  const int16x4x2_t zipped = vzip_s16(first, second);
  return vreinterpret_s64_s16(zipped.val[1]);
}
| 342 | |
/* Interleaves the 32-bit lanes of y and x (y first) and returns the first
 * half of the zipped result. */
SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) {
  const int32x2_t first = vreinterpret_s32_s64(y);
  const int32x2_t second = vreinterpret_s32_s64(x);
  const int32x2x2_t zipped = vzip_s32(first, second);
  return vreinterpret_s64_s32(zipped.val[0]);
}
| 347 | |
/* Interleaves the 32-bit lanes of y and x (y first) and returns the second
 * half of the zipped result. */
SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) {
  const int32x2_t first = vreinterpret_s32_s64(y);
  const int32x2_t second = vreinterpret_s32_s64(x);
  const int32x2x2_t zipped = vzip_s32(first, second);
  return vreinterpret_s64_s32(zipped.val[1]);
}
| 352 | |
/* Zero-extends the bytes of a to 16 bits and returns the low four results. */
SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
  const uint16x8_t widened = vmovl_u8(vreinterpret_u8_s64(a));
  return vreinterpret_s64_u16(vget_low_u16(widened));
}
| 356 | |
/* Zero-extends the bytes of a to 16 bits and returns the high four
 * results. */
SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
  const uint16x8_t widened = vmovl_u8(vreinterpret_u8_s64(a));
  return vreinterpret_s64_u16(vget_high_u16(widened));
}
| 360 | |
/* Sign-extends the bytes of a to 16 bits and returns the low four results. */
SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
  const int16x8_t widened = vmovl_s8(vreinterpret_s8_s64(a));
  return vreinterpret_s64_s16(vget_low_s16(widened));
}
| 364 | |
/* Sign-extends the bytes of a to 16 bits and returns the high four
 * results. */
SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
  const int16x8_t widened = vmovl_s8(vreinterpret_s8_s64(a));
  return vreinterpret_s64_s16(vget_high_s16(widened));
}
| 368 | |
/* Narrows signed 32-bit lanes to signed 16 bits with saturation; the lanes of
 * y fill the low half of the result and the lanes of x the high half. */
SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) {
  const int32x4_t joined =
      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
  return vreinterpret_s64_s16(vqmovn_s32(joined));
}
| 373 | |
/* Narrows signed 16-bit lanes to unsigned 8 bits with saturation; the lanes
 * of y fill the low half of the result and the lanes of x the high half. */
SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) {
  const int32x4_t joined =
      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
  const int16x8_t lanes = vreinterpretq_s16_s32(joined);
  return vreinterpret_s64_u8(vqmovun_s16(lanes));
}
| 378 | |
/* Narrows signed 16-bit lanes to signed 8 bits with saturation; the lanes of
 * y fill the low half of the result and the lanes of x the high half. */
SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) {
  const int32x4_t joined =
      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
  const int16x8_t lanes = vreinterpretq_s16_s32(joined);
  return vreinterpret_s64_s8(vqmovn_s16(lanes));
}
| 383 | |
/* De-interleaves the 8-bit lanes of y followed by x and returns the first
 * vector of the unzipped pair. */
SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) {
  const uint8x8_t first = vreinterpret_u8_s64(y);
  const uint8x8_t second = vreinterpret_u8_s64(x);
  const uint8x8x2_t unzipped = vuzp_u8(first, second);
  return vreinterpret_s64_u8(unzipped.val[0]);
}
| 388 | |
/* De-interleaves the 8-bit lanes of y followed by x and returns the second
 * vector of the unzipped pair. */
SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) {
  const uint8x8_t first = vreinterpret_u8_s64(y);
  const uint8x8_t second = vreinterpret_u8_s64(x);
  const uint8x8x2_t unzipped = vuzp_u8(first, second);
  return vreinterpret_s64_u8(unzipped.val[1]);
}
| 393 | |
/* De-interleaves the 16-bit lanes of y followed by x and returns the first
 * vector of the unzipped pair. */
SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) {
  const uint16x4_t first = vreinterpret_u16_s64(y);
  const uint16x4_t second = vreinterpret_u16_s64(x);
  const uint16x4x2_t unzipped = vuzp_u16(first, second);
  return vreinterpret_s64_u16(unzipped.val[0]);
}
| 398 | |
/* De-interleaves the 16-bit lanes of y followed by x and returns the second
 * vector of the unzipped pair. */
SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) {
  const uint16x4_t first = vreinterpret_u16_s64(y);
  const uint16x4_t second = vreinterpret_u16_s64(x);
  const uint16x4x2_t unzipped = vuzp_u16(first, second);
  return vreinterpret_s64_u16(unzipped.val[1]);
}
| 403 | |
/* Sign-extends the 16-bit lanes of x to 32 bits and returns the low two
 * results. */
SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) {
  const int32x4_t widened = vmovl_s16(vreinterpret_s16_s64(x));
  return vreinterpret_s64_s32(vget_low_s32(widened));
}
| 407 | |
/* Zero-extends the 16-bit lanes of x to 32 bits and returns the low two
 * results. */
SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) {
  const uint32x4_t widened = vmovl_u16(vreinterpret_u16_s64(x));
  return vreinterpret_s64_u32(vget_low_u32(widened));
}
| 411 | |
/* Sign-extends the 16-bit lanes of x to 32 bits and returns the high two
 * results. */
SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) {
  const int32x4_t widened = vmovl_s16(vreinterpret_s16_s64(x));
  return vreinterpret_s64_s32(vget_high_s32(widened));
}
| 416 | |
/* Zero-extends the 16-bit lanes of x to 32 bits and returns the high two
 * results. */
SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) {
  const uint32x4_t widened = vmovl_u16(vreinterpret_u16_s64(x));
  return vreinterpret_s64_u32(vget_high_u32(widened));
}
| 421 | |
/* Byte shuffle: each byte of pattern selects a byte of x through the NEON
 * single-register table lookup. */
SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
  const uint8x8_t table = vreinterpret_u8_s64(x);
  const uint8x8_t indices = vreinterpret_u8_s64(pattern);
  return vreinterpret_s64_u8(vtbl1_u8(table, indices));
}
| 426 | |
/* Lane-wise signed 8-bit compare x > y, returning the NEON comparison mask. */
SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) {
  const int8x8_t lhs = vreinterpret_s8_s64(x);
  const int8x8_t rhs = vreinterpret_s8_s64(y);
  const uint8x8_t mask = vcgt_s8(lhs, rhs);
  return vreinterpret_s64_u8(mask);
}
| 431 | |
/* Lane-wise signed 8-bit compare x < y, returning the NEON comparison mask. */
SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) {
  const int8x8_t lhs = vreinterpret_s8_s64(x);
  const int8x8_t rhs = vreinterpret_s8_s64(y);
  const uint8x8_t mask = vclt_s8(lhs, rhs);
  return vreinterpret_s64_u8(mask);
}
| 436 | |
/* Lane-wise 8-bit compare x == y, returning the NEON comparison mask. */
SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) {
  const uint8x8_t lhs = vreinterpret_u8_s64(x);
  const uint8x8_t rhs = vreinterpret_u8_s64(y);
  const uint8x8_t mask = vceq_u8(lhs, rhs);
  return vreinterpret_s64_u8(mask);
}
| 441 | |
/* Lane-wise signed 16-bit compare x > y, returning the NEON comparison
 * mask. */
SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) {
  const int16x4_t lhs = vreinterpret_s16_s64(x);
  const int16x4_t rhs = vreinterpret_s16_s64(y);
  const uint16x4_t mask = vcgt_s16(lhs, rhs);
  return vreinterpret_s64_u16(mask);
}
| 446 | |
/* Lane-wise signed 16-bit compare x < y, returning the NEON comparison
 * mask. */
SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) {
  const int16x4_t lhs = vreinterpret_s16_s64(x);
  const int16x4_t rhs = vreinterpret_s16_s64(y);
  const uint16x4_t mask = vclt_s16(lhs, rhs);
  return vreinterpret_s64_u16(mask);
}
| 451 | |
/* Lane-wise 16-bit compare x == y, returning the NEON comparison mask. */
SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) {
  const int16x4_t lhs = vreinterpret_s16_s64(x);
  const int16x4_t rhs = vreinterpret_s16_s64(y);
  const uint16x4_t mask = vceq_s16(lhs, rhs);
  return vreinterpret_s64_u16(mask);
}
| 456 | |
/* Shifts each 8-bit lane of a left by c bits (run-time shift count). */
SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
  const int8x8_t counts = vdup_n_s8(c);
  const uint8x8_t lanes = vreinterpret_u8_s64(a);
  return vreinterpret_s64_u8(vshl_u8(lanes, counts));
}
| 460 | |
/* Logical right shift of each unsigned 8-bit lane of a by c bits.
 * NEON has no variable right shift, so this is a left shift by -c.  The count
 * is negated as a signed int, matching the 16- and 32-bit variants, rather
 * than negating the unsigned value and relying on the narrowing conversion. */
SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
  return vreinterpret_s64_u8(
      vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-(int)c)));
}
| 464 | |
/* Arithmetic right shift of each signed 8-bit lane of a by c bits.
 * NEON has no variable right shift, so this is a left shift by -c.  The count
 * is negated as a signed int, matching the 16- and 32-bit variants, rather
 * than negating the unsigned value and relying on the narrowing conversion. */
SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
  return vreinterpret_s64_s8(
      vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-(int)c)));
}
| 468 | |
/* Shifts each 16-bit lane of a left by c bits (run-time shift count). */
SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
  const int16x4_t counts = vdup_n_s16(c);
  const uint16x4_t lanes = vreinterpret_u16_s64(a);
  return vreinterpret_s64_u16(vshl_u16(lanes, counts));
}
| 472 | |
/* Logical right shift of each unsigned 16-bit lane of a by c bits, expressed
 * as a left shift by the negated count. */
SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
  const int16x4_t counts = vdup_n_s16(-(int)c);
  const uint16x4_t lanes = vreinterpret_u16_s64(a);
  return vreinterpret_s64_u16(vshl_u16(lanes, counts));
}
| 477 | |
/* Arithmetic right shift of each signed 16-bit lane of a by c bits, expressed
 * as a left shift by the negated count. */
SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
  const int16x4_t counts = vdup_n_s16(-(int)c);
  const int16x4_t lanes = vreinterpret_s16_s64(a);
  return vreinterpret_s64_s16(vshl_s16(lanes, counts));
}
| 482 | |
/* Shifts each 32-bit lane of a left by c bits (run-time shift count). */
SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
  const int32x2_t counts = vdup_n_s32(c);
  const uint32x2_t lanes = vreinterpret_u32_s64(a);
  return vreinterpret_s64_u32(vshl_u32(lanes, counts));
}
| 486 | |
/* Logical right shift of each unsigned 32-bit lane of a by c bits, expressed
 * as a left shift by the negated count. */
SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
  const int32x2_t counts = vdup_n_s32(-(int)c);
  const uint32x2_t lanes = vreinterpret_u32_s64(a);
  return vreinterpret_s64_u32(vshl_u32(lanes, counts));
}
| 491 | |
/* Arithmetic right shift of each signed 32-bit lane of a by c bits, expressed
 * as a left shift by the negated count. */
SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
  const int32x2_t counts = vdup_n_s32(-(int)c);
  const int32x2_t lanes = vreinterpret_s32_s64(a);
  return vreinterpret_s64_s32(vshl_s32(lanes, counts));
}
| 496 | |
// The following functions require an immediate.
// Some compilers will check this during optimisation, others won't.
Alex Converse | fa16041 | 2017-03-22 19:59:15 -0700 | [diff] [blame] | 499 | #if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) |
Steinar Midtskogen | a5f8ea1 | 2016-08-03 13:17:33 +0200 | [diff] [blame] | 500 | |
/* Shifts the whole 64-bit vector left by c bytes; c must resolve to a
 * compile-time constant. */
SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
  const v64 shifted = vshl_n_s64(a, c * 8);
  return shifted;
}
| 504 | |
/* Shifts the whole 64-bit vector right (logically) by c bytes; c must resolve
 * to a compile-time constant.  A count of 0 returns a unchanged without
 * issuing the shift. */
SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
  const uint64x1_t bits = vreinterpret_u64_s64(a);
  return c ? (v64)vshr_n_u64(bits, c * 8) : a;
}
| 508 | |
/* Shifts each 8-bit lane left by the immediate count c. */
SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
  const uint8x8_t lanes = vreinterpret_u8_s64(a);
  return vreinterpret_s64_u8(vshl_n_u8(lanes, c));
}
| 512 | |
/* Logical right shift of each unsigned 8-bit lane by the immediate count c. */
SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
  const uint8x8_t lanes = vreinterpret_u8_s64(a);
  return vreinterpret_s64_u8(vshr_n_u8(lanes, c));
}
| 516 | |
/* Arithmetic right shift of each signed 8-bit lane by the immediate count
 * c. */
SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
  const int8x8_t lanes = vreinterpret_s8_s64(a);
  return vreinterpret_s64_s8(vshr_n_s8(lanes, c));
}
| 520 | |
/* Shifts each 16-bit lane left by the immediate count c. */
SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
  const uint16x4_t lanes = vreinterpret_u16_s64(a);
  return vreinterpret_s64_u16(vshl_n_u16(lanes, c));
}
| 524 | |
/* Logical right shift of each unsigned 16-bit lane by the immediate count
 * c. */
SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
  const uint16x4_t lanes = vreinterpret_u16_s64(a);
  return vreinterpret_s64_u16(vshr_n_u16(lanes, c));
}
| 528 | |
/* Arithmetic right shift of each signed 16-bit lane by the immediate count
 * c. */
SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
  const int16x4_t lanes = vreinterpret_s16_s64(a);
  return vreinterpret_s64_s16(vshr_n_s16(lanes, c));
}
| 532 | |
/* Shifts each 32-bit lane left by the immediate count c. */
SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
  const uint32x2_t lanes = vreinterpret_u32_s64(a);
  return vreinterpret_s64_u32(vshl_n_u32(lanes, c));
}
| 536 | |
/* Logical right shift of each unsigned 32-bit lane by the immediate count
 * c. */
SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
  const uint32x2_t lanes = vreinterpret_u32_s64(a);
  return vreinterpret_s64_u32(vshr_n_u32(lanes, c));
}
| 540 | |
/* Arithmetic right shift of each signed 32-bit lane by the immediate count
 * c. */
SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
  const int32x2_t lanes = vreinterpret_s32_s64(a);
  return vreinterpret_s64_s32(vshr_n_s32(lanes, c));
}
| 544 | |
| 545 | #else |
| 546 | |
/* Fallback: shifts the whole 64-bit vector left by c bytes using scalar
 * arithmetic. */
SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
  const uint64_t bits = v64_u64(a);
  return v64_from_64(bits << c * 8);
}
| 550 | |
/* Fallback: shifts the whole 64-bit vector right by c bytes using scalar
 * arithmetic. */
SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
  const uint64_t bits = v64_u64(a);
  return v64_from_64(bits >> c * 8);
}
| 554 | |
/* Fallback: forwards to the run-time-count variant v64_shl_8(). */
SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
  const v64 shifted = v64_shl_8(a, c);
  return shifted;
}
Steinar Midtskogen | a5f8ea1 | 2016-08-03 13:17:33 +0200 | [diff] [blame] | 556 | |
/* Fallback: forwards to the run-time-count variant v64_shr_u8(). */
SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
  const v64 shifted = v64_shr_u8(a, c);
  return shifted;
}
Steinar Midtskogen | a5f8ea1 | 2016-08-03 13:17:33 +0200 | [diff] [blame] | 558 | |
/* Fallback: forwards to the run-time-count variant v64_shr_s8(). */
SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
  const v64 shifted = v64_shr_s8(a, c);
  return shifted;
}
Steinar Midtskogen | a5f8ea1 | 2016-08-03 13:17:33 +0200 | [diff] [blame] | 560 | |
/* Fallback: forwards to the run-time-count variant v64_shl_16(). */
SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
  const v64 shifted = v64_shl_16(a, c);
  return shifted;
}
Steinar Midtskogen | a5f8ea1 | 2016-08-03 13:17:33 +0200 | [diff] [blame] | 562 | |
/* Fallback: forwards to the run-time-count variant v64_shr_u16(). */
SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
  const v64 shifted = v64_shr_u16(a, c);
  return shifted;
}
| 566 | |
/* Fallback: forwards to the run-time-count variant v64_shr_s16(). */
SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
  const v64 shifted = v64_shr_s16(a, c);
  return shifted;
}
| 570 | |
/* Fallback: forwards to the run-time-count variant v64_shl_32(). */
SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
  const v64 shifted = v64_shl_32(a, c);
  return shifted;
}
Steinar Midtskogen | a5f8ea1 | 2016-08-03 13:17:33 +0200 | [diff] [blame] | 572 | |
/* Fallback: forwards to the run-time-count variant v64_shr_u32(). */
SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
  const v64 shifted = v64_shr_u32(a, c);
  return shifted;
}
| 576 | |
/* Fallback: forwards to the run-time-count variant v64_shr_s32(). */
SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
  const v64 shifted = v64_shr_s32(a, c);
  return shifted;
}
| 580 | |
| 581 | #endif |
| 582 | |
| 583 | #endif /* _V64_INTRINSICS_H */ |