/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V64_INTRINSICS_H
#define _V64_INTRINSICS_H

#include <arm_neon.h>
#include "./v64_intrinsics_arm.h"
#include "aom_ports/arm.h"

#ifdef AOM_INCOMPATIBLE_GCC
#error Incompatible gcc
#endif

typedef int64x1_t v64;

SIMD_INLINE uint32_t v64_low_u32(v64 a) {
  return vget_lane_u32(vreinterpret_u32_s64(a), 0);
}

SIMD_INLINE uint32_t v64_high_u32(v64 a) {
  return vget_lane_u32(vreinterpret_u32_s64(a), 1);
}

SIMD_INLINE int32_t v64_low_s32(v64 a) {
  return vget_lane_s32(vreinterpret_s32_s64(a), 0);
}

SIMD_INLINE int32_t v64_high_s32(v64 a) {
  return vget_lane_s32(vreinterpret_s32_s64(a), 1);
}

SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
  return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 |
                     d);
}

SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
  return vcreate_s64((uint64_t)x << 32 | y);
}

SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }

SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }

SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
  return *((uint32_t *)p);
}

SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
  return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 0);
}

SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
  *((uint32_t *)p) = a;
}

SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
#if defined(__clang__)
  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
                0);
#elif defined(__CC_ARM)
  *((__packed uint32_t *)p) = a;
#elif defined(__GNUC__)
  *((__attribute((packed)) uint32_t *)p) = a;
#else
  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
                0);
#endif
}

SIMD_INLINE v64 v64_load_aligned(const void *p) {
  return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p));
}

SIMD_INLINE v64 v64_load_unaligned(const void *p) {
  return v64_load_aligned(p);
}

SIMD_INLINE void v64_store_aligned(void *p, v64 r) {
  vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
}

SIMD_INLINE void v64_store_unaligned(void *p, v64 r) {
  vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
}

// The following function requires an immediate.
// Some compilers will check this when optimising, others won't.
SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
  return c ? vreinterpret_s64_s8(
                 vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
           : b;
#else
  return c ? v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8))
           : b;
#endif
}
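
// Usage sketch (illustrative only; the byte pointer 'p' and the shift of 2
// are hypothetical): extract the 8 bytes starting two bytes into a 16-byte
// window. The shift count must be a compile-time constant in the range 0..7.
//
//   v64 lo = v64_load_unaligned(p);      // bytes p[0..7]
//   v64 hi = v64_load_unaligned(p + 8);  // bytes p[8..15]
//   v64 w = v64_align(hi, lo, 2);        // bytes p[2..9]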

SIMD_INLINE v64 v64_zero() { return vreinterpret_s64_u8(vdup_n_u8(0)); }

SIMD_INLINE v64 v64_dup_8(uint8_t x) {
  return vreinterpret_s64_u8(vdup_n_u8(x));
}

SIMD_INLINE v64 v64_dup_16(uint16_t x) {
  return vreinterpret_s64_u16(vdup_n_u16(x));
}

SIMD_INLINE v64 v64_dup_32(uint32_t x) {
  return vreinterpret_s64_u32(vdup_n_u32(x));
}

SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
  int64x2_t r = vpaddlq_s32(vpaddlq_s16(
      vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
                vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))))));
  return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
}

SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
  int64x2_t r =
      vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
  return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
}

SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
  return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
}

SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
  return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
}

typedef uint16x8_t sad64_internal;

SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); }

/* Implementation dependent return value. Result must be finalised with
   v64_sad_u8_sum().
   The result for more than 32 v64_sad_u8() calls is undefined. */
SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
  return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
}

SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
  uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
  return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
}

typedef int64x1_t ssd64_internal;

SIMD_INLINE ssd64_internal v64_ssd_u8_init() {
  return (ssd64_internal)(uint64_t)0;
}

/* Implementation dependent return value. Result must be finalised with
 * v64_ssd_u8_sum(). */
SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
  uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
  uint64x2_t r = vpaddlq_u32(vpaddlq_u16(vmull_u8(t, t)));
  return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r)));
}

SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
  return (uint32_t)(uint64_t)s;
}
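
/* Usage sketch (illustrative only; src and ref are hypothetical byte
   pointers): accumulate the squared error of one 8-byte row, then finalise.

     ssd64_internal acc = v64_ssd_u8_init();
     acc = v64_ssd_u8(acc, v64_load_unaligned(src), v64_load_unaligned(ref));
     uint32_t ssd = v64_ssd_u8_sum(acc);
*/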

SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }

SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); }

SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); }

SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); }

SIMD_INLINE v64 v64_add_8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_add_16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_add_32(v64 x, v64 y) {
  return vreinterpret_s64_u32(
      vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y)));
}

SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vqsub_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
}

SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) {
  return vreinterpret_s64_s8(
      vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) {
  return vreinterpret_s64_s32(
      vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
}

SIMD_INLINE v64 v64_abs_s16(v64 x) {
  return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x)));
}

SIMD_INLINE v64 v64_abs_s8(v64 x) {
  return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x)));
}

SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32(
      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16)));
}

SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) {
  return vreinterpret_s64_s32(
      vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
}

SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) {
  int32x4_t t = vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y));
  return vreinterpret_s64_s32(
      vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))),
                vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t)))));
}

SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) {
  return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(
      vaddq_s16(vmull_s8(vadd_s8(vreinterpret_s8_s64(x), vdup_n_s8(-128)),
                         vreinterpret_s8_s64(y)),
                vshlq_n_s16(vmovl_s8(vreinterpret_s8_s64(y)), 7)))));
}

SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
}

SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) {
  return vreinterpret_s64_s8(
      vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) {
  return vreinterpret_s64_s8(
      vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) {
  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpret_s64_u8(r.val[0]);
}

SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) {
  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpret_s64_u8(r.val[1]);
}

SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) {
  int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
  return vreinterpret_s64_s16(r.val[0]);
}

SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) {
  int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
  return vreinterpret_s64_s16(r.val[1]);
}

SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) {
  int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
  return vreinterpret_s64_s32(r.val[0]);
}

SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) {
  int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
  return vreinterpret_s64_s32(r.val[1]);
}

SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
  return vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a))));
}

SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
  return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a))));
}

SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
  return vreinterpret_s64_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s64(a))));
}

SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
  return vreinterpret_s64_s16(vget_high_s16(vmovl_s8(vreinterpret_s8_s64(a))));
}

SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(vqmovn_s32(
      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
}

SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32(
      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
}

SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) {
  return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32(
      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
}

SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) {
  uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpret_s64_u8(r.val[0]);
}

SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) {
  uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpret_s64_u8(r.val[1]);
}

SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) {
  uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
  return vreinterpret_s64_u16(r.val[0]);
}

SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) {
  uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
  return vreinterpret_s64_u16(r.val[1]);
}

SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) {
  return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x))));
}

SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) {
  return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x))));
}

SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) {
  return vreinterpret_s64_s32(
      vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x))));
}

SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) {
  return vreinterpret_s64_u32(
      vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x))));
}

SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
  return vreinterpret_s64_u8(
      vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern)));
}

SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vclt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
  return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(c)));
}

SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
  return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-c)));
}

SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
  return vreinterpret_s64_s8(vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-c)));
}

SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
  return vreinterpret_s64_u16(vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(c)));
}

SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
  return vreinterpret_s64_u16(
      vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(-(int)c)));
}

SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
  return vreinterpret_s64_s16(
      vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int)c)));
}

SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
  return vreinterpret_s64_u32(vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(c)));
}

SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
  return vreinterpret_s64_u32(
      vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int)c)));
}

SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
  return vreinterpret_s64_s32(
      vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c)));
}

// The following functions require an immediate.
// Some compilers will check this during optimisation, others won't.
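//
// Usage sketch (illustrative only; 'a' and 'bits' are hypothetical): the
// shift count must be a literal constant so that the vshl_n_* / vshr_n_*
// forms below can encode it as an instruction immediate.
//
//   v64 t = v64_shr_n_u16(a, 2);   // fine: 2 is a compile-time constant
//   v64 u = v64_shr_u16(a, bits);  // use the non-_n form for a runtime count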
#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)

SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
  return vshl_n_s64(a, c * 8);
}

SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
  return c ? (v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a;
}

SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
  return vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
  return vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
  return vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c));
}

SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
  return vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
  return vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
  return vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c));
}

SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
  return vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
  return vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
  return vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c));
}

#else

SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
  return v64_from_64(v64_u64(a) << c * 8);
}

SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
  return v64_from_64(v64_u64(a) >> c * 8);
}

SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { return v64_shl_8(a, c); }

SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { return v64_shr_u8(a, c); }

SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { return v64_shr_s8(a, c); }

SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { return v64_shl_16(a, c); }

SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
  return v64_shr_u16(a, c);
}

SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
  return v64_shr_s16(a, c);
}

SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { return v64_shl_32(a, c); }

SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
  return v64_shr_u32(a, c);
}

SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
  return v64_shr_s32(a, c);
}

#endif

#endif /* _V64_INTRINSICS_H */