/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V64_INTRINSICS_C_H
#define _V64_INTRINSICS_C_H

/* Note: This implements the intrinsics in plain, unoptimised C.
   Intended for reference, porting or debugging. */

#include <stdio.h>
#include <stdlib.h>

#include "config/aom_config.h"

typedef union {
  uint8_t u8[8];
  uint16_t u16[4];
  uint32_t u32[2];
  uint64_t u64;
  int8_t s8[8];
  int16_t s16[4];
  int32_t s32[2];
  int64_t s64;
} c_v64;
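
/* Note on lane indexing (added comment): the CONFIG_BIG_ENDIAN index below
   selects the 32-bit lane holding the numerically low half of the 64-bit
   value: lane 0 on little-endian hosts, lane 1 on big-endian hosts.  The same
   indexing trick keeps lane numbering byte-order independent throughout this
   file. */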
SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { return a.u32[CONFIG_BIG_ENDIAN]; }

SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) {
  return a.u32[!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { return a.s32[CONFIG_BIG_ENDIAN]; }

SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
  return a.s32[!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) {
  c_v64 t;
  t.u32[!CONFIG_BIG_ENDIAN] = x;
  t.u32[CONFIG_BIG_ENDIAN] = y;
  return t;
}

SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) {
  c_v64 t;
  t.u64 = x;
  return t;
}

SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; }

SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c,
                                uint16_t d) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    t.u16[0] = a;
    t.u16[1] = b;
    t.u16[2] = c;
    t.u16[3] = d;
  } else {
    t.u16[3] = a;
    t.u16[2] = b;
    t.u16[1] = c;
    t.u16[0] = d;
  }
  return t;
}

SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) {
  uint32_t t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 4; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) {
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&a;
  int c;
  for (c = 0; c < 4; c++) pp[c] = q[c];
}

SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 3) {
    fprintf(stderr, "Error: Unaligned u32 load at %p\n", p);
    abort();
  }
  return c_u32_load_unaligned(p);
}

SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) {
  if (SIMD_CHECK && (uintptr_t)p & 3) {
    fprintf(stderr, "Error: Unaligned u32 store at %p\n", p);
    abort();
  }
  c_u32_store_unaligned(p, a);
}

SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) {
  c_v64 t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 8; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 7) {
    fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p);
    abort();
  }
  return c_v64_load_unaligned(p);
}

SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) {
  uint8_t *q = (uint8_t *)p;
  uint8_t *r = (uint8_t *)&a;
  int c;
  for (c = 0; c < 8; c++) q[c] = r[c];
}

SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) {
  if (SIMD_CHECK && (uintptr_t)p & 7) {
    fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p);
    abort();
  }
  c_v64_store_unaligned(p, a);
}

SIMD_INLINE c_v64 c_v64_zero() {
  c_v64 t;
  t.u64 = 0;
  return t;
}

SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) {
  c_v64 t;
  t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] =
      t.u8[7] = x;
  return t;
}

SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) {
  c_v64 t;
  t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x;
  return t;
}

SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) {
  c_v64 t;
  t.u32[0] = t.u32[1] = x;
  return t;
}

SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] + b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] + b.u16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.s16[c] = (int32_t)a.s16[c] + (int32_t)b.s16[c] > 32767
                   ? 32767
                   : (int32_t)a.s16[c] + (int32_t)b.s16[c] < -32768
                         ? -32768
                         : (int32_t)a.s16[c] + (int32_t)b.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]);
  t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]);
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] - b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++)
    t.u8[c] = (int32_t)a.u8[c] - (int32_t)b.u8[c] < 0 ? 0 : a.u8[c] - b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) {
    int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c];
    t.s8[c] = d > 127 ? 127 : (d < -128 ? -128 : d);
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] - b.u16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.s16[c] = (int32_t)a.s16[c] - (int32_t)b.s16[c] < -32768
                   ? -32768
                   : (int32_t)a.s16[c] - (int32_t)b.s16[c] > 32767
                         ? 32767
                         : (int32_t)a.s16[c] - (int32_t)b.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.u16[c] =
        (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 0 : a.u16[c] - b.u16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]);
  t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]);
  return t;
}

SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.u16[c] = (int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c];
  return t;
}
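
/* Internal helper (added comment): mode != 0 interleaves the high halves of a
   and b, mode == 0 the low halves.  The ziplo/ziphi wrappers below choose the
   half and the argument order according to CONFIG_BIG_ENDIAN. */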
SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u8[7] = a.u8[7];
    t.u8[6] = b.u8[7];
    t.u8[5] = a.u8[6];
    t.u8[4] = b.u8[6];
    t.u8[3] = a.u8[5];
    t.u8[2] = b.u8[5];
    t.u8[1] = a.u8[4];
    t.u8[0] = b.u8[4];
  } else {
    t.u8[7] = a.u8[3];
    t.u8[6] = b.u8[3];
    t.u8[5] = a.u8[2];
    t.u8[4] = b.u8[2];
    t.u8[3] = a.u8[1];
    t.u8[2] = b.u8[1];
    t.u8[1] = a.u8[0];
    t.u8[0] = b.u8[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u16[3] = a.u16[3];
    t.u16[2] = b.u16[3];
    t.u16[1] = a.u16[2];
    t.u16[0] = b.u16[2];
  } else {
    t.u16[3] = a.u16[1];
    t.u16[2] = b.u16[1];
    t.u16[1] = a.u16[0];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u32[1] = a.u32[1];
    t.u32[0] = b.u32[1];
  } else {
    t.u32[1] = a.u32[0];
    t.u32[0] = b.u32[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u8[7] = b.u8[7];
    t.u8[6] = b.u8[5];
    t.u8[5] = b.u8[3];
    t.u8[4] = b.u8[1];
    t.u8[3] = a.u8[7];
    t.u8[2] = a.u8[5];
    t.u8[1] = a.u8[3];
    t.u8[0] = a.u8[1];
  } else {
    t.u8[7] = a.u8[6];
    t.u8[6] = a.u8[4];
    t.u8[5] = a.u8[2];
    t.u8[4] = a.u8[0];
    t.u8[3] = b.u8[6];
    t.u8[2] = b.u8[4];
    t.u8[1] = b.u8[2];
    t.u8[0] = b.u8[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1);
}

SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u16[3] = b.u16[3];
    t.u16[2] = b.u16[1];
    t.u16[1] = a.u16[3];
    t.u16[0] = a.u16[1];
  } else {
    t.u16[3] = a.u16[2];
    t.u16[2] = a.u16[0];
    t.u16[1] = b.u16[2];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1)
                           : _c_v64_unzip_16(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(b, a, 0)
                           : _c_v64_unzip_16(b, a, 1);
}

SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.u8[3 + endian];
  t.s16[2] = (int16_t)a.u8[2 + endian];
  t.s16[1] = (int16_t)a.u8[1 + endian];
  t.s16[0] = (int16_t)a.u8[0 + endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.u8[7 - endian];
  t.s16[2] = (int16_t)a.u8[6 - endian];
  t.s16[1] = (int16_t)a.u8[5 - endian];
  t.s16[0] = (int16_t)a.u8[4 - endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.s8[3 + endian];
  t.s16[2] = (int16_t)a.s8[2 + endian];
  t.s16[1] = (int16_t)a.s8[1 + endian];
  t.s16[0] = (int16_t)a.s8[0 + endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.s8[7 - endian];
  t.s16[2] = (int16_t)a.s8[6 - endian];
  t.s16[1] = (int16_t)a.s8[5 - endian];
  t.s16[0] = (int16_t)a.s8[4 - endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.s16[3] = a.s32[1] > 32767 ? 32767 : a.s32[1] < -32768 ? -32768 : a.s32[1];
  t.s16[2] = a.s32[0] > 32767 ? 32767 : a.s32[0] < -32768 ? -32768 : a.s32[0];
  t.s16[1] = b.s32[1] > 32767 ? 32767 : b.s32[1] < -32768 ? -32768 : b.s32[1];
  t.s16[0] = b.s32[0] > 32767 ? 32767 : b.s32[0] < -32768 ? -32768 : b.s32[0];
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.u8[7] = a.s16[3] > 255 ? 255 : a.s16[3] < 0 ? 0 : a.s16[3];
  t.u8[6] = a.s16[2] > 255 ? 255 : a.s16[2] < 0 ? 0 : a.s16[2];
  t.u8[5] = a.s16[1] > 255 ? 255 : a.s16[1] < 0 ? 0 : a.s16[1];
  t.u8[4] = a.s16[0] > 255 ? 255 : a.s16[0] < 0 ? 0 : a.s16[0];
  t.u8[3] = b.s16[3] > 255 ? 255 : b.s16[3] < 0 ? 0 : b.s16[3];
  t.u8[2] = b.s16[2] > 255 ? 255 : b.s16[2] < 0 ? 0 : b.s16[2];
  t.u8[1] = b.s16[1] > 255 ? 255 : b.s16[1] < 0 ? 0 : b.s16[1];
  t.u8[0] = b.s16[0] > 255 ? 255 : b.s16[0] < 0 ? 0 : b.s16[0];
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.u8[7] = a.s16[3] > 127 ? 127 : a.s16[3] < -128 ? 128 : a.s16[3];
  t.u8[6] = a.s16[2] > 127 ? 127 : a.s16[2] < -128 ? 128 : a.s16[2];
  t.u8[5] = a.s16[1] > 127 ? 127 : a.s16[1] < -128 ? 128 : a.s16[1];
  t.u8[4] = a.s16[0] > 127 ? 127 : a.s16[0] < -128 ? 128 : a.s16[0];
  t.u8[3] = b.s16[3] > 127 ? 127 : b.s16[3] < -128 ? 128 : b.s16[3];
  t.u8[2] = b.s16[2] > 127 ? 127 : b.s16[2] < -128 ? 128 : b.s16[2];
  t.u8[1] = b.s16[1] > 127 ? 127 : b.s16[1] < -128 ? 128 : b.s16[1];
  t.u8[0] = b.s16[0] > 127 ? 127 : b.s16[0] < -128 ? 128 : b.s16[0];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) {
    if (SIMD_CHECK && (pattern.u8[c] & ~7)) {
      fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n",
              pattern.u8[c], c);
      abort();
    }
    t.u8[c] =
        a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7];
  }
  return t;
}

SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) {
  return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] +
         a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] +
         a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0];
}

SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) {
  return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) +
         (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]);
}

SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) {
  return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] +
         a.u8[0];
}

SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) {
  return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0];
}

typedef uint32_t c_sad64_internal;

/* Implementation dependent return value. Result must be finalised with
   v64_sad_u8_sum().
   The result for more than 32 v64_sad_u8() calls is undefined. */
SIMD_INLINE c_sad64_internal c_v64_sad_u8_init() { return 0; }

SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a,
                                          c_v64 b) {
  int c;
  for (c = 0; c < 8; c++)
    s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
  return s;
}

SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s; }
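
/* Illustrative usage sketch, not part of the original API: summing the SAD of
   an 8x8 block of bytes with the init/accumulate/sum protocol described
   above.  The function name and the stride parameters are assumptions made
   for this example; its eight v64_sad_u8() calls stay well within the
   32-call limit. */
SIMD_INLINE uint32_t c_v64_example_sad_8x8(const uint8_t *src, int src_stride,
                                           const uint8_t *ref,
                                           int ref_stride) {
  c_sad64_internal s = c_v64_sad_u8_init();
  int r;
  for (r = 0; r < 8; r++) /* one 8-byte row per accumulate call */
    s = c_v64_sad_u8(s, c_v64_load_unaligned(src + r * src_stride),
                     c_v64_load_unaligned(ref + r * ref_stride));
  return c_v64_sad_u8_sum(s);
}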

typedef uint32_t c_ssd64_internal;

/* Implementation dependent return value. Result must be finalised with
 * v64_ssd_u8_sum(). */
SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init() { return 0; }

SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a,
                                          c_v64 b) {
  int c;
  for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
  return s;
}

SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; }
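
/* Illustrative usage sketch, not part of the original API: squared error
   between two 8-byte rows, using the same init/accumulate/sum pattern as the
   SAD accumulator above.  The helper name is an assumption made for this
   example. */
SIMD_INLINE uint32_t c_v64_example_ssd_row(const uint8_t *a,
                                           const uint8_t *b) {
  c_ssd64_internal s = c_v64_ssd_u8_init();
  s = c_v64_ssd_u8(s, c_v64_load_unaligned(a), c_v64_load_unaligned(b));
  return c_v64_ssd_u8_sum(s);
}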

SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 | b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 ^ b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 & b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 & ~b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16;
  return t;
}

SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]);
  t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]);
  return t;
}

SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1];
  t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3];
  return t;
}

SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) {
  c_v64 t;
  int32_t u;
  u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1];
  t.s16[0] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
  u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3];
  t.s16[1] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
  u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5];
  t.s16[2] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
  u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7];
  t.s16[3] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
  return t;
}

SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? a.s16[c] : b.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 7) {
    fprintf(stderr, "Error: Undefined u8 shift left %d\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.s8[c] = a.u8[c] << n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 7) {
    fprintf(stderr, "Error: Undefined u8 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 7) {
    fprintf(stderr, "Error: Undefined s8 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 15) {
    fprintf(stderr, "Error: Undefined u16 shift left %d\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] << n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 15) {
    fprintf(stderr, "Error: Undefined u16 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 15) {
    fprintf(stderr, "Error: undefined s16 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (SIMD_CHECK && n > 31) {
    fprintf(stderr, "Error: undefined u32 shift left %d\n", n);
    abort();
  }
  t.u32[1] = a.u32[1] << n;
  t.u32[0] = a.u32[0] << n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (SIMD_CHECK && n > 31) {
    fprintf(stderr, "Error: undefined u32 shift right %d\n", n);
    abort();
  }
  t.u32[1] = a.u32[1] >> n;
  t.u32[0] = a.u32[0] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (SIMD_CHECK && n > 31) {
    fprintf(stderr, "Error: undefined s32 shift right %d\n", n);
    abort();
  }
  t.s32[1] = a.s32[1] >> n;
  t.s32[0] = a.s32[0] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) {
  c_v64 t;
  t.u64 = x.u64 >> i * 8;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) {
  c_v64 t;
  t.u64 = x.u64 << i * 8;
  return t;
}

SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) {
  if (SIMD_CHECK && c > 7) {
    fprintf(stderr, "Error: undefined alignment %d\n", c);
    abort();
  }
  return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b;
}

SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) {
  return c_v64_shl_8(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) {
  return c_v64_shr_u8(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) {
  return c_v64_shr_s8(a, c);
}

SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) {
  return c_v64_shl_16(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) {
  return c_v64_shr_u16(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) {
  return c_v64_shr_s16(a, c);
}

SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) {
  return c_v64_shl_32(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) {
  return c_v64_shr_u32(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) {
  return c_v64_shr_s32(a, c);
}

#endif /* _V64_INTRINSICS_C_H */