/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V64_INTRINSICS_C_H
#define _V64_INTRINSICS_C_H

/* Note: This implements the intrinsics in plain, unoptimised C.
   Intended for reference, porting or debugging. */
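
/* For orientation, a minimal usage sketch (illustrative only, not part of
   the API): the c_v64 type and the c_v64_* functions defined below can be
   exercised directly from plain C, e.g.

     c_v64 a = c_v64_dup_16(100);          // all four 16-bit lanes = 100
     c_v64 b = c_v64_from_16(1, 2, 3, 4);  // lane order depends on endianness
     c_v64 s = c_v64_add_16(a, b);         // lane-wise 16-bit addition
     uint64_t raw = c_v64_u64(s);          // raw 64-bit contents

   In normal use these are typically reached through the corresponding v64_*
   wrappers of the SIMD abstraction layer rather than called directly. */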

#include <stdio.h>
#include <stdlib.h>
#include "./aom_config.h"

typedef union {
  uint8_t u8[8];
  uint16_t u16[4];
  uint32_t u32[2];
  uint64_t u64;
  int8_t s8[8];
  int16_t s16[4];
  int32_t s32[2];
  int64_t s64;
} c_v64;

SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { return a.u32[CONFIG_BIG_ENDIAN]; }

SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) {
  return a.u32[!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { return a.s32[CONFIG_BIG_ENDIAN]; }

SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
  return a.s32[!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) {
  c_v64 t;
  t.u32[!CONFIG_BIG_ENDIAN] = x;
  t.u32[CONFIG_BIG_ENDIAN] = y;
  return t;
}

SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) {
  c_v64 t;
  t.u64 = x;
  return t;
}

SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; }

SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c,
                                uint16_t d) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    t.u16[0] = a;
    t.u16[1] = b;
    t.u16[2] = c;
    t.u16[3] = d;
  } else {
    t.u16[3] = a;
    t.u16[2] = b;
    t.u16[1] = c;
    t.u16[0] = d;
  }
  return t;
}

SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) {
  uint32_t t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 4; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) {
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&a;
  int c;
  for (c = 0; c < 4; c++) pp[c] = q[c];
}

SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 3) {
    fprintf(stderr, "Error: Unaligned u32 load at %p\n", p);
    abort();
  }
  return c_u32_load_unaligned(p);
}

SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) {
  if (SIMD_CHECK && (uintptr_t)p & 3) {
    fprintf(stderr, "Error: Unaligned u32 store at %p\n", p);
    abort();
  }
  c_u32_store_unaligned(p, a);
}

SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) {
  c_v64 t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 8; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 7) {
    fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p);
    abort();
  }
  return c_v64_load_unaligned(p);
}

SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) {
  uint8_t *q = (uint8_t *)p;
  uint8_t *r = (uint8_t *)&a;
  int c;
  for (c = 0; c < 8; c++) q[c] = r[c];
}

SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) {
  if (SIMD_CHECK && (uintptr_t)p & 7) {
    fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p);
    abort();
  }
  c_v64_store_unaligned(p, a);
}

SIMD_INLINE c_v64 c_v64_zero() {
  c_v64 t;
  t.u64 = 0;
  return t;
}

SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) {
  c_v64 t;
  t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] =
      t.u8[7] = x;
  return t;
}

SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) {
  c_v64 t;
  t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x;
  return t;
}

SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) {
  c_v64 t;
  t.u32[0] = t.u32[1] = x;
  return t;
}

SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] + b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] + b.u16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.s16[c] = (int32_t)a.s16[c] + (int32_t)b.s16[c] > 32767
                   ? 32767
                   : (int32_t)a.s16[c] + (int32_t)b.s16[c] < -32768
                         ? -32768
                         : (int32_t)a.s16[c] + (int32_t)b.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]);
  t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]);
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] - b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++)
    t.u8[c] = (int32_t)a.u8[c] - (int32_t)b.u8[c] < 0 ? 0 : a.u8[c] - b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) {
    int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c];
    t.s8[c] = d > 127 ? 127 : (d < -128 ? -128 : d);
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] - b.u16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.s16[c] = (int32_t)a.s16[c] - (int32_t)b.s16[c] < -32768
                   ? -32768
                   : (int32_t)a.s16[c] - (int32_t)b.s16[c] > 32767
                         ? 32767
                         : (int32_t)a.s16[c] - (int32_t)b.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]);
  t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]);
  return t;
}

SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.u16[c] = (int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c];
  return t;
}

SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u8[7] = a.u8[7];
    t.u8[6] = b.u8[7];
    t.u8[5] = a.u8[6];
    t.u8[4] = b.u8[6];
    t.u8[3] = a.u8[5];
    t.u8[2] = b.u8[5];
    t.u8[1] = a.u8[4];
    t.u8[0] = b.u8[4];
  } else {
    t.u8[7] = a.u8[3];
    t.u8[6] = b.u8[3];
    t.u8[5] = a.u8[2];
    t.u8[4] = b.u8[2];
    t.u8[3] = a.u8[1];
    t.u8[2] = b.u8[1];
    t.u8[1] = a.u8[0];
    t.u8[0] = b.u8[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u16[3] = a.u16[3];
    t.u16[2] = b.u16[3];
    t.u16[1] = a.u16[2];
    t.u16[0] = b.u16[2];
  } else {
    t.u16[3] = a.u16[1];
    t.u16[2] = b.u16[1];
    t.u16[1] = a.u16[0];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u32[1] = a.u32[1];
    t.u32[0] = b.u32[1];
  } else {
    t.u32[1] = a.u32[0];
    t.u32[0] = b.u32[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u8[7] = b.u8[7];
    t.u8[6] = b.u8[5];
    t.u8[5] = b.u8[3];
    t.u8[4] = b.u8[1];
    t.u8[3] = a.u8[7];
    t.u8[2] = a.u8[5];
    t.u8[1] = a.u8[3];
    t.u8[0] = a.u8[1];
  } else {
    t.u8[7] = a.u8[6];
    t.u8[6] = a.u8[4];
    t.u8[5] = a.u8[2];
    t.u8[4] = a.u8[0];
    t.u8[3] = b.u8[6];
    t.u8[2] = b.u8[4];
    t.u8[1] = b.u8[2];
    t.u8[0] = b.u8[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1);
}

SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u16[3] = b.u16[3];
    t.u16[2] = b.u16[1];
    t.u16[1] = a.u16[3];
    t.u16[0] = a.u16[1];
  } else {
    t.u16[3] = a.u16[2];
    t.u16[2] = a.u16[0];
    t.u16[1] = b.u16[2];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1)
                           : _c_v64_unzip_16(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(b, a, 0)
                           : _c_v64_unzip_16(b, a, 1);
}

SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.u8[3 + endian];
  t.s16[2] = (int16_t)a.u8[2 + endian];
  t.s16[1] = (int16_t)a.u8[1 + endian];
  t.s16[0] = (int16_t)a.u8[0 + endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.u8[7 - endian];
  t.s16[2] = (int16_t)a.u8[6 - endian];
  t.s16[1] = (int16_t)a.u8[5 - endian];
  t.s16[0] = (int16_t)a.u8[4 - endian];
  return t;
}
SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.s16[3] = a.s32[1] > 32767 ? 32767 : a.s32[1] < -32768 ? -32768 : a.s32[1];
  t.s16[2] = a.s32[0] > 32767 ? 32767 : a.s32[0] < -32768 ? -32768 : a.s32[0];
  t.s16[1] = b.s32[1] > 32767 ? 32767 : b.s32[1] < -32768 ? -32768 : b.s32[1];
  t.s16[0] = b.s32[0] > 32767 ? 32767 : b.s32[0] < -32768 ? -32768 : b.s32[0];
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.u8[7] = a.s16[3] > 255 ? 255 : a.s16[3] < 0 ? 0 : a.s16[3];
  t.u8[6] = a.s16[2] > 255 ? 255 : a.s16[2] < 0 ? 0 : a.s16[2];
  t.u8[5] = a.s16[1] > 255 ? 255 : a.s16[1] < 0 ? 0 : a.s16[1];
  t.u8[4] = a.s16[0] > 255 ? 255 : a.s16[0] < 0 ? 0 : a.s16[0];
  t.u8[3] = b.s16[3] > 255 ? 255 : b.s16[3] < 0 ? 0 : b.s16[3];
  t.u8[2] = b.s16[2] > 255 ? 255 : b.s16[2] < 0 ? 0 : b.s16[2];
  t.u8[1] = b.s16[1] > 255 ? 255 : b.s16[1] < 0 ? 0 : b.s16[1];
  t.u8[0] = b.s16[0] > 255 ? 255 : b.s16[0] < 0 ? 0 : b.s16[0];
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.u8[7] = a.s16[3] > 127 ? 127 : a.s16[3] < -128 ? 128 : a.s16[3];
  t.u8[6] = a.s16[2] > 127 ? 127 : a.s16[2] < -128 ? 128 : a.s16[2];
  t.u8[5] = a.s16[1] > 127 ? 127 : a.s16[1] < -128 ? 128 : a.s16[1];
  t.u8[4] = a.s16[0] > 127 ? 127 : a.s16[0] < -128 ? 128 : a.s16[0];
  t.u8[3] = b.s16[3] > 127 ? 127 : b.s16[3] < -128 ? 128 : b.s16[3];
  t.u8[2] = b.s16[2] > 127 ? 127 : b.s16[2] < -128 ? 128 : b.s16[2];
  t.u8[1] = b.s16[1] > 127 ? 127 : b.s16[1] < -128 ? 128 : b.s16[1];
  t.u8[0] = b.s16[0] > 127 ? 127 : b.s16[0] < -128 ? 128 : b.s16[0];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) {
    if (SIMD_CHECK && (pattern.u8[c] & ~7)) {
      fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n",
              pattern.u8[c], c);
      abort();
    }
    t.u8[c] =
        a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7];
  }
  return t;
}
SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) {
  return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] +
         a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] +
         a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0];
}

SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) {
  return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) +
         (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]);
}

SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) {
  return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] +
         a.u8[0];
}

SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) {
  return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0];
}

typedef uint32_t c_sad64_internal;

/* Implementation dependent return value. Result must be finalised with
   v64_sad_u8_sum().
   The result for more than 32 v64_sad_u8() calls is undefined. */
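/* Usage sketch of the accumulator protocol, for illustration only (row_a and
   row_b are hypothetical pointers to 8-byte blocks of pixels):

     c_sad64_internal acc = c_v64_sad_u8_init();
     acc = c_v64_sad_u8(acc, c_v64_load_unaligned(row_a),
                        c_v64_load_unaligned(row_b));
     uint32_t sad = c_v64_sad_u8_sum(acc);

   Each c_v64_sad_u8() call adds the sum of absolute byte differences of one
   pair of 64-bit vectors to the accumulator. */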
SIMD_INLINE c_sad64_internal c_v64_sad_u8_init() { return 0; }

SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a,
                                          c_v64 b) {
  int c;
  for (c = 0; c < 8; c++)
    s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
  return s;
}

SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s; }

typedef uint32_t c_ssd64_internal;

/* Implementation dependent return value. Result must be finalised with
 * v64_ssd_u8_sum(). */
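/* Usage sketch following the same init/accumulate/sum pattern as above,
   for illustration only (src, ref and the strides are hypothetical):

     c_ssd64_internal acc = c_v64_ssd_u8_init();
     for (int r = 0; r < 8; r++)
       acc = c_v64_ssd_u8(acc, c_v64_load_unaligned(src + r * src_stride),
                          c_v64_load_unaligned(ref + r * ref_stride));
     uint32_t ssd = c_v64_ssd_u8_sum(acc);

   Each c_v64_ssd_u8() call adds the sum of squared byte differences of one
   pair of 64-bit vectors; eight rows of eight bytes stay well within the
   32-bit accumulator. */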
SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init() { return 0; }

SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a,
                                          c_v64 b) {
  int c;
  for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
  return s;
}

SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; }

SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 | b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 ^ b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 & b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 & ~b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16;
  return t;
}

SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]);
  t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]);
  return t;
}

SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1];
  t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3];
  return t;
}

SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) {
  c_v64 t;
  int32_t u;
  u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1];
  t.s16[0] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
  u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3];
  t.s16[1] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
  u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5];
  t.s16[2] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
  u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7];
  t.s16[3] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
  return t;
}

SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? a.s16[c] : b.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 7) {
    fprintf(stderr, "Error: Undefined u8 shift left %d\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.s8[c] = a.u8[c] << n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 7) {
    fprintf(stderr, "Error: Undefined u8 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 7) {
    fprintf(stderr, "Error: Undefined s8 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 15) {
    fprintf(stderr, "Error: Undefined u16 shift left %d\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] << n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 15) {
    fprintf(stderr, "Error: Undefined u16 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 15) {
    fprintf(stderr, "Error: undefined s16 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (SIMD_CHECK && n > 31) {
    fprintf(stderr, "Error: undefined u32 shift left %d\n", n);
    abort();
  }
  t.u32[1] = a.u32[1] << n;
  t.u32[0] = a.u32[0] << n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (SIMD_CHECK && n > 31) {
    fprintf(stderr, "Error: undefined u32 shift right %d\n", n);
    abort();
  }
  t.u32[1] = a.u32[1] >> n;
  t.u32[0] = a.u32[0] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (SIMD_CHECK && n > 31) {
    fprintf(stderr, "Error: undefined s32 shift right %d\n", n);
    abort();
  }
  t.s32[1] = a.s32[1] >> n;
  t.s32[0] = a.s32[0] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, const unsigned int i) {
  c_v64 t;
  t.u64 = x.u64 >> i * 8;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, const unsigned int i) {
  c_v64 t;
  t.u64 = x.u64 << i * 8;
  return t;
}

SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, const unsigned int c) {
  if (SIMD_CHECK && c > 7) {
    fprintf(stderr, "Error: undefined alignment %d\n", c);
    abort();
  }
  return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b;
}

SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, const unsigned int c) {
  return c_v64_shl_8(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, const unsigned int c) {
  return c_v64_shr_u8(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, const unsigned int c) {
  return c_v64_shr_s8(a, c);
}

SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, const unsigned int c) {
  return c_v64_shl_16(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, const unsigned int c) {
  return c_v64_shr_u16(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, const unsigned int c) {
  return c_v64_shr_s16(a, c);
}

SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, const unsigned int c) {
  return c_v64_shl_32(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, const unsigned int c) {
  return c_v64_shr_u32(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, const unsigned int c) {
  return c_v64_shr_s32(a, c);
}

#endif /* _V64_INTRINSICS_C_H */