/*
 * Copyright (c) 2021, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 3-Clause Clear License
 * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
 * License was not distributed with this source code in the LICENSE file, you
 * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
 * Alliance for Open Media Patent License 1.0 was not distributed with this
 * source code in the PATENTS file, you can obtain it at
 * aomedia.org/license/patent-license/.
 */

#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_

#include <stdio.h>
#include <stdlib.h>

#include "config/aom_config.h"

#include "aom_dsp/simd/v128_intrinsics_c.h"

typedef union {
  uint8_t u8[32];
  uint16_t u16[16];
  uint32_t u32[8];
  uint64_t u64[4];
  int8_t s8[32];
  int16_t s16[16];
  int32_t s32[8];
  int64_t s64[4];
  c_v64 v64[4];
  c_v128 v128[2];
} c_v256;

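/* Note on the union layout: in every array view above, index 0 is the least
   significant lane of the 256-bit value; c_v256_low_v128() below returns
   v128[0] and c_v256_from_v128(hi, lo) stores lo in v128[0]. */
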
SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }

SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }

SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; }

SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }

SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }

SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) {
  c_v256 t;
  t.v128[1] = hi;
  t.v128[0] = lo;
  return t;
}

SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c,
                                  uint64_t d) {
  c_v256 t;
  t.u64[3] = a;
  t.u64[2] = b;
  t.u64[1] = c;
  t.u64[0] = d;
  return t;
}

SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) {
  c_v256 t;
  t.u64[3] = a.u64;
  t.u64[2] = b.u64;
  t.u64[1] = c.u64;
  t.u64[0] = d.u64;
  return t;
}

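/* Illustrative note (not part of the original API docs): the from_*
   constructors take the most significant lane first, so

     c_v256 x = c_v256_from_64(3, 2, 1, 0);

   gives x.u64[3] == 3, x.u64[2] == 2, x.u64[1] == 1 and x.u64[0] == 0,
   i.e. the last argument is the low 64 bits returned by c_v256_low_u64(x). */
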
SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) {
  c_v256 t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 32; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 31) {
    fprintf(stderr, "Error: unaligned v256 load at %p\n", p);
    abort();
  }
  return c_v256_load_unaligned(p);
}

SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) {
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&a;
  int c;
  for (c = 0; c < 32; c++) pp[c] = q[c];
}

SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {
  if (SIMD_CHECK && (uintptr_t)p & 31) {
    fprintf(stderr, "Error: unaligned v256 store at %p\n", p);
    abort();
  }
  c_v256_store_unaligned(p, a);
}

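/* Usage sketch (illustrative only; the 32-byte aligned buffer and the
   alignment attribute are assumptions, not part of this header): the aligned
   variants differ from the unaligned ones only by the SIMD_CHECK alignment
   assertion, which aborts on a misaligned pointer.

     uint8_t buf[32] __attribute__((aligned(32)));  // GCC/Clang specific
     c_v256_store_aligned(buf, c_v256_dup_8(0x80));
     c_v256 v = c_v256_load_aligned(buf);  // reads back the same 32 bytes
*/
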
SIMD_INLINE c_v256 c_v256_zero() {
  c_v256 t;
  t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0;
  return t;
}

SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) {
  c_v256 t;
  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x);
  return t;
}

SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) {
  c_v256 t;
  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x);
  return t;
}

SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
  c_v256 t;
  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x);
  return t;
}

SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) {
  c_v256 t;
  t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = x;
  return t;
}

SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) {
  return c_v128_dotp_su8(a.v128[1], b.v128[1]) +
         c_v128_dotp_su8(a.v128[0], b.v128[0]);
}

SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
  return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
         c_v128_dotp_s16(a.v128[0], b.v128[0]);
}

SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) {
  return c_v128_dotp_s32(a.v128[1], b.v128[1]) +
         c_v128_dotp_s32(a.v128[0], b.v128[0]);
}

SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
  return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
}

typedef struct {
  uint32_t val;
  int count;
} c_sad256_internal;

SIMD_INLINE c_sad256_internal c_v256_sad_u8_init(void) {
  c_sad256_internal t;
  t.val = t.count = 0;
  return t;
}

/* Implementation dependent return value. Result must be finalised with
   v256_sad_u8_sum().
   The result for more than 16 v256_sad_u8() calls is undefined. */
SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
                                            c_v256 b) {
  int c;
  for (c = 0; c < 32; c++)
    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
  s.count++;
  if (SIMD_CHECK && s.count > 32) {
    fprintf(stderr,
            "Error: sad called more than 32 times, result undefined\n");
    abort();
  }
  return s;
}

SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s.val; }

typedef uint32_t c_ssd256_internal;

SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; }

/* Implementation dependent return value. Result must be finalised with
 * v256_ssd_u8_sum(). */
SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a,
                                            c_v256 b) {
  int c;
  for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
  return s;
}

SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; }

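/* Usage sketch for the SAD/SSD accumulators above (the names src, ref,
   stride and rows are illustrative, not part of this header). Only the
   *_sum() results are meaningful; the intermediate values are implementation
   dependent, and the documented call-count limit for c_v256_sad_u8() applies.

     c_sad256_internal sad = c_v256_sad_u8_init();
     c_ssd256_internal ssd = c_v256_ssd_u8_init();
     for (int r = 0; r < rows; r++) {
       c_v256 s = c_v256_load_unaligned(src + r * stride);
       c_v256 t = c_v256_load_unaligned(ref + r * stride);
       sad = c_v256_sad_u8(sad, s, t);
       ssd = c_v256_ssd_u8(ssd, s, t);
     }
     uint32_t sad_total = c_v256_sad_u8_sum(sad);
     uint32_t ssd_total = c_v256_ssd_u8_sum(ssd);
*/
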
SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]),
                          c_v128_or(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]),
                          c_v128_xor(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]),
                          c_v128_and(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]),
                          c_v128_andn(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]),
                          c_v128_add_8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]),
                          c_v128_add_16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]),
                          c_v128_sadd_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]),
                          c_v128_sadd_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]),
                          c_v128_sadd_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]),
                          c_v128_add_32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]),
                          c_v128_add_64(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]),
                          c_v128_sub_64(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) {
  c_v256 t;
  for (int i = 0; i < 16; i++)
    t.u16[i] = (uint16_t)a.u8[i * 2] + (uint16_t)a.u8[i * 2 + 1];
  return t;
}

SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) {
  c_v256 t;
  t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
  t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
  t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
  t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
  t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9];
  t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11];
  t.s32[6] = (int32_t)a.s16[12] + (int32_t)a.s16[13];
  t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15];
  return t;
}

SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]),
                          c_v128_sub_8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]),
                          c_v128_ssub_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]),
                          c_v128_ssub_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]),
                          c_v128_sub_16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]),
                          c_v128_ssub_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]),
                          c_v128_ssub_u16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]),
                          c_v128_sub_32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) {
  return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) {
  c_v128 lo_bits = c_v128_mullo_s16(a, b);
  c_v128 hi_bits = c_v128_mulhi_s16(a, b);
  return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits),
                          c_v128_ziplo_16(hi_bits, lo_bits));
}

SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]),
                          c_v128_mullo_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]),
                          c_v128_mulhi_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]),
                          c_v128_mullo_s32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]),
                          c_v128_madd_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]),
                          c_v128_madd_us8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]),
                          c_v128_avg_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]),
                          c_v128_rdavg_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]),
                          c_v128_rdavg_u16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]),
                          c_v128_avg_u16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]),
                          c_v128_min_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]),
                          c_v128_max_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]),
                          c_v128_min_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) {
  return ((a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) |
         ((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) |
         ((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) |
         ((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) |
         ((a.s8[23] < 0) << 23) | ((a.s8[22] < 0) << 22) |
         ((a.s8[21] < 0) << 21) | ((a.s8[20] < 0) << 20) |
         ((a.s8[19] < 0) << 19) | ((a.s8[18] < 0) << 18) |
         ((a.s8[17] < 0) << 17) | ((a.s8[16] < 0) << 16) |
         ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
         ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
         ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
         ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
         ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
         ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
         ((a.s8[0] < 0) << 0);
}

SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) {
  c_v256 t;
  for (int i = 0; i < 32; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
  return t;
}

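/* Illustrative combination of the mask operations above: c_v256_blend_8()
   picks a byte from b wherever the control byte has its sign bit set, so
   together with a signed compare (defined further below) it selects a
   per-byte signed maximum, matching c_v256_max_s8(a, b):

     c_v256 gt = c_v256_cmpgt_s8(b, a);      // 0xff where b > a, else 0
     c_v256 max = c_v256_blend_8(a, b, gt);  // b where b > a, otherwise a

   c_v256_movemask_8(gt) packs the 32 sign bits of gt into a uint32_t, bit n
   corresponding to byte lane n. */
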
SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]),
                          c_v128_max_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]),
                          c_v128_min_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]),
                          c_v128_max_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]),
                          c_v128_min_s32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]),
                          c_v128_max_s32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]),
                          c_v128_ziplo_8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]),
                          c_v128_ziplo_8(a.v128[1], b.v128[1]));
}

SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]),
                          c_v128_ziplo_16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]),
                          c_v128_ziplo_16(a.v128[1], b.v128[1]));
}

SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]),
                          c_v128_ziplo_32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]),
                          c_v128_ziplo_32(a.v128[1], b.v128[1]));
}

SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]),
                          c_v128_ziplo_64(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]),
                          c_v128_ziplo_64(a.v128[1], b.v128[1]));
}

SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) {
  return c_v256_from_v128(a.v128[0], b.v128[0]);
}

SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) {
  return c_v256_from_v128(a.v128[1], b.v128[1]);
}

SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) {
  return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b));
}

SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) {
  return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b));
}

SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) {
  return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b));
}

SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) {
  c_v256 t;
  int i;
  if (mode) {
    for (i = 0; i < 16; i++) {
      t.u8[i] = a.u8[i * 2 + 1];
      t.u8[i + 16] = b.u8[i * 2 + 1];
    }
  } else {
    for (i = 0; i < 16; i++) {
      t.u8[i] = b.u8[i * 2];
      t.u8[i + 16] = a.u8[i * 2];
    }
  }
  return t;
}

SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1)
                           : _c_v256_unzip_8(a, b, 0);
}

SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0)
                           : _c_v256_unzip_8(b, a, 1);
}

SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) {
  c_v256 t;
  int i;
  if (mode) {
    for (i = 0; i < 8; i++) {
      t.u16[i] = a.u16[i * 2 + 1];
      t.u16[i + 8] = b.u16[i * 2 + 1];
    }
  } else {
    for (i = 0; i < 8; i++) {
      t.u16[i] = b.u16[i * 2];
      t.u16[i + 8] = a.u16[i * 2];
    }
  }
  return t;
}

SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1)
                           : _c_v256_unzip_16(a, b, 0);
}

SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0)
                           : _c_v256_unzip_16(b, a, 1);
}

SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) {
  c_v256 t;
  if (mode) {
    t.u32[7] = b.u32[7];
    t.u32[6] = b.u32[5];
    t.u32[5] = b.u32[3];
    t.u32[4] = b.u32[1];
    t.u32[3] = a.u32[7];
    t.u32[2] = a.u32[5];
    t.u32[1] = a.u32[3];
    t.u32[0] = a.u32[1];
  } else {
    t.u32[7] = a.u32[6];
    t.u32[6] = a.u32[4];
    t.u32[5] = a.u32[2];
    t.u32[4] = a.u32[0];
    t.u32[3] = b.u32[6];
    t.u32[2] = b.u32[4];
    t.u32[1] = b.u32[2];
    t.u32[0] = b.u32[0];
  }
  return t;
}

SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1)
                           : _c_v256_unzip_32(a, b, 0);
}

SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0)
                           : _c_v256_unzip_32(b, a, 1);
}

SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) {
  c_v256 t;
  if (mode) {
    t.u64[3] = b.u64[3];
    t.u64[2] = b.u64[1];
    t.u64[1] = a.u64[3];
    t.u64[0] = a.u64[1];
  } else {
    t.u64[3] = a.u64[2];
    t.u64[2] = a.u64[0];
    t.u64[1] = b.u64[2];
    t.u64[0] = b.u64[0];
  }
  return t;
}

SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(a, b, 1)
                           : _c_v256_unzip_64(a, b, 0);
}

SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(b, a, 0)
                           : _c_v256_unzip_64(b, a, 1);
}

SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
}

SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]),
                          c_v128_unpacklo_u8_s16(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]),
                          c_v128_unpacklo_u8_s16(a.v128[1]));
}

SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) {
  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a), c_v128_unpacklo_s8_s16(a));
}

SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[0]),
                          c_v128_unpacklo_s8_s16(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[1]),
                          c_v128_unpacklo_s8_s16(a.v128[1]));
}

SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]),
                          c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]),
                          c_v128_pack_s32_u16(b.v128[1], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
                          c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]),
                          c_v128_pack_s16_s8(b.v128[1], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) {
  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a),
                          c_v128_unpacklo_u16_s32(a));
}

SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) {
  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a),
                          c_v128_unpacklo_s16_s32(a));
}

SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]),
                          c_v128_unpacklo_u16_s32(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]),
                          c_v128_unpacklo_s16_s32(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]),
                          c_v128_unpacklo_u16_s32(a.v128[1]));
}

SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]),
                          c_v128_unpacklo_s16_s32(a.v128[1]));
}

SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
  c_v256 t;
  int c;
  for (c = 0; c < 32; c++)
    t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
                                     : pattern.u8[c] & 31];

  return t;
}

SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) {
  c_v256 t;
  int c;
  for (c = 0; c < 32; c++)
    t.u8[c] = (pattern.u8[c] < 32
                   ? b.u8
                   : a.u8)[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
                                             : pattern.u8[c] & 31];
  return t;
}

// Pairwise / dual-lane shuffle: shuffle two 128 bit lanes.
SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) {
  return c_v256_from_v128(
      c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)),
      c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern)));
}

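/* Shuffle semantics, illustrated: each pattern byte selects a source byte.
   A pattern holding 31, 30, ..., 0 makes c_v256_shuffle_8() reverse all 32
   bytes of a. c_v256_wideshuffle_8() indexes a 64-byte table where pattern
   values 0..31 pick from b and 32..63 pick from a, while c_v256_pshuffle_8()
   applies the low and high halves of the pattern independently to the
   corresponding 128-bit half of a. */
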
SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]),
                          c_v128_cmpgt_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]),
                          c_v128_cmplt_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]),
                          c_v128_cmpeq_8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]),
                          c_v128_cmpgt_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]),
                          c_v128_cmplt_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]),
                          c_v128_cmpeq_16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]),
                          c_v128_cmpgt_s32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]),
                          c_v128_cmplt_s32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]),
                          c_v128_cmpeq_32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) {
  if (n == 0) return a;
  if (n < 16)
    return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
                                      c_v128_shr_n_byte(a.v128[0], 16 - n)),
                            c_v128_shl_n_byte(a.v128[0], n));
  else if (n > 16)
    return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16),
                            c_v128_zero());
  else
    return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero());
}

SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) {
  if (n == 0) return a;
  if (n < 16)
    return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n),
                            c_v128_or(c_v128_shr_n_byte(a.v128[0], n),
                                      c_v128_shl_n_byte(a.v128[1], 16 - n)));
  else if (n > 16)
    return c_v256_from_v128(c_v128_zero(),
                            c_v128_shr_n_byte(a.v128[1], n - 16));
  else
    return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a));
}

SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, unsigned int c) {
  if (SIMD_CHECK && c > 31) {
    fprintf(stderr, "Error: undefined alignment %d\n", c);
    abort();
  }
  return c ? c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c))
           : b;
}

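/* Alignment, illustrated: c_v256_align(a, b, c) returns bytes c .. c+31 of
   the 64-byte concatenation of a and b (b supplying the low 32 bytes),
   built as c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c)).
   For example, c == 1 drops the lowest byte of b and pulls in the lowest
   byte of a at the top. */
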
SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shl_8(a.v128[1], c),
                          c_v128_shl_8(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c),
                          c_v128_shr_u8(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c),
                          c_v128_shr_s8(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shl_16(a.v128[1], c),
                          c_v128_shl_16(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c),
                          c_v128_shr_u16(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c),
                          c_v128_shr_s16(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shl_32(a.v128[1], c),
                          c_v128_shl_32(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c),
                          c_v128_shr_u32(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) {
  return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c),
                          c_v128_shr_s32(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) {
  c_v256 t;
  if (SIMD_CHECK && n > 63) {
    fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
    abort();
  }
  t.s64[3] = a.s64[3] >> n;
  t.s64[2] = a.s64[2] >> n;
  t.s64[1] = a.s64[1] >> n;
  t.s64[0] = a.s64[0] >> n;
  return t;
}

SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) {
  c_v256 t;
  if (SIMD_CHECK && n > 63) {
    fprintf(stderr, "Error: undefined u64 shift right %d\n", n);
    abort();
  }
  t.u64[3] = a.u64[3] >> n;
  t.u64[2] = a.u64[2] >> n;
  t.u64[1] = a.u64[1] >> n;
  t.u64[0] = a.u64[0] >> n;
  return t;
}

SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) {
  c_v256 t;
  if (SIMD_CHECK && n > 63) {
    fprintf(stderr, "Error: undefined u64 shift left %d\n", n);
    abort();
  }
  t.u64[3] = a.u64[3] << n;
  t.u64[2] = a.u64[2] << n;
  t.u64[1] = a.u64[1] << n;
  t.u64[0] = a.u64[0] << n;
  return t;
}

SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) {
  return c_v256_shl_8(a, n);
}

SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, unsigned int n) {
  return c_v256_shl_16(a, n);
}

SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) {
  return c_v256_shl_32(a, n);
}

SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) {
  return c_v256_shl_64(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) {
  return c_v256_shr_u8(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, unsigned int n) {
  return c_v256_shr_u16(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) {
  return c_v256_shr_u32(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) {
  return c_v256_shr_u64(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) {
  return c_v256_shr_s8(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, unsigned int n) {
  return c_v256_shr_s16(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) {
  return c_v256_shr_s32(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a, unsigned int n) {
  return c_v256_shr_s64(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) {
  return c_v256_shr_n_byte(a, 2 * n);
}
SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned int n) {
  return c_v256_shl_n_byte(a, 2 * n);
}

typedef uint32_t c_sad256_internal_u16;

SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init() { return 0; }

/* Implementation dependent return value. Result must be finalised with
   v256_sad_u16_sum(). */
SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s,
                                                 c_v256 a, c_v256 b) {
  int c;
  for (c = 0; c < 16; c++)
    s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
  return s;
}

SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; }

typedef uint64_t c_ssd256_internal_s16;

SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init() { return 0; }

/* Implementation dependent return value. Result must be finalised with
 * v256_ssd_s16_sum(). */
SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s,
                                                 c_v256 a, c_v256 b) {
  int c;
  for (c = 0; c < 16; c++)
    s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
         (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
  return s;
}

SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; }

#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_