Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 1 | /* |
Lester Lu | 6bc30d6 | 2021-12-16 19:13:21 +0000 | [diff] [blame^] | 2 | * Copyright (c) 2021, Alliance for Open Media. All rights reserved |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 3 | * |
Lester Lu | 6bc30d6 | 2021-12-16 19:13:21 +0000 | [diff] [blame^] | 4 | * This source code is subject to the terms of the BSD 3-Clause Clear License |
| 5 | * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear |
| 6 | * License was not distributed with this source code in the LICENSE file, you |
| 7 | * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the |
| 8 | * Alliance for Open Media Patent License 1.0 was not distributed with this |
| 9 | * source code in the PATENTS file, you can obtain it at |
| 10 | * aomedia.org/license/patent-license/. |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 11 | */ |
| 12 | |
James Zern | e1cbb13 | 2018-08-22 14:10:36 -0700 | [diff] [blame] | 13 | #ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ |
| 14 | #define AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 15 | |
| 16 | #include <stdio.h> |
| 17 | #include <stdlib.h> |
Tom Finegan | 60e653d | 2018-05-22 11:34:58 -0700 | [diff] [blame] | 18 | |
| 19 | #include "config/aom_config.h" |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 20 | |
Tom Finegan | dd3e2a5 | 2018-05-23 14:33:09 -0700 | [diff] [blame] | 21 | #include "aom_dsp/simd/v128_intrinsics_c.h" |
| 22 | |
/* 256-bit vector emulated in plain C.  All members alias the same 32 bytes,
   viewed either as lanes of a given width or as smaller sub-vectors. */
typedef union {
  uint8_t u8[32];    // 32 unsigned 8-bit lanes
  uint16_t u16[16];  // 16 unsigned 16-bit lanes
  uint32_t u32[8];   // 8 unsigned 32-bit lanes
  uint64_t u64[4];   // 4 unsigned 64-bit lanes
  int8_t s8[32];     // signed views of the same lanes
  int16_t s16[16];
  int32_t s32[8];
  int64_t s64[4];
  c_v64 v64[4];    // four 64-bit sub-vectors
  c_v128 v128[2];  // two 128-bit halves: [0] = low, [1] = high
} c_v256;
| 35 | |
// Accessors extracting the low-order parts of a vector.
SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }

SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }

SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; }

SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }

// Upper 128-bit half.
SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }
| 45 | |
| 46 | SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) { |
| 47 | c_v256 t; |
| 48 | t.v128[1] = hi; |
| 49 | t.v128[0] = lo; |
| 50 | return t; |
| 51 | } |
| 52 | |
/* Build a vector from four 64-bit values; a is the most significant
   quarter, d the least significant. */
SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c,
                                  uint64_t d) {
  c_v256 t;
  t.u64[3] = a;
  t.u64[2] = b;
  t.u64[1] = c;
  t.u64[0] = d;
  return t;
}

/* Same as c_v256_from_64() but taking c_v64 arguments. */
SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) {
  c_v256 t;
  t.u64[3] = a.u64;
  t.u64[2] = b.u64;
  t.u64[1] = c.u64;
  t.u64[0] = d.u64;
  return t;
}
| 71 | |
| 72 | SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) { |
| 73 | c_v256 t; |
| 74 | uint8_t *pp = (uint8_t *)p; |
| 75 | uint8_t *q = (uint8_t *)&t; |
| 76 | int c; |
| 77 | for (c = 0; c < 32; c++) q[c] = pp[c]; |
| 78 | return t; |
| 79 | } |
| 80 | |
/* Load 32 bytes from a 32-byte aligned address.  When SIMD_CHECK is
   enabled, a misaligned pointer aborts with a diagnostic. */
SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 31) {
    fprintf(stderr, "Error: unaligned v256 load at %p\n", p);
    abort();
  }
  return c_v256_load_unaligned(p);
}
| 88 | |
| 89 | SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) { |
| 90 | uint8_t *pp = (uint8_t *)p; |
| 91 | uint8_t *q = (uint8_t *)&a; |
| 92 | int c; |
| 93 | for (c = 0; c < 32; c++) pp[c] = q[c]; |
| 94 | } |
| 95 | |
/* Store 32 bytes to a 32-byte aligned address.  When SIMD_CHECK is
   enabled, a misaligned pointer aborts with a diagnostic. */
SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {
  if (SIMD_CHECK && (uintptr_t)p & 31) {
    fprintf(stderr, "Error: unaligned v256 store at %p\n", p);
    abort();
  }
  c_v256_store_unaligned(p, a);
}
| 103 | |
| 104 | SIMD_INLINE c_v256 c_v256_zero() { |
| 105 | c_v256 t; |
| 106 | t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0; |
| 107 | return t; |
| 108 | } |
| 109 | |
/* Broadcast x into every 8-bit lane. */
SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) {
  c_v256 t;
  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x);
  return t;
}

/* Broadcast x into every 16-bit lane. */
SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) {
  c_v256 t;
  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x);
  return t;
}

/* Broadcast x into every 32-bit lane. */
SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
  c_v256 t;
  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x);
  return t;
}

/* Broadcast x into every 64-bit lane. */
SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) {
  c_v256 t;
  t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = x;
  return t;
}
| 133 | |
/* Dot products and horizontal add: each is the sum of the corresponding
   v128 operation applied to the high and low halves. */
SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) {
  return c_v128_dotp_su8(a.v128[1], b.v128[1]) +
         c_v128_dotp_su8(a.v128[0], b.v128[0]);
}

SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
  return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
         c_v128_dotp_s16(a.v128[0], b.v128[0]);
}

SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) {
  return c_v128_dotp_s32(a.v128[1], b.v128[1]) +
         c_v128_dotp_s32(a.v128[0], b.v128[0]);
}

/* Horizontal add of all unsigned byte lanes. */
SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
  return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
}
| 152 | |
/* Running state for the SAD (sum of absolute differences) accumulator. */
typedef struct {
  uint32_t val;  // accumulated SAD so far
  int count;     // number of accumulation calls, checked against the limit
} c_sad256_internal;

SIMD_INLINE c_sad256_internal c_v256_sad_u8_init(void) {
  c_sad256_internal t;
  t.val = t.count = 0;
  return t;
}
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 163 | |
| 164 | /* Implementation dependent return value. Result must be finalised with |
| 165 | v256_sad_u8_sum(). |
| 166 | The result for more than 16 v256_sad_u8() calls is undefined. */ |
Steinar Midtskogen | 0578d43 | 2018-05-28 14:47:36 +0200 | [diff] [blame] | 167 | SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a, |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 168 | c_v256 b) { |
| 169 | int c; |
| 170 | for (c = 0; c < 32; c++) |
Steinar Midtskogen | 50b2fc2 | 2020-03-24 14:23:51 +0100 | [diff] [blame] | 171 | s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; |
| 172 | s.count++; |
| 173 | if (SIMD_CHECK && s.count > 32) { |
| 174 | fprintf(stderr, |
| 175 | "Error: sad called 32 times returning an undefined result\n"); |
| 176 | abort(); |
| 177 | } |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 178 | return s; |
| 179 | } |
| 180 | |
Steinar Midtskogen | 50b2fc2 | 2020-03-24 14:23:51 +0100 | [diff] [blame] | 181 | SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s.val; } |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 182 | |
| 183 | typedef uint32_t c_ssd256_internal; |
| 184 | |
| 185 | SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; } |
| 186 | |
| 187 | /* Implementation dependent return value. Result must be finalised with |
| 188 | * v256_ssd_u8_sum(). */ |
| 189 | SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a, |
| 190 | c_v256 b) { |
| 191 | int c; |
| 192 | for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); |
| 193 | return s; |
| 194 | } |
| 195 | |
| 196 | SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; } |
| 197 | |
/* Bitwise operations and lane-wise adds, each delegated to the
   corresponding v128 operation on the two 128-bit halves. */
SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]),
                          c_v128_or(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]),
                          c_v128_xor(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]),
                          c_v128_and(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]),
                          c_v128_andn(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]),
                          c_v128_add_8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]),
                          c_v128_add_16(a.v128[0], b.v128[0]));
}
| 227 | |
/* Saturating adds, and wide adds/subtracts, per 128-bit half. */
SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]),
                          c_v128_sadd_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]),
                          c_v128_sadd_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]),
                          c_v128_sadd_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]),
                          c_v128_add_32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]),
                          c_v128_add_64(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]),
                          c_v128_sub_64(a.v128[0], b.v128[0]));
}
| 257 | |
| 258 | SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) { |
| 259 | c_v256 t; |
| 260 | for (int i = 0; i < 16; i++) |
| 261 | t.u16[i] = (uint16_t)a.u8[i * 2] + (uint16_t)a.u8[i * 2 + 1]; |
| 262 | return t; |
| 263 | } |
| 264 | |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 265 | SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) { |
| 266 | c_v256 t; |
| 267 | t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1]; |
| 268 | t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3]; |
| 269 | t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5]; |
| 270 | t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7]; |
| 271 | t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9]; |
| 272 | t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11]; |
| 273 | t.s32[6] = (int32_t)a.s16[12] + (int32_t)a.s16[13]; |
| 274 | t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15]; |
| 275 | return t; |
| 276 | } |
| 277 | |
/* Lane-wise subtracts (plain and saturating), per 128-bit half. */
SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]),
                          c_v128_sub_8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]),
                          c_v128_ssub_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]),
                          c_v128_ssub_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]),
                          c_v128_sub_16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]),
                          c_v128_ssub_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]),
                          c_v128_ssub_u16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]),
                          c_v128_sub_32(a.v128[0], b.v128[0]));
}
| 312 | |
/* Lane-wise absolute value, per 128-bit half. */
SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) {
  return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0]));
}
| 320 | |
/* Widening 16x16->32 multiply: the low and high halves of each product
   are computed separately, then zipped back into 32-bit lanes. */
SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) {
  c_v128 lo_bits = c_v128_mullo_s16(a, b);
  c_v128 hi_bits = c_v128_mulhi_s16(a, b);
  return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits),
                          c_v128_ziplo_16(hi_bits, lo_bits));
}
| 327 | |
/* Multiplies and multiply-accumulates, per 128-bit half. */
SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]),
                          c_v128_mullo_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]),
                          c_v128_mulhi_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]),
                          c_v128_mullo_s32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]),
                          c_v128_madd_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]),
                          c_v128_madd_us8(a.v128[0], b.v128[0]));
}
| 352 | |
/* Averages and min/max, per 128-bit half. */
SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]),
                          c_v128_avg_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]),
                          c_v128_rdavg_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]),
                          c_v128_rdavg_u16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]),
                          c_v128_avg_u16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]),
                          c_v128_min_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]),
                          c_v128_max_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]),
                          c_v128_min_s8(a.v128[0], b.v128[0]));
}
| 387 | |
Steinar Midtskogen | 0578d43 | 2018-05-28 14:47:36 +0200 | [diff] [blame] | 388 | SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) { |
| 389 | return ((a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) | |
| 390 | ((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) | |
| 391 | ((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) | |
| 392 | ((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) | |
| 393 | ((a.s8[23] < 0) << 23) | ((a.s8[22] < 0) << 22) | |
| 394 | ((a.s8[21] < 0) << 21) | ((a.s8[20] < 0) << 20) | |
| 395 | ((a.s8[19] < 0) << 19) | ((a.s8[18] < 0) << 18) | |
| 396 | ((a.s8[17] < 0) << 17) | ((a.s8[16] < 0) << 16) | |
| 397 | ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) | |
| 398 | ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) | |
| 399 | ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) | |
| 400 | ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) | |
| 401 | ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) | |
| 402 | ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) | |
| 403 | ((a.s8[0] < 0) << 0); |
| 404 | } |
| 405 | |
| 406 | SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) { |
| 407 | c_v256 t; |
| 408 | for (int i = 0; i < 32; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i]; |
| 409 | return t; |
| 410 | } |
| 411 | |
/* Remaining min/max variants, per 128-bit half. */
SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]),
                          c_v128_max_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]),
                          c_v128_min_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]),
                          c_v128_max_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]),
                          c_v128_min_s32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]),
                          c_v128_max_s32(a.v128[0], b.v128[0]));
}
| 436 | |
/* Interleave (zip) operations.  The "lo" variants interleave lanes taken
   from the low 128-bit halves of both inputs, the "hi" variants from the
   high halves; the v128 zip helpers do the per-half interleave. */
SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]),
                          c_v128_ziplo_8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]),
                          c_v128_ziplo_8(a.v128[1], b.v128[1]));
}

SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]),
                          c_v128_ziplo_16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]),
                          c_v128_ziplo_16(a.v128[1], b.v128[1]));
}

SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]),
                          c_v128_ziplo_32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]),
                          c_v128_ziplo_32(a.v128[1], b.v128[1]));
}

SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]),
                          c_v128_ziplo_64(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]),
                          c_v128_ziplo_64(a.v128[1], b.v128[1]));
}

/* 128-bit granularity: concatenate the two low (or two high) halves. */
SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) {
  return c_v256_from_v128(a.v128[0], b.v128[0]);
}

SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) {
  return c_v256_from_v128(a.v128[1], b.v128[1]);
}

/* Full-width zips of two 128-bit inputs into one 256-bit result. */
SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) {
  return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b));
}

SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) {
  return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b));
}

SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) {
  return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b));
}
| 496 | |
/* De-interleave bytes: mode selects odd (1) or even (0) lanes.  The
   public unziplo/unziphi wrappers pick mode and argument order so the
   result is endian-independent. */
SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) {
  c_v256 t;
  int i;
  if (mode) {
    for (i = 0; i < 16; i++) {
      t.u8[i] = a.u8[i * 2 + 1];
      t.u8[i + 16] = b.u8[i * 2 + 1];
    }
  } else {
    for (i = 0; i < 16; i++) {
      t.u8[i] = b.u8[i * 2];
      t.u8[i + 16] = a.u8[i * 2];
    }
  }
  return t;
}

SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1)
                           : _c_v256_unzip_8(a, b, 0);
}

SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0)
                           : _c_v256_unzip_8(b, a, 1);
}
| 523 | |
/* De-interleave 16-bit lanes; same scheme as _c_v256_unzip_8(). */
SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) {
  c_v256 t;
  int i;
  if (mode) {
    for (i = 0; i < 8; i++) {
      t.u16[i] = a.u16[i * 2 + 1];
      t.u16[i + 8] = b.u16[i * 2 + 1];
    }
  } else {
    for (i = 0; i < 8; i++) {
      t.u16[i] = b.u16[i * 2];
      t.u16[i + 8] = a.u16[i * 2];
    }
  }
  return t;
}

SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1)
                           : _c_v256_unzip_16(a, b, 0);
}

SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0)
                           : _c_v256_unzip_16(b, a, 1);
}
| 550 | |
| 551 | SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) { |
| 552 | c_v256 t; |
| 553 | if (mode) { |
| 554 | t.u32[7] = b.u32[7]; |
| 555 | t.u32[6] = b.u32[5]; |
| 556 | t.u32[5] = b.u32[3]; |
| 557 | t.u32[4] = b.u32[1]; |
| 558 | t.u32[3] = a.u32[7]; |
| 559 | t.u32[2] = a.u32[5]; |
| 560 | t.u32[1] = a.u32[3]; |
| 561 | t.u32[0] = a.u32[1]; |
| 562 | } else { |
| 563 | t.u32[7] = a.u32[6]; |
| 564 | t.u32[6] = a.u32[4]; |
| 565 | t.u32[5] = a.u32[2]; |
| 566 | t.u32[4] = a.u32[0]; |
| 567 | t.u32[3] = b.u32[6]; |
| 568 | t.u32[2] = b.u32[4]; |
| 569 | t.u32[1] = b.u32[2]; |
| 570 | t.u32[0] = b.u32[0]; |
| 571 | } |
| 572 | return t; |
| 573 | } |
| 574 | |
| 575 | SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) { |
| 576 | return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1) |
| 577 | : _c_v256_unzip_32(a, b, 0); |
| 578 | } |
| 579 | |
| 580 | SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) { |
| 581 | return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0) |
| 582 | : _c_v256_unzip_32(b, a, 1); |
| 583 | } |
| 584 | |
/* De-interleave 64-bit lanes; same scheme as the narrower unzips. */
SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) {
  c_v256 t;
  if (mode) {
    t.u64[3] = b.u64[3];
    t.u64[2] = b.u64[1];
    t.u64[1] = a.u64[3];
    t.u64[0] = a.u64[1];
  } else {
    t.u64[3] = a.u64[2];
    t.u64[2] = a.u64[0];
    t.u64[1] = b.u64[2];
    t.u64[0] = b.u64[0];
  }
  return t;
}

SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(a, b, 1)
                           : _c_v256_unzip_64(a, b, 0);
}

SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(b, a, 0)
                           : _c_v256_unzip_64(b, a, 1);
}
| 610 | |
/* Widen 8-bit lanes to 16-bit lanes (unsigned and signed variants),
   delegating to the v128 unpack helpers per half. */
SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
}

SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]),
                          c_v128_unpacklo_u8_s16(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]),
                          c_v128_unpacklo_u8_s16(a.v128[1]));
}

SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) {
  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a), c_v128_unpacklo_s8_s16(a));
}

SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[0]),
                          c_v128_unpacklo_s8_s16(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[1]),
                          c_v128_unpacklo_s8_s16(a.v128[1]));
}
| 638 | |
/* Narrowing packs: a supplies the high half of the result, b the low
   half; each half packs its own two v128 sub-vectors. */
SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]),
                          c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]),
                          c_v128_pack_s32_u16(b.v128[1], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
                          c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]),
                          c_v128_pack_s16_s8(b.v128[1], b.v128[0]));
}
| 658 | |
/* Widen 16-bit lanes to 32-bit lanes (unsigned and signed variants). */
SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) {
  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a),
                          c_v128_unpacklo_u16_s32(a));
}

SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) {
  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a),
                          c_v128_unpacklo_s16_s32(a));
}

SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]),
                          c_v128_unpacklo_u16_s32(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]),
                          c_v128_unpacklo_s16_s32(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]),
                          c_v128_unpacklo_u16_s32(a.v128[1]));
}

SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]),
                          c_v128_unpacklo_s16_s32(a.v128[1]));
}
| 688 | |
/* Full 256-bit byte shuffle: result byte c is a's byte selected by the
   low 5 bits of pattern byte c (index mirrored on big-endian targets). */
SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
  c_v256 t;
  int c;
  for (c = 0; c < 32; c++)
    t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
                                     : pattern.u8[c] & 31];

  return t;
}
| 698 | |
/* Byte shuffle over a 512-bit source: pattern values 0..31 select from b,
   32..63 select from a (low 5 bits index within the chosen vector). */
SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) {
  c_v256 t;
  int c;
  for (c = 0; c < 32; c++)
    t.u8[c] = (pattern.u8[c] < 32
                   ? b.u8
                   : a.u8)[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
                                             : pattern.u8[c] & 31];
  return t;
}
| 709 | |
// Pairwise / dual-lane shuffle: shuffle two 128 bit lanes independently,
// each using its own half of the pattern.
SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) {
  return c_v256_from_v128(
      c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)),
      c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern)));
}
| 716 | |
/* Lane-wise comparisons, per 128-bit half; result lanes come from the
   corresponding v128 comparison. */
SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]),
                          c_v128_cmpgt_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]),
                          c_v128_cmplt_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]),
                          c_v128_cmpeq_8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]),
                          c_v128_cmpgt_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]),
                          c_v128_cmplt_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]),
                          c_v128_cmpeq_16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]),
                          c_v128_cmpgt_s32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]),
                          c_v128_cmplt_s32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]),
                          c_v128_cmpeq_32(a.v128[0], b.v128[0]));
}
| 761 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 762 | SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) { |
Steinar Midtskogen | 50b2fc2 | 2020-03-24 14:23:51 +0100 | [diff] [blame] | 763 | if (n == 0) return a; |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 764 | if (n < 16) |
| 765 | return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n), |
| 766 | c_v128_shr_n_byte(a.v128[0], 16 - n)), |
| 767 | c_v128_shl_n_byte(a.v128[0], n)); |
| 768 | else if (n > 16) |
| 769 | return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16), |
| 770 | c_v128_zero()); |
| 771 | else |
| 772 | return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero()); |
| 773 | } |
| 774 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 775 | SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) { |
Steinar Midtskogen | 50b2fc2 | 2020-03-24 14:23:51 +0100 | [diff] [blame] | 776 | if (n == 0) return a; |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 777 | if (n < 16) |
| 778 | return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n), |
| 779 | c_v128_or(c_v128_shr_n_byte(a.v128[0], n), |
| 780 | c_v128_shl_n_byte(a.v128[1], 16 - n))); |
| 781 | else if (n > 16) |
| 782 | return c_v256_from_v128(c_v128_zero(), |
| 783 | c_v128_shr_n_byte(a.v128[1], n - 16)); |
| 784 | else |
| 785 | return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a)); |
| 786 | } |
| 787 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 788 | SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, unsigned int c) { |
Steinar Midtskogen | ea42c4e | 2016-12-12 09:40:34 +0100 | [diff] [blame] | 789 | if (SIMD_CHECK && c > 31) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 790 | fprintf(stderr, "Error: undefined alignment %d\n", c); |
| 791 | abort(); |
| 792 | } |
| 793 | return c ? c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c)) |
| 794 | : b; |
| 795 | } |
| 796 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 797 | SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, unsigned int c) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 798 | return c_v256_from_v128(c_v128_shl_8(a.v128[1], c), |
| 799 | c_v128_shl_8(a.v128[0], c)); |
| 800 | } |
| 801 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 802 | SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, unsigned int c) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 803 | return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c), |
| 804 | c_v128_shr_u8(a.v128[0], c)); |
| 805 | } |
| 806 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 807 | SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, unsigned int c) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 808 | return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c), |
| 809 | c_v128_shr_s8(a.v128[0], c)); |
| 810 | } |
| 811 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 812 | SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, unsigned int c) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 813 | return c_v256_from_v128(c_v128_shl_16(a.v128[1], c), |
| 814 | c_v128_shl_16(a.v128[0], c)); |
| 815 | } |
| 816 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 817 | SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, unsigned int c) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 818 | return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c), |
| 819 | c_v128_shr_u16(a.v128[0], c)); |
| 820 | } |
| 821 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 822 | SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, unsigned int c) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 823 | return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c), |
| 824 | c_v128_shr_s16(a.v128[0], c)); |
| 825 | } |
| 826 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 827 | SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, unsigned int c) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 828 | return c_v256_from_v128(c_v128_shl_32(a.v128[1], c), |
| 829 | c_v128_shl_32(a.v128[0], c)); |
| 830 | } |
| 831 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 832 | SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, unsigned int c) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 833 | return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c), |
| 834 | c_v128_shr_u32(a.v128[0], c)); |
| 835 | } |
| 836 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 837 | SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 838 | return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c), |
| 839 | c_v128_shr_s32(a.v128[0], c)); |
| 840 | } |
| 841 | |
Steinar Midtskogen | 0578d43 | 2018-05-28 14:47:36 +0200 | [diff] [blame] | 842 | SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) { |
| 843 | c_v256 t; |
| 844 | if (SIMD_CHECK && n > 63) { |
| 845 | fprintf(stderr, "Error: undefined s64 shift right %d\n", n); |
| 846 | abort(); |
| 847 | } |
| 848 | t.s64[3] = a.s64[3] >> n; |
| 849 | t.s64[2] = a.s64[2] >> n; |
| 850 | t.s64[1] = a.s64[1] >> n; |
| 851 | t.s64[0] = a.s64[0] >> n; |
| 852 | return t; |
| 853 | } |
| 854 | |
| 855 | SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) { |
| 856 | c_v256 t; |
| 857 | if (SIMD_CHECK && n > 63) { |
| 858 | fprintf(stderr, "Error: undefined s64 shift right %d\n", n); |
| 859 | abort(); |
| 860 | } |
| 861 | t.u64[3] = a.u64[3] >> n; |
| 862 | t.u64[2] = a.u64[2] >> n; |
| 863 | t.u64[1] = a.u64[1] >> n; |
| 864 | t.u64[0] = a.u64[0] >> n; |
| 865 | return t; |
| 866 | } |
| 867 | |
| 868 | SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) { |
| 869 | c_v256 t; |
| 870 | if (SIMD_CHECK && n > 63) { |
| 871 | fprintf(stderr, "Error: undefined s64 shift right %d\n", n); |
| 872 | abort(); |
| 873 | } |
| 874 | t.u64[3] = a.u64[3] << n; |
| 875 | t.u64[2] = a.u64[2] << n; |
| 876 | t.u64[1] = a.u64[1] << n; |
| 877 | t.u64[0] = a.u64[0] << n; |
| 878 | return t; |
| 879 | } |
| 880 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 881 | SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 882 | return c_v256_shl_8(a, n); |
| 883 | } |
| 884 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 885 | SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, unsigned int n) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 886 | return c_v256_shl_16(a, n); |
| 887 | } |
| 888 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 889 | SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 890 | return c_v256_shl_32(a, n); |
| 891 | } |
| 892 | |
Steinar Midtskogen | 0578d43 | 2018-05-28 14:47:36 +0200 | [diff] [blame] | 893 | SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) { |
| 894 | return c_v256_shl_64(a, n); |
| 895 | } |
| 896 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 897 | SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 898 | return c_v256_shr_u8(a, n); |
| 899 | } |
| 900 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 901 | SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, unsigned int n) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 902 | return c_v256_shr_u16(a, n); |
| 903 | } |
| 904 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 905 | SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 906 | return c_v256_shr_u32(a, n); |
| 907 | } |
| 908 | |
Steinar Midtskogen | 0578d43 | 2018-05-28 14:47:36 +0200 | [diff] [blame] | 909 | SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) { |
| 910 | return c_v256_shr_u64(a, n); |
| 911 | } |
| 912 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 913 | SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 914 | return c_v256_shr_s8(a, n); |
| 915 | } |
| 916 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 917 | SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, unsigned int n) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 918 | return c_v256_shr_s16(a, n); |
| 919 | } |
| 920 | |
Yaowu Xu | 032573d | 2017-04-24 15:04:17 -0700 | [diff] [blame] | 921 | SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) { |
Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 922 | return c_v256_shr_s32(a, n); |
| 923 | } |
| 924 | |
Steinar Midtskogen | 0578d43 | 2018-05-28 14:47:36 +0200 | [diff] [blame] | 925 | SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a, unsigned int n) { |
| 926 | return c_v256_shr_s64(a, n); |
| 927 | } |
| 928 | |
| 929 | SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) { |
| 930 | return c_v256_shr_n_byte(a, 2 * n); |
| 931 | } |
| 932 | SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned int n) { |
| 933 | return c_v256_shl_n_byte(a, 2 * n); |
| 934 | } |
| 935 | |
| 936 | typedef uint32_t c_sad256_internal_u16; |
| 937 | |
| 938 | SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init() { return 0; } |
| 939 | |
| 940 | /* Implementation dependent return value. Result must be finalised with |
| 941 | v256_sad_u16_sum(). */ |
| 942 | SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s, |
| 943 | c_v256 a, c_v256 b) { |
| 944 | int c; |
| 945 | for (c = 0; c < 16; c++) |
| 946 | s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c]; |
| 947 | return s; |
| 948 | } |
| 949 | |
| 950 | SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; } |
| 951 | |
| 952 | typedef uint64_t c_ssd256_internal_s16; |
| 953 | |
| 954 | SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init() { return 0; } |
| 955 | |
| 956 | /* Implementation dependent return value. Result must be finalised with |
| 957 | * v256_ssd_s16_sum(). */ |
| 958 | SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s, |
| 959 | c_v256 a, c_v256 b) { |
| 960 | int c; |
| 961 | for (c = 0; c < 16; c++) |
| 962 | s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) * |
| 963 | (int32_t)(int16_t)(a.s16[c] - b.s16[c]); |
| 964 | return s; |
| 965 | } |
| 966 | |
| 967 | SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; } |
| 968 | |
James Zern | e1cbb13 | 2018-08-22 14:10:36 -0700 | [diff] [blame] | 969 | #endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ |