/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_

#if !defined(__AVX2__)

#include "aom_dsp/simd/v256_intrinsics_v128.h"

#else

// The __m256i type seems to cause problems for g++'s mangling prior to
// version 5, but adding -fabi-version=0 fixes this.
#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 && \
    defined(__AVX2__) && defined(__cplusplus)
#pragma GCC optimize "-fabi-version=0"
#endif

#include <immintrin.h>

#include "aom_dsp/simd/v128_intrinsics_x86.h"

typedef __m256i v256;

SIMD_INLINE uint32_t v256_low_u32(v256 a) {
  return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0));
}

SIMD_INLINE v64 v256_low_v64(v256 a) {
  return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
}

SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }

SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); }

SIMD_INLINE v128 v256_high_v128(v256 a) {
  return _mm256_extracti128_si256(a, 1);
}

SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
  // gcc seems to be missing _mm256_set_m128i()
  return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1);
}

SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
}

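// Note: as with v256_from_v128() and v256_from_v64(), the first argument
// supplies the most significant bits of the result, so v256_from_64(a, b, c,
// d) places a in bits [255:192] and d in bits [63:0].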
SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
  // Casts avoid implicit unsigned-to-signed conversion warnings.
  return _mm256_set_epi64x((int64_t)a, (int64_t)b, (int64_t)c, (int64_t)d);
}

SIMD_INLINE v256 v256_load_aligned(const void *p) {
  return _mm256_load_si256((const __m256i *)p);
}

SIMD_INLINE v256 v256_load_unaligned(const void *p) {
  return _mm256_loadu_si256((const __m256i *)p);
}

SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
  _mm256_store_si256((__m256i *)p, a);
}

SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
  _mm256_storeu_si256((__m256i *)p, a);
}

SIMD_INLINE v256 v256_zero(void) { return _mm256_setzero_si256(); }

SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); }

SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); }

SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); }

SIMD_INLINE v256 v256_dup_64(uint64_t x) { return _mm256_set1_epi64x(x); }

SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }

SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }

SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return _mm256_adds_epu8(a, b); }

SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return _mm256_adds_epi8(a, b); }

SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
  return _mm256_adds_epi16(a, b);
}

SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); }

SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return _mm256_add_epi64(a, b); }

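// Pairwise add of adjacent unsigned bytes to 16-bit sums: maddubs multiplies
// each unsigned byte of a by the corresponding signed byte of the second
// operand (here a vector of 1s) and adds each horizontal pair.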
SIMD_INLINE v256 v256_padd_u8(v256 a) {
  return _mm256_maddubs_epi16(a, _mm256_set1_epi8(1));
}

SIMD_INLINE v256 v256_padd_s16(v256 a) {
  return _mm256_madd_epi16(a, _mm256_set1_epi16(1));
}

SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); }

SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); }

SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); }

SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); }

SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
  return _mm256_subs_epi16(a, b);
}

SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
  return _mm256_subs_epu16(a, b);
}

SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }

SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return _mm256_sub_epi64(a, b); }

SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }

SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); }

Steinar Midtskogen | 045d413 | 2016-10-18 12:20:05 +0200 | [diff] [blame] | 137 | // AVX doesn't have the direct intrinsics to zip/unzip 8, 16, 32 bit |
| 138 | // lanes of lower or upper halves of a 256bit vector because the |
| 139 | // unpack/pack intrinsics operate on the 256 bit input vector as 2 |
| 140 | // independent 128 bit vectors. |
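// The workaround used here: after _mm256_permute4x64_epi64(x,
// _MM_SHUFFLE(3, 1, 2, 0)), which reorders the 64-bit quarters {q0, q1, q2,
// q3} to {q0, q2, q1, q3}, the low 8 bytes of the two 128-bit lanes hold
// bytes 0..7 and 8..15 of x (its low half) and the high 8 bytes hold bytes
// 16..23 and 24..31 (its high half), so a per-lane unpacklo/unpackhi of two
// inputs permuted this way yields a true 256-bit interleave.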
SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
  return _mm256_unpacklo_epi8(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
  return _mm256_unpackhi_epi8(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
  return _mm256_unpacklo_epi16(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
  return _mm256_unpackhi_epi16(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
  return _mm256_unpacklo_epi32(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
  return _mm256_unpackhi_epi32(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
  return _mm256_unpacklo_epi64(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
  return _mm256_unpackhi_epi64(
      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
}

SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
  return _mm256_permute2x128_si256(a, b, 0x02);
}

SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
  return _mm256_permute2x128_si256(a, b, 0x13);
}

SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
}

SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
}

SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
}

SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_packs_epi16(_mm256_srai_epi16(b, 8), _mm256_srai_epi16(a, 8)),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
  return v256_unziphi_8(_mm256_slli_si256(a, 1), _mm256_slli_si256(b, 1));
}

SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_packs_epi32(_mm256_srai_epi32(b, 16), _mm256_srai_epi32(a, 16)),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
  return v256_unziphi_16(_mm256_slli_si256(a, 2), _mm256_slli_si256(b, 2));
}

SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
                                            _mm256_castsi256_ps(a),
                                            _MM_SHUFFLE(3, 1, 3, 1))),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
                                            _mm256_castsi256_ps(a),
                                            _MM_SHUFFLE(2, 0, 2, 0))),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b),
                                            _mm256_castsi256_pd(a), 15)),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(
      _mm256_castpd_si256(
          _mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 0)),
      _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return _mm256_cvtepu8_epi16(a); }

SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
  return _mm256_unpacklo_epi8(
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_setzero_si256());
}

SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
  return _mm256_unpackhi_epi8(
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_setzero_si256());
}

SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
}

SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
  return _mm256_srai_epi16(
      _mm256_unpacklo_epi8(
          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
      8);
}

SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
  return _mm256_srai_epi16(
      _mm256_unpackhi_epi8(
          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
      8);
}

SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(_mm256_packs_epi32(b, a),
                                  _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(_mm256_packus_epi32(b, a),
                                  _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a),
                                  _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
  return _mm256_permute4x64_epi64(_mm256_packs_epi16(b, a),
                                  _MM_SHUFFLE(3, 1, 2, 0));
}

SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
  return _mm256_cvtepu16_epi32(a);
}

SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
  return _mm256_cvtepi16_epi32(a);
}

SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
  return _mm256_unpacklo_epi16(
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_setzero_si256());
}

SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
  return _mm256_srai_epi32(
      _mm256_unpacklo_epi16(
          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
      16);
}

SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
  return _mm256_unpackhi_epi16(
      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
      _mm256_setzero_si256());
}

SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
  return _mm256_srai_epi32(
      _mm256_unpackhi_epi16(
          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
      16);
}

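// _mm256_shuffle_epi8() indexes only within each 128-bit lane, so a full
// 32-byte table lookup is built from two per-lane shuffles of the two
// possible source lanes, selected with a blend on pattern < 16 (and, in
// v256_wideshuffle_8(), further blends to choose between the a and b halves
// of the 64-byte table).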
SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
  return _mm256_blendv_epi8(
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1)), pattern),
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0)), pattern),
      _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
}

SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
  v256 c32 = v256_dup_8(32);
  v256 p32 = v256_sub_8(pattern, c32);
  v256 r1 = _mm256_blendv_epi8(
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1)), p32),
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0)), p32),
      _mm256_cmpgt_epi8(v256_dup_8(48), pattern));
  v256 r2 = _mm256_blendv_epi8(
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3)), pattern),
      _mm256_shuffle_epi8(
          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2)), pattern),
      _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
  return _mm256_blendv_epi8(r1, r2, _mm256_cmpgt_epi8(c32, pattern));
}

SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
  return _mm256_shuffle_epi8(a, pattern);
}

SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
  v256 t1 = _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b));
  v256 t2 = _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b));
  t1 = _mm256_add_epi32(t1, t2);
  v128 t = _mm_add_epi32(_mm256_extracti128_si256(t1, 0),
                         _mm256_extracti128_si256(t1, 1));
  t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
  t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
  return (int32_t)v128_low_u32(t);
}

SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
  v256 r = _mm256_madd_epi16(a, b);
#if defined(__x86_64__)
  v128 t;
  r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
                       _mm256_cvtepi32_epi64(v256_low_v128(r)));
  t = v256_low_v128(_mm256_add_epi64(
      r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
  return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
#else
  v128 l = v256_low_v128(r);
  v128 h = v256_high_v128(r);
  return (int64_t)_mm_cvtsi128_si32(l) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
         (int64_t)_mm_cvtsi128_si32(h) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
#endif
}

SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
  v256 r = _mm256_mullo_epi32(a, b);
#if defined(__x86_64__)
  v128 t;
  r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
                       _mm256_cvtepi32_epi64(v256_low_v128(r)));
  t = v256_low_v128(_mm256_add_epi64(
      r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
  return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
#else
  v128 l = v256_low_v128(r);
  v128 h = v256_high_v128(r);
  return (int64_t)_mm_cvtsi128_si32(l) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
         (int64_t)_mm_cvtsi128_si32(h) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
#endif
}

SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
  v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
  v128 lo = v256_low_v128(t);
  v128 hi = v256_high_v128(t);
  lo = v128_add_32(lo, hi);
  return v64_low_u32(v128_low_v64(lo)) + v64_low_u32(v128_high_v64(lo));
}

typedef v256 sad256_internal;

SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
  return _mm256_setzero_si256();
}

/* Implementation dependent return value. Result must be finalised with
   v256_sad_u8_sum().
   The result for more than 32 v256_sad_u8() calls is undefined. */
SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
  return _mm256_add_epi64(s, _mm256_sad_epu8(a, b));
}

SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
  v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
  return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
}
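
/* Illustrative usage sketch (an addition for clarity, not part of the aom
   API): accumulate the SAD over n rows of 32 bytes, with n <= 32 as required
   above. The src/ref pointers and strides are hypothetical parameters. */
SIMD_INLINE uint32_t v256_example_sad_32xn(const uint8_t *src, int src_stride,
                                           const uint8_t *ref, int ref_stride,
                                           int n) {
  sad256_internal s = v256_sad_u8_init();
  int i;
  for (i = 0; i < n; i++) {  // one 32-byte row per iteration
    s = v256_sad_u8(s, v256_load_unaligned(src + i * src_stride),
                    v256_load_unaligned(ref + i * ref_stride));
  }
  return v256_sad_u8_sum(s);
}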

typedef v256 ssd256_internal;

SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
  return _mm256_setzero_si256();
}

/* Implementation dependent return value. Result must be finalised with
 * v256_ssd_u8_sum(). */
SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
  v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()),
                            _mm256_unpacklo_epi8(b, _mm256_setzero_si256()));
  v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()),
                            _mm256_unpackhi_epi8(b, _mm256_setzero_si256()));
  v256 rl = _mm256_madd_epi16(l, l);
  v256 rh = _mm256_madd_epi16(h, h);
  v128 c = _mm_cvtsi32_si128(32);
  rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8));
  rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4));
  rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8));
  rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4));
  return _mm256_add_epi64(
      s,
      _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c));
}

SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
  v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
  return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
}

SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); }

SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); }

SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); }

SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); }

SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) {
  v128 lo_bits = v128_mullo_s16(a, b);
  v128 hi_bits = v128_mulhi_s16(a, b);
  return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
                        v128_ziplo_16(hi_bits, lo_bits));
}

SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
  return _mm256_mullo_epi16(a, b);
}

SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
  return _mm256_mulhi_epi16(a, b);
}

SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
  return _mm256_mullo_epi32(a, b);
}

SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
  return _mm256_madd_epi16(a, b);
}

SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
  return _mm256_maddubs_epi16(a, b);
}

SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); }

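// _mm256_avg_epu8()/_mm256_avg_epu16() round up, so the round-down averages
// below subtract the rounding bit, which is set exactly when the two inputs
// differ in their least significant bit.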
SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
  return _mm256_sub_epi8(
      _mm256_avg_epu8(a, b),
      _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1)));
}

SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
  return _mm256_sub_epi16(
      _mm256_avg_epu16(a, b),
      _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1)));
}

SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); }

SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); }

SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }

SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); }

SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return _mm256_movemask_epi8(a); }

SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
  return _mm256_blendv_epi8(a, b, c);
}

SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); }

SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }

SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }

SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return _mm256_min_epi32(a, b); }

SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return _mm256_max_epi32(a, b); }

SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
  return _mm256_cmpgt_epi8(a, b);
}

SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
  return _mm256_cmpgt_epi8(b, a);
}

SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
  return _mm256_cmpeq_epi8(a, b);
}

SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
  return _mm256_cmpgt_epi16(a, b);
}

SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
  return _mm256_cmpgt_epi16(b, a);
}

SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
  return _mm256_cmpeq_epi16(a, b);
}

SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
  return _mm256_cmpgt_epi32(a, b);
}

SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
  return _mm256_cmpgt_epi32(b, a);
}

SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
  return _mm256_cmpeq_epi32(a, b);
}

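// AVX2 has no 8-bit shift instructions, so the byte shifts below are
// synthesised from 16-bit shifts: shl/shr mask off the bits that would cross
// into the neighbouring byte, and the arithmetic shift duplicates each byte
// into a 16-bit lane (unpacking a with itself) before shifting by c + 8.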
SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
  return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)),
                          _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)));
}

SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
  return _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> c)),
                          _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)));
}

SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
  __m128i x = _mm_cvtsi32_si128(c + 8);
  return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x),
                            _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x));
}

SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
  return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
  return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
  return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
  return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
  return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
  return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
  return _mm256_sll_epi64(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
  return _mm256_srl_epi64(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
#if defined(__AVX512VL__)
  return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c));
#else
  return v256_from_v128(v128_shr_s64(v256_high_v128(a), c),
                        v128_shr_s64(v256_low_v128(a), c));
#endif
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
// _mm256_slli_si256 works on 128 bit lanes and can't be used
#define v256_shl_n_byte(a, n)                                                \
  ((n) < 16 ? v256_from_v128(                                                \
                  v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \
                  v128_shl_n_byte(v256_low_v128(a), n))                      \
            : _mm256_inserti128_si256(                                       \
                  _mm256_setzero_si256(),                                    \
                  v128_shl_n_byte(v256_low_v128(a), (n)-16), 1))

// _mm256_srli_si256 works on 128 bit lanes and can't be used
#define v256_shr_n_byte(a, n)                                                \
  ((n) < 16                                                                  \
       ? _mm256_alignr_epi8(                                                 \
             _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \
       : ((n) == 16                                                          \
              ? _mm256_permute2x128_si256(_mm256_setzero_si256(), a, 3)      \
              : _mm256_inserti128_si256(                                     \
                    _mm256_setzero_si256(),                                  \
                    v128_align(v256_high_v128(a), v256_high_v128(a), n), 0)))

// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
#define v256_align(a, b, c) \
  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
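// v256_align(a, b, c) thus extracts the 32 bytes starting at byte c of the
// 64-byte concatenation a:b (with b as the low half; c must be a
// compile-time constant in 0..31), e.g. c == 1 yields {b[1..31], a[0]}.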

#define v256_shl_n_8(a, c)                                   \
  _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \
                   _mm256_slli_epi16(a, c))
#define v256_shr_n_u8(a, c)                               \
  _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> (c))), \
                   _mm256_srli_epi16(a, c))
#define v256_shr_n_s8(a, c)                                                  \
  _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \
                     _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8))
#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c)
#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c)
#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c)
#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c)
#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c)
#define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c)
#define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c)
#define v256_shr_n_s64(a, c) \
  v256_shr_s64((a), (c))  // _mm256_srai_epi64 requires AVX512VL
#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))

typedef v256 sad256_internal_u16;

SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { return v256_zero(); }

/* Implementation dependent return value. Result must be finalised with
 * v256_sad_u16_sum(). */
SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
                                             v256 b) {
#if defined(__SSE4_1__)
  v256 t = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b));
#else
  v256 t = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)),
                          v256_xor(b, v256_dup_16(32768)));
  t = v256_sub_16(v256_or(v256_and(b, t), v256_andn(a, t)),
                  v256_or(v256_and(a, t), v256_andn(b, t)));
#endif
  return v256_add_32(
      s, v256_add_32(v256_unpackhi_u16_s32(t), v256_unpacklo_u16_s32(t)));
}

SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
  v128 t = v128_add_32(v256_high_v128(s), v256_low_v128(s));
  return v128_low_u32(t) + v128_low_u32(v128_shr_n_byte(t, 4)) +
         v128_low_u32(v128_shr_n_byte(t, 8)) +
         v128_low_u32(v128_shr_n_byte(t, 12));
}

typedef v256 ssd256_internal_s16;

SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) { return v256_zero(); }

/* Implementation dependent return value. Result must be finalised with
 * v256_ssd_s16_sum(). */
SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
                                             v256 b) {
  v256 d = v256_sub_16(a, b);
  d = v256_madd_s16(d, d);
  return v256_add_64(s, v256_add_64(_mm256_unpackhi_epi32(d, v256_zero()),
                                    _mm256_unpacklo_epi32(d, v256_zero())));
}

SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
  v128 t = v128_add_64(v256_high_v128(s), v256_low_v128(s));
  return v64_u64(v128_low_v64(t)) + v64_u64(v128_high_v64(t));
}

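/* Illustrative usage sketch (an addition for clarity, not part of the aom
   API): sum of squared differences over n contiguous groups of 16 signed
   16-bit samples. The a/b pointers and the count n are hypothetical
   parameters. */
SIMD_INLINE uint64_t v256_example_ssd_s16(const int16_t *a, const int16_t *b,
                                          int n) {
  ssd256_internal_s16 s = v256_ssd_s16_init();
  int i;
  for (i = 0; i < n; i++) {  // 16 samples (one v256) per iteration
    s = v256_ssd_s16(s, v256_load_unaligned(a + i * 16),
                     v256_load_unaligned(b + i * 16));
  }
  return v256_ssd_s16_sum(s);
}
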
#endif

#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_