| // Copyright 2019 Google LLC |
| // SPDX-License-Identifier: Apache-2.0 |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL |
| // operations when compiling for those targets. |
| // External include guard in highway.h - see comment there. |
| |
| // Must come before HWY_DIAGNOSTICS and HWY_COMPILER_GCC_ACTUAL |
| #include "third_party/highway/hwy/base.h" |
| |
| // Avoid uninitialized warnings in GCC's emmintrin.h - see |
| // https://github.com/google/highway/issues/710 and pull/902 |
| HWY_DIAGNOSTICS(push) |
| #if HWY_COMPILER_GCC_ACTUAL |
| HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") |
| HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494, |
| ignored "-Wmaybe-uninitialized") |
| #endif |
| |
| #include <emmintrin.h> |
| #include <stdio.h> |
| #if HWY_TARGET == HWY_SSSE3 |
| #include <tmmintrin.h> // SSSE3 |
| #elif HWY_TARGET <= HWY_SSE4 |
| #include <smmintrin.h> // SSE4 |
| #ifndef HWY_DISABLE_PCLMUL_AES |
| #include <wmmintrin.h> // CLMUL |
| #endif |
| #endif |
| |
| #include "third_party/highway/hwy/ops/shared-inl.h" |
| |
| HWY_BEFORE_NAMESPACE(); |
| namespace hwy { |
| namespace HWY_NAMESPACE { |
| namespace detail { |
| |
| // Enable generic functions for whichever of (f16, bf16) are not supported. |
| #if !HWY_HAVE_FLOAT16 |
| #define HWY_X86_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D) |
| #else |
| #define HWY_X86_IF_EMULATED_D(D) HWY_IF_BF16_D(D) |
| #endif |
| |
| #undef HWY_AVX3_HAVE_F32_TO_BF16C |
| #if HWY_TARGET <= HWY_AVX3_ZEN4 && !HWY_COMPILER_CLANGCL && \ |
| (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 900) && \ |
| !defined(HWY_AVX3_DISABLE_AVX512BF16) |
| #define HWY_AVX3_HAVE_F32_TO_BF16C 1 |
| #else |
| #define HWY_AVX3_HAVE_F32_TO_BF16C 0 |
| #endif |
| |
| #undef HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT |
| #if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64 |
| #define HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT "v" |
| #else |
| #define HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT "x" |
| #endif |
| |
| template <typename T> |
| struct Raw128 { |
| using type = __m128i; |
| }; |
| #if HWY_HAVE_FLOAT16 |
| template <> |
| struct Raw128<float16_t> { |
| using type = __m128h; |
| }; |
| #endif // HWY_HAVE_FLOAT16 |
| template <> |
| struct Raw128<float> { |
| using type = __m128; |
| }; |
| template <> |
| struct Raw128<double> { |
| using type = __m128d; |
| }; |
| |
| } // namespace detail |
| |
| template <typename T, size_t N = 16 / sizeof(T)> |
| class Vec128 { |
| using Raw = typename detail::Raw128<T>::type; |
| |
| public: |
| using PrivateT = T; // only for DFromV |
| static constexpr size_t kPrivateN = N; // only for DFromV |
| |
| // Compound assignment. Only usable if there is a corresponding non-member |
| // binary operator overload. For example, only f32 and f64 support division. |
| HWY_INLINE Vec128& operator*=(const Vec128 other) { |
| return *this = (*this * other); |
| } |
| HWY_INLINE Vec128& operator/=(const Vec128 other) { |
| return *this = (*this / other); |
| } |
| HWY_INLINE Vec128& operator+=(const Vec128 other) { |
| return *this = (*this + other); |
| } |
| HWY_INLINE Vec128& operator-=(const Vec128 other) { |
| return *this = (*this - other); |
| } |
| HWY_INLINE Vec128& operator%=(const Vec128 other) { |
| return *this = (*this % other); |
| } |
| HWY_INLINE Vec128& operator&=(const Vec128 other) { |
| return *this = (*this & other); |
| } |
| HWY_INLINE Vec128& operator|=(const Vec128 other) { |
| return *this = (*this | other); |
| } |
| HWY_INLINE Vec128& operator^=(const Vec128 other) { |
| return *this = (*this ^ other); |
| } |
| |
| Raw raw; |
| }; |
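| |
| // Usage sketch (illustrative; Set and operator& are defined below): |
| // const Simd<uint32_t, 4, 0> d; |
| // auto v = Set(d, 0xF0u); |
| // v &= Set(d, 0x3Cu); // each lane is now 0x30 |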
| |
| template <typename T> |
| using Vec64 = Vec128<T, 8 / sizeof(T)>; |
| |
| template <typename T> |
| using Vec32 = Vec128<T, 4 / sizeof(T)>; |
| |
| template <typename T> |
| using Vec16 = Vec128<T, 2 / sizeof(T)>; |
| |
| namespace detail { |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| |
| // Template arg: sizeof(lane type) |
| template <size_t size> |
| struct RawMask128T {}; |
| template <> |
| struct RawMask128T<1> { |
| using type = __mmask16; |
| }; |
| template <> |
| struct RawMask128T<2> { |
| using type = __mmask8; |
| }; |
| template <> |
| struct RawMask128T<4> { |
| using type = __mmask8; |
| }; |
| template <> |
| struct RawMask128T<8> { |
| using type = __mmask8; |
| }; |
| |
| template <typename T> |
| using RawMask128 = typename RawMask128T<sizeof(T)>::type; |
| |
| #else // AVX2 or earlier |
| |
| template <typename T> |
| using RawMask128 = typename Raw128<T>::type; |
| |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| } // namespace detail |
| |
| template <typename T, size_t N = 16 / sizeof(T)> |
| struct Mask128 { |
| using Raw = typename detail::RawMask128<T>; |
| |
| using PrivateT = T; // only for DFromM |
| static constexpr size_t kPrivateN = N; // only for DFromM |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| static Mask128<T, N> FromBits(uint64_t mask_bits) { |
| return Mask128<T, N>{static_cast<Raw>(mask_bits)}; |
| } |
| #else |
| // Lanes are either FF..FF or 0. |
| #endif |
| |
| Raw raw; |
| }; |
| |
| template <class V> |
| using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>; |
| |
| template <class M> |
| using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>; |
| |
| template <class V> |
| using TFromV = typename V::PrivateT; |
| |
| // ------------------------------ Zero |
| |
| // Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero. |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)> |
| HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { |
| return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()}; |
| } |
| #if HWY_HAVE_FLOAT16 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)> |
| HWY_API Vec128<float16_t, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { |
| return Vec128<float16_t, HWY_MAX_LANES_D(D)>{_mm_setzero_ph()}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API Vec128<float, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { |
| return Vec128<float, HWY_MAX_LANES_D(D)>{_mm_setzero_ps()}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API Vec128<double, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { |
| return Vec128<double, HWY_MAX_LANES_D(D)>{_mm_setzero_pd()}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)> |
| HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { |
| return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()}; |
| } |
| |
| // Using the existing Zero function instead of a dedicated function for |
| // deduction avoids having to forward-declare Vec256 here. |
| template <class D> |
| using VFromD = decltype(Zero(D())); |
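| |
| // For example (illustrative), VFromD<Simd<float, 4, 0>> is Vec128<float, 4>: |
| // const Simd<float, 4, 0> d; |
| // VFromD<decltype(d)> v = Zero(d); // all four lanes are 0.0f |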
| |
| // ------------------------------ BitCast |
| |
| namespace detail { |
| |
| HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; } |
| #if HWY_HAVE_FLOAT16 |
| HWY_INLINE __m128i BitCastToInteger(__m128h v) { return _mm_castph_si128(v); } |
| #endif // HWY_HAVE_FLOAT16 |
| HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); } |
| HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); } |
| |
| #if HWY_AVX3_HAVE_F32_TO_BF16C |
| HWY_INLINE __m128i BitCastToInteger(__m128bh v) { |
| // There is currently no intrinsic (as of GCC 13 and Clang 17) that bit casts |
| // a __m128bh vector to a __m128i vector, so use reinterpret_cast on GCC/Clang |
| // and BitCastScalar on MSVC. |
| |
| #if HWY_COMPILER_GCC || HWY_COMPILER_CLANG |
| // On GCC or Clang, use reinterpret_cast to bit cast a __m128bh to a __m128i |
| return reinterpret_cast<__m128i>(v); |
| #else |
| // On MSVC, use BitCastScalar to bit cast a __m128bh to a __m128i, as MSVC |
| // does not allow reinterpret_cast, static_cast, or a C-style cast between |
| // different SSE/AVX vector types. |
| return BitCastScalar<__m128i>(v); |
| #endif // HWY_COMPILER_GCC || HWY_COMPILER_CLANG |
| } |
| #endif // HWY_AVX3_HAVE_F32_TO_BF16C |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) { |
| return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)}; |
| } |
| |
| // Cannot rely on function overloading because return types differ. |
| template <typename T> |
| struct BitCastFromInteger128 { |
| HWY_INLINE __m128i operator()(__m128i v) { return v; } |
| }; |
| #if HWY_HAVE_FLOAT16 |
| template <> |
| struct BitCastFromInteger128<float16_t> { |
| HWY_INLINE __m128h operator()(__m128i v) { return _mm_castsi128_ph(v); } |
| }; |
| #endif // HWY_HAVE_FLOAT16 |
| template <> |
| struct BitCastFromInteger128<float> { |
| HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); } |
| }; |
| template <> |
| struct BitCastFromInteger128<double> { |
| HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); } |
| }; |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, |
| Vec128<uint8_t, D().MaxBytes()> v) { |
| return VFromD<D>{BitCastFromInteger128<TFromD<D>>()(v.raw)}; |
| } |
| |
| } // namespace detail |
| |
| template <class D, typename FromT, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> BitCast(D d, |
| Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) { |
| return detail::BitCastFromByte(d, detail::BitCastToByte(v)); |
| } |
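| |
| // Example (illustrative): reinterpret u32 lanes as f32 lanes of equal size. |
| // const Simd<uint32_t, 4, 0> du; |
| // const Simd<float, 4, 0> df; |
| // const auto bits = Set(du, 0x3F800000u); // bit pattern of 1.0f |
| // const auto ones = BitCast(df, bits); // {1.0f, 1.0f, 1.0f, 1.0f} |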
| |
| // ------------------------------ Set |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { |
| return VFromD<D>{_mm_set1_epi8(static_cast<char>(t))}; // NOLINT |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)> |
| HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { |
| return VFromD<D>{_mm_set1_epi16(static_cast<short>(t))}; // NOLINT |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> |
| HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { |
| return VFromD<D>{_mm_set1_epi32(static_cast<int>(t))}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> |
| HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { |
| return VFromD<D>{_mm_set1_epi64x(static_cast<long long>(t))}; // NOLINT |
| } |
| #if HWY_HAVE_FLOAT16 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)> |
| HWY_API VFromD<D> Set(D /* tag */, float16_t t) { |
| return VFromD<D>{_mm_set1_ph(t)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> Set(D /* tag */, float t) { |
| return VFromD<D>{_mm_set1_ps(t)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> Set(D /* tag */, double t) { |
| return VFromD<D>{_mm_set1_pd(t)}; |
| } |
| |
| // Generic for all vector lengths. |
| template <class D, HWY_X86_IF_EMULATED_D(D)> |
| HWY_API VFromD<D> Set(D df, TFromD<D> t) { |
| const RebindToUnsigned<decltype(df)> du; |
| static_assert(sizeof(TFromD<D>) == 2, "Expecting [b]f16"); |
| uint16_t bits; |
| CopyBytes<2>(&t, &bits); |
| return BitCast(df, Set(du, bits)); |
| } |
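| |
| // Example (illustrative): Set broadcasts one scalar to every lane. |
| // const Simd<uint8_t, 16, 0> d; |
| // const auto v = Set(d, uint8_t{7}); // all 16 lanes equal 7 |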
| |
| // ------------------------------ Undefined |
| |
| HWY_DIAGNOSTICS(push) |
| HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") |
| |
| // Returns a vector with uninitialized elements. |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)> |
| HWY_API VFromD<D> Undefined(D /* tag */) { |
| // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC |
| // generate an XOR instruction. |
| return VFromD<D>{_mm_undefined_si128()}; |
| } |
| #if HWY_HAVE_FLOAT16 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)> |
| HWY_API VFromD<D> Undefined(D /* tag */) { |
| return VFromD<D>{_mm_undefined_ph()}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> Undefined(D /* tag */) { |
| return VFromD<D>{_mm_undefined_ps()}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> Undefined(D /* tag */) { |
| return VFromD<D>{_mm_undefined_pd()}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)> |
| HWY_API VFromD<D> Undefined(D /* tag */) { |
| return VFromD<D>{_mm_undefined_si128()}; |
| } |
| |
| HWY_DIAGNOSTICS(pop) |
| |
| // ------------------------------ GetLane |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API T GetLane(const Vec128<T, N> v) { |
| return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFF); |
| } |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> |
| HWY_API T GetLane(const Vec128<T, N> v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const uint16_t bits = |
| static_cast<uint16_t>(_mm_cvtsi128_si32(BitCast(du, v).raw) & 0xFFFF); |
| return BitCastScalar<T>(bits); |
| } |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> |
| HWY_API T GetLane(const Vec128<T, N> v) { |
| return static_cast<T>(_mm_cvtsi128_si32(v.raw)); |
| } |
| template <size_t N> |
| HWY_API float GetLane(const Vec128<float, N> v) { |
| return _mm_cvtss_f32(v.raw); |
| } |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> |
| HWY_API T GetLane(const Vec128<T, N> v) { |
| #if HWY_ARCH_X86_32 |
| const DFromV<decltype(v)> d; |
| alignas(16) T lanes[2]; |
| Store(v, d, lanes); |
| return lanes[0]; |
| #else |
| return static_cast<T>(_mm_cvtsi128_si64(v.raw)); |
| #endif |
| } |
| template <size_t N> |
| HWY_API double GetLane(const Vec128<double, N> v) { |
| return _mm_cvtsd_f64(v.raw); |
| } |
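| |
| // Example (illustrative): GetLane extracts lane 0 as a scalar. |
| // const Simd<int32_t, 4, 0> d; |
| // const int32_t x = GetLane(Set(d, 5)); // x == 5 |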
| |
| // ------------------------------ ResizeBitCast |
| |
| template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 16), |
| HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { |
| const Repartition<uint8_t, decltype(d)> du8; |
| return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToInteger(v.raw)}); |
| } |
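| |
| // Example (illustrative): view the lower 8 bytes of a u32 vector as one u64 |
| // lane. |
| // const Simd<uint32_t, 4, 0> d32; |
| // const Simd<uint64_t, 1, 0> d64; |
| // const auto lo = ResizeBitCast(d64, Set(d32, 1u)); // 0x0000000100000001 |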
| |
| // ------------------------------ Dup128VecFromValues |
| |
| template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, |
| TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, |
| TFromD<D> t5, TFromD<D> t6, TFromD<D> t7, |
| TFromD<D> t8, TFromD<D> t9, TFromD<D> t10, |
| TFromD<D> t11, TFromD<D> t12, |
| TFromD<D> t13, TFromD<D> t14, |
| TFromD<D> t15) { |
| return VFromD<D>{_mm_setr_epi8( |
| static_cast<char>(t0), static_cast<char>(t1), static_cast<char>(t2), |
| static_cast<char>(t3), static_cast<char>(t4), static_cast<char>(t5), |
| static_cast<char>(t6), static_cast<char>(t7), static_cast<char>(t8), |
| static_cast<char>(t9), static_cast<char>(t10), static_cast<char>(t11), |
| static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14), |
| static_cast<char>(t15))}; |
| } |
| |
| template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, |
| TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, |
| TFromD<D> t5, TFromD<D> t6, |
| TFromD<D> t7) { |
| return VFromD<D>{ |
| _mm_setr_epi16(static_cast<int16_t>(t0), static_cast<int16_t>(t1), |
| static_cast<int16_t>(t2), static_cast<int16_t>(t3), |
| static_cast<int16_t>(t4), static_cast<int16_t>(t5), |
| static_cast<int16_t>(t6), static_cast<int16_t>(t7))}; |
| } |
| |
| // Generic for all vector lengths |
| template <class D, HWY_IF_BF16_D(D)> |
| HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, |
| TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, |
| TFromD<D> t5, TFromD<D> t6, |
| TFromD<D> t7) { |
| const RebindToSigned<decltype(d)> di; |
| return BitCast(d, |
| Dup128VecFromValues( |
| di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1), |
| BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3), |
| BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5), |
| BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7))); |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, |
| TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, |
| TFromD<D> t5, TFromD<D> t6, |
| TFromD<D> t7) { |
| return VFromD<D>{_mm_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7)}; |
| } |
| #else |
| // Generic for all vector lengths when HWY_HAVE_FLOAT16 is 0 |
| template <class D, HWY_IF_F16_D(D)> |
| HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, |
| TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, |
| TFromD<D> t5, TFromD<D> t6, |
| TFromD<D> t7) { |
| const RebindToSigned<decltype(d)> di; |
| return BitCast(d, |
| Dup128VecFromValues( |
| di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1), |
| BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3), |
| BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5), |
| BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7))); |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| |
| template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, |
| TFromD<D> t2, TFromD<D> t3) { |
| return VFromD<D>{ |
| _mm_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1), |
| static_cast<int32_t>(t2), static_cast<int32_t>(t3))}; |
| } |
| |
| template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, |
| TFromD<D> t2, TFromD<D> t3) { |
| return VFromD<D>{_mm_setr_ps(t0, t1, t2, t3)}; |
| } |
| |
| template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) { |
| // Need to use _mm_set_epi64x as there is no _mm_setr_epi64x intrinsic |
| // available |
| return VFromD<D>{ |
| _mm_set_epi64x(static_cast<int64_t>(t1), static_cast<int64_t>(t0))}; |
| } |
| |
| template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) { |
| return VFromD<D>{_mm_setr_pd(t0, t1)}; |
| } |
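| |
| // Example (illustrative): build a vector from per-lane constants. |
| // const Simd<int32_t, 4, 0> d; |
| // const auto v = Dup128VecFromValues(d, 0, 1, 2, 3); // lanes {0, 1, 2, 3} |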
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| namespace detail { |
| |
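| // The following overloads return true if every lane of the raw vector is a |
| // compile-time constant per __builtin_constant_p; the SizeTag is the number |
| // of lanes to check. |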
| template <class RawV> |
| static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec( |
| hwy::SizeTag<1> /* num_of_lanes_tag*/, RawV v) { |
| return __builtin_constant_p(v[0]); |
| } |
| |
| template <class RawV> |
| static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec( |
| hwy::SizeTag<2> /* num_of_lanes_tag*/, RawV v) { |
| return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]); |
| } |
| |
| template <class RawV> |
| static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec( |
| hwy::SizeTag<4> /* num_of_lanes_tag*/, RawV v) { |
| return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) && |
| __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]); |
| } |
| |
| template <class RawV> |
| static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec( |
| hwy::SizeTag<8> /* num_of_lanes_tag*/, RawV v) { |
| return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) && |
| __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) && |
| __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) && |
| __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]); |
| } |
| |
| template <class RawV> |
| static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec( |
| hwy::SizeTag<16> /* num_of_lanes_tag*/, RawV v) { |
| return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) && |
| __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) && |
| __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) && |
| __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) && |
| __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) && |
| __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) && |
| __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) && |
| __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]); |
| } |
| |
| #if HWY_TARGET <= HWY_AVX2 |
| template <class RawV> |
| static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec( |
| hwy::SizeTag<32> /* num_of_lanes_tag*/, RawV v) { |
| return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) && |
| __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) && |
| __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) && |
| __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) && |
| __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) && |
| __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) && |
| __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) && |
| __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]) && |
| __builtin_constant_p(v[16]) && __builtin_constant_p(v[17]) && |
| __builtin_constant_p(v[18]) && __builtin_constant_p(v[19]) && |
| __builtin_constant_p(v[20]) && __builtin_constant_p(v[21]) && |
| __builtin_constant_p(v[22]) && __builtin_constant_p(v[23]) && |
| __builtin_constant_p(v[24]) && __builtin_constant_p(v[25]) && |
| __builtin_constant_p(v[26]) && __builtin_constant_p(v[27]) && |
| __builtin_constant_p(v[28]) && __builtin_constant_p(v[29]) && |
| __builtin_constant_p(v[30]) && __builtin_constant_p(v[31]); |
| } |
| #endif |
| |
| template <size_t kNumOfLanes, class V> |
| static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantX86Vec( |
| hwy::SizeTag<kNumOfLanes> num_of_lanes_tag, V v) { |
| using T = TFromV<V>; |
| #if HWY_HAVE_FLOAT16 && HWY_HAVE_SCALAR_F16_TYPE |
| using F16VecLaneT = hwy::float16_t::Native; |
| #else |
| using F16VecLaneT = uint16_t; |
| #endif |
| using RawVecLaneT = If<hwy::IsSame<T, hwy::float16_t>(), F16VecLaneT, |
| If<hwy::IsSame<T, hwy::bfloat16_t>(), uint16_t, T>>; |
| |
| // Suppress the -Wignored-attributes warning that is emitted by |
| // RemoveCvRef<decltype(v.raw)> with GCC |
| HWY_DIAGNOSTICS(push) |
| HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes") |
| typedef RawVecLaneT GccRawVec |
| __attribute__((__vector_size__(sizeof(RemoveCvRef<decltype(v.raw)>)))); |
| HWY_DIAGNOSTICS(pop) |
| |
| return IsConstantRawX86Vec(num_of_lanes_tag, |
| reinterpret_cast<GccRawVec>(v.raw)); |
| } |
| |
| template <class TTo, class V> |
| static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantX86VecForF2IConv(V v) { |
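| // The raw vectors backing the source and the result span at least 16 bytes |
| // and can therefore hold more lanes than V; only the lanes present in both |
| // the raw source and raw result vectors are checked for constness. |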
| constexpr size_t kNumOfLanesInRawSrcVec = |
| HWY_MAX(HWY_MAX_LANES_V(V), 16 / sizeof(TFromV<V>)); |
| constexpr size_t kNumOfLanesInRawResultVec = |
| HWY_MAX(HWY_MAX_LANES_V(V), 16 / sizeof(TTo)); |
| constexpr size_t kNumOfLanesToCheck = |
| HWY_MIN(kNumOfLanesInRawSrcVec, kNumOfLanesInRawResultVec); |
| |
| return IsConstantX86Vec(hwy::SizeTag<kNumOfLanesToCheck>(), v); |
| } |
| |
| } // namespace detail |
| #endif // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| |
| // ================================================== LOGICAL |
| |
| // ------------------------------ And |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) { |
| const DFromV<decltype(a)> d; // for float16_t |
| const RebindToUnsigned<decltype(d)> du; |
| return BitCast(d, VFromD<decltype(du)>{ |
| _mm_and_si128(BitCast(du, a).raw, BitCast(du, b).raw)}); |
| } |
| template <size_t N> |
| HWY_API Vec128<float, N> And(Vec128<float, N> a, Vec128<float, N> b) { |
| return Vec128<float, N>{_mm_and_ps(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> And(Vec128<double, N> a, Vec128<double, N> b) { |
| return Vec128<double, N>{_mm_and_pd(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ AndNot |
| |
| // Returns ~not_mask & mask. |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) { |
| const DFromV<decltype(mask)> d; // for float16_t |
| const RebindToUnsigned<decltype(d)> du; |
| return BitCast(d, VFromD<decltype(du)>{_mm_andnot_si128( |
| BitCast(du, not_mask).raw, BitCast(du, mask).raw)}); |
| } |
| template <size_t N> |
| HWY_API Vec128<float, N> AndNot(Vec128<float, N> not_mask, |
| Vec128<float, N> mask) { |
| return Vec128<float, N>{_mm_andnot_ps(not_mask.raw, mask.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> AndNot(Vec128<double, N> not_mask, |
| Vec128<double, N> mask) { |
| return Vec128<double, N>{_mm_andnot_pd(not_mask.raw, mask.raw)}; |
| } |
| |
| // ------------------------------ Or |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) { |
| const DFromV<decltype(a)> d; // for float16_t |
| const RebindToUnsigned<decltype(d)> du; |
| return BitCast(d, VFromD<decltype(du)>{ |
| _mm_or_si128(BitCast(du, a).raw, BitCast(du, b).raw)}); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> Or(Vec128<float, N> a, Vec128<float, N> b) { |
| return Vec128<float, N>{_mm_or_ps(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Or(Vec128<double, N> a, Vec128<double, N> b) { |
| return Vec128<double, N>{_mm_or_pd(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ Xor |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) { |
| const DFromV<decltype(a)> d; // for float16_t |
| const RebindToUnsigned<decltype(d)> du; |
| return BitCast(d, VFromD<decltype(du)>{ |
| _mm_xor_si128(BitCast(du, a).raw, BitCast(du, b).raw)}); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> Xor(Vec128<float, N> a, Vec128<float, N> b) { |
| return Vec128<float, N>{_mm_xor_ps(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Xor(Vec128<double, N> a, Vec128<double, N> b) { |
| return Vec128<double, N>{_mm_xor_pd(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ Not |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Not(const Vec128<T, N> v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| using VU = VFromD<decltype(du)>; |
| #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN |
| const __m128i vu = BitCast(du, v).raw; |
| return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)}); |
| #else |
| return Xor(v, BitCast(d, VU{_mm_set1_epi32(-1)})); |
| #endif |
| } |
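| |
| // The _mm_ternarylogic_epi* immediate encodes a 3-input truth table: for each |
| // bit position, the input bits (a, b, c) form an index i = 4*a + 2*b + c and |
| // the output is bit i of the immediate. Thus 0x55 above (with a = b = c) is |
| // NOT, while 0x96 = a ^ b ^ c, 0xFE = a | b | c, 0xF8 = a | (b & c) and |
| // 0xCA = a ? b : c in the helpers below. |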
| |
| // ------------------------------ Xor3 |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) { |
| #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN |
| const DFromV<decltype(x1)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| using VU = VFromD<decltype(du)>; |
| const __m128i ret = _mm_ternarylogic_epi64( |
| BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96); |
| return BitCast(d, VU{ret}); |
| #else |
| return Xor(x1, Xor(x2, x3)); |
| #endif |
| } |
| |
| // ------------------------------ Or3 |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) { |
| #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN |
| const DFromV<decltype(o1)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| using VU = VFromD<decltype(du)>; |
| const __m128i ret = _mm_ternarylogic_epi64( |
| BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE); |
| return BitCast(d, VU{ret}); |
| #else |
| return Or(o1, Or(o2, o3)); |
| #endif |
| } |
| |
| // ------------------------------ OrAnd |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) { |
| #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN |
| const DFromV<decltype(o)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| using VU = VFromD<decltype(du)>; |
| const __m128i ret = _mm_ternarylogic_epi64( |
| BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8); |
| return BitCast(d, VU{ret}); |
| #else |
| return Or(o, And(a1, a2)); |
| #endif |
| } |
| |
| // ------------------------------ IfVecThenElse |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes, |
| Vec128<T, N> no) { |
| #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN |
| const DFromV<decltype(no)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| using VU = VFromD<decltype(du)>; |
| return BitCast( |
| d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw, |
| BitCast(du, no).raw, 0xCA)}); |
| #else |
| return IfThenElse(MaskFromVec(mask), yes, no); |
| #endif |
| } |
| |
| // ------------------------------ BitwiseIfThenElse |
| #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN |
| |
| #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE |
| #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE |
| #else |
| #define HWY_NATIVE_BITWISE_IF_THEN_ELSE |
| #endif |
| |
| template <class V> |
| HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { |
| return IfVecThenElse(mask, yes, no); |
| } |
| |
| #endif |
| |
| // ------------------------------ Operator overloads (internal-only if float) |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return And(a, b); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return Or(a, b); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return Xor(a, b); |
| } |
| |
| // ------------------------------ PopulationCount |
| |
| // 8/16 require BITALG, 32/64 require VPOPCNTDQ. |
| #if HWY_TARGET <= HWY_AVX3_DL |
| |
| #ifdef HWY_NATIVE_POPCNT |
| #undef HWY_NATIVE_POPCNT |
| #else |
| #define HWY_NATIVE_POPCNT |
| #endif |
| |
| namespace detail { |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */, |
| Vec128<T, N> v) { |
| return Vec128<T, N>{_mm_popcnt_epi8(v.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */, |
| Vec128<T, N> v) { |
| return Vec128<T, N>{_mm_popcnt_epi16(v.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */, |
| Vec128<T, N> v) { |
| return Vec128<T, N>{_mm_popcnt_epi32(v.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */, |
| Vec128<T, N> v) { |
| return Vec128<T, N>{_mm_popcnt_epi64(v.raw)}; |
| } |
| |
| } // namespace detail |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) { |
| return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v); |
| } |
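| |
| // Example (illustrative): |
| // const Simd<uint32_t, 4, 0> d; |
| // const auto bits = PopulationCount(Set(d, 7u)); // each lane is 3 |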
| |
| #endif // HWY_TARGET <= HWY_AVX3_DL |
| |
| // ================================================== SIGN |
| |
| // ------------------------------ Neg |
| |
| // Tag dispatch instead of SFINAE for MSVC 2017 compatibility |
| namespace detail { |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> Neg(hwy::FloatTag /*tag*/, const Vec128<T, N> v) { |
| return Xor(v, SignBit(DFromV<decltype(v)>())); |
| } |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> Neg(hwy::SpecialTag /*tag*/, const Vec128<T, N> v) { |
| return Xor(v, SignBit(DFromV<decltype(v)>())); |
| } |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> Neg(hwy::SignedTag /*tag*/, const Vec128<T, N> v) { |
| return Zero(DFromV<decltype(v)>()) - v; |
| } |
| |
| } // namespace detail |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> Neg(const Vec128<T, N> v) { |
| return detail::Neg(hwy::TypeTag<T>(), v); |
| } |
| |
| // ------------------------------ Floating-point Abs |
| // Generic for all vector lengths |
| template <class V, HWY_IF_FLOAT(TFromV<V>)> |
| HWY_API V Abs(V v) { |
| const DFromV<decltype(v)> d; |
| const RebindToSigned<decltype(d)> di; |
| using TI = TFromD<decltype(di)>; |
| return v & BitCast(d, Set(di, static_cast<TI>(~SignMask<TI>()))); |
| } |
| |
| // ------------------------------ CopySign |
| // Generic for all vector lengths. |
| template <class V> |
| HWY_API V CopySign(const V magn, const V sign) { |
| static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point"); |
| |
| const DFromV<decltype(magn)> d; |
| const auto msb = SignBit(d); |
| |
| // Truth table for msb, magn, sign | bitwise msb ? sign : magn |
| // 0 0 0 | 0 |
| // 0 0 1 | 0 |
| // 0 1 0 | 1 |
| // 0 1 1 | 1 |
| // 1 0 0 | 0 |
| // 1 0 1 | 1 |
| // 1 1 0 | 0 |
| // 1 1 1 | 1 |
| return BitwiseIfThenElse(msb, sign, magn); |
| } |
| |
| // ------------------------------ CopySignToAbs |
| // Generic for all vector lengths. |
| template <class V> |
| HWY_API V CopySignToAbs(const V abs, const V sign) { |
| const DFromV<decltype(abs)> d; |
| return OrAnd(abs, SignBit(d), sign); |
| } |
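| |
| // Example (illustrative): |
| // const Simd<float, 4, 0> d; |
| // const auto r = CopySign(Set(d, 1.5f), Set(d, -0.0f)); // each lane is -1.5f |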
| |
| // ================================================== MASK |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| // ------------------------------ MaskFromVec |
| |
| namespace detail { |
| |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/, |
| const Vec128<T, N> v) { |
| return Mask128<T, N>{_mm_movepi8_mask(v.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/, |
| const Vec128<T, N> v) { |
| return Mask128<T, N>{_mm_movepi16_mask(v.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/, |
| const Vec128<T, N> v) { |
| return Mask128<T, N>{_mm_movepi32_mask(v.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/, |
| const Vec128<T, N> v) { |
| return Mask128<T, N>{_mm_movepi64_mask(v.raw)}; |
| } |
| |
| } // namespace detail |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) { |
| return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v); |
| } |
| // There do not seem to be native floating-point versions of these instructions. |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Mask128<float16_t, N> MaskFromVec(const Vec128<float16_t, N> v) { |
| const RebindToSigned<DFromV<decltype(v)>> di; |
| return Mask128<float16_t, N>{MaskFromVec(BitCast(di, v)).raw}; |
| } |
| #endif |
| template <size_t N> |
| HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) { |
| const RebindToSigned<DFromV<decltype(v)>> di; |
| return Mask128<float, N>{MaskFromVec(BitCast(di, v)).raw}; |
| } |
| template <size_t N> |
| HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) { |
| const RebindToSigned<DFromV<decltype(v)>> di; |
| return Mask128<double, N>{MaskFromVec(BitCast(di, v)).raw}; |
| } |
| |
| template <class D> |
| using MFromD = decltype(MaskFromVec(VFromD<D>())); |
| |
| // ------------------------------ MaskFalse (MFromD) |
| |
| #ifdef HWY_NATIVE_MASK_FALSE |
| #undef HWY_NATIVE_MASK_FALSE |
| #else |
| #define HWY_NATIVE_MASK_FALSE |
| #endif |
| |
| // Generic for all vector lengths |
| template <class D> |
| HWY_API MFromD<D> MaskFalse(D /*d*/) { |
| return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(0)}; |
| } |
| |
| // ------------------------------ IsNegative (MFromD) |
| #ifdef HWY_NATIVE_IS_NEGATIVE |
| #undef HWY_NATIVE_IS_NEGATIVE |
| #else |
| #define HWY_NATIVE_IS_NEGATIVE |
| #endif |
| |
| // Generic for all vector lengths |
| template <class V, HWY_IF_NOT_UNSIGNED_V(V)> |
| HWY_API MFromD<DFromV<V>> IsNegative(V v) { |
| return MaskFromVec(v); |
| } |
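| |
| // Example (illustrative): |
| // const Simd<int32_t, 4, 0> d; |
| // const auto m = IsNegative(Set(d, -1)); // true in all lanes |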
| |
| // ------------------------------ PromoteMaskTo (MFromD) |
| |
| #ifdef HWY_NATIVE_PROMOTE_MASK_TO |
| #undef HWY_NATIVE_PROMOTE_MASK_TO |
| #else |
| #define HWY_NATIVE_PROMOTE_MASK_TO |
| #endif |
| |
| // AVX3 PromoteMaskTo is generic for all vector lengths |
| template <class DTo, class DFrom, |
| HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>)), |
| class DFrom_2 = Rebind<TFromD<DFrom>, DTo>, |
| hwy::EnableIf<IsSame<MFromD<DFrom>, MFromD<DFrom_2>>()>* = nullptr> |
| HWY_API MFromD<DTo> PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, |
| MFromD<DFrom> m) { |
| return MFromD<DTo>{static_cast<decltype(MFromD<DTo>().raw)>(m.raw)}; |
| } |
| |
| // ------------------------------ DemoteMaskTo (MFromD) |
| |
| #ifdef HWY_NATIVE_DEMOTE_MASK_TO |
| #undef HWY_NATIVE_DEMOTE_MASK_TO |
| #else |
| #define HWY_NATIVE_DEMOTE_MASK_TO |
| #endif |
| |
| // AVX3 DemoteMaskTo is generic for all vector lengths |
| template <class DTo, class DFrom, |
| HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) - 1), |
| class DFrom_2 = Rebind<TFromD<DFrom>, DTo>, |
| hwy::EnableIf<IsSame<MFromD<DFrom>, MFromD<DFrom_2>>()>* = nullptr> |
| HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, |
| MFromD<DFrom> m) { |
| return MFromD<DTo>{static_cast<decltype(MFromD<DTo>().raw)>(m.raw)}; |
| } |
| |
| // ------------------------------ CombineMasks (MFromD) |
| |
| #ifdef HWY_NATIVE_COMBINE_MASKS |
| #undef HWY_NATIVE_COMBINE_MASKS |
| #else |
| #define HWY_NATIVE_COMBINE_MASKS |
| #endif |
| |
| // For Clang and GCC, mask intrinsics (KORTEST etc.) were only added in GCC 7 |
| // and Clang 8. |
| #if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS) |
| #if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \ |
| HWY_COMPILER_CLANG >= 800 |
| #define HWY_COMPILER_HAS_MASK_INTRINSICS 1 |
| #else |
| #define HWY_COMPILER_HAS_MASK_INTRINSICS 0 |
| #endif |
| #endif // HWY_COMPILER_HAS_MASK_INTRINSICS |
| |
| template <class D, HWY_IF_LANES_D(D, 2)> |
| HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi, |
| MFromD<Half<D>> lo) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| const __mmask8 combined_mask = _kor_mask8( |
| _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 1), |
| _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(1))); |
| #else |
| const auto combined_mask = |
| (static_cast<unsigned>(hi.raw) << 1) | (lo.raw & 1); |
| #endif |
| |
| return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)}; |
| } |
| |
| template <class D, HWY_IF_LANES_D(D, 4)> |
| HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi, |
| MFromD<Half<D>> lo) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| const __mmask8 combined_mask = _kor_mask8( |
| _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 2), |
| _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(3))); |
| #else |
| const auto combined_mask = |
| (static_cast<unsigned>(hi.raw) << 2) | (lo.raw & 3); |
| #endif |
| |
| return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)}; |
| } |
| |
| template <class D, HWY_IF_LANES_D(D, 8)> |
| HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi, |
| MFromD<Half<D>> lo) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| const __mmask8 combined_mask = _kor_mask8( |
| _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 4), |
| _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(15))); |
| #else |
| const auto combined_mask = |
| (static_cast<unsigned>(hi.raw) << 4) | (lo.raw & 15u); |
| #endif |
| |
| return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)}; |
| } |
| |
| template <class D, HWY_IF_LANES_D(D, 16)> |
| HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi, |
| MFromD<Half<D>> lo) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| const __mmask16 combined_mask = _mm512_kunpackb( |
| static_cast<__mmask16>(hi.raw), static_cast<__mmask16>(lo.raw)); |
| #else |
| const auto combined_mask = |
| ((static_cast<unsigned>(hi.raw) << 8) | (lo.raw & 0xFFu)); |
| #endif |
| |
| return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)}; |
| } |
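| |
| // Example (illustrative): for an 8-lane D, CombineMasks concatenates the two |
| // 4-lane half masks, with lo occupying the lower bit positions: |
| // combined.raw == (hi.raw << 4) | (lo.raw & 0xF) |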
| |
| // ------------------------------ LowerHalfOfMask (MFromD) |
| |
| #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK |
| #undef HWY_NATIVE_LOWER_HALF_OF_MASK |
| #else |
| #define HWY_NATIVE_LOWER_HALF_OF_MASK |
| #endif |
| |
| // Generic for all vector lengths |
| template <class D> |
| HWY_API MFromD<D> LowerHalfOfMask(D d, MFromD<Twice<D>> m) { |
| using RawM = decltype(MFromD<D>().raw); |
| constexpr size_t kN = MaxLanes(d); |
| constexpr size_t kNumOfBitsInRawMask = sizeof(RawM) * 8; |
| |
| MFromD<D> result_mask{static_cast<RawM>(m.raw)}; |
| |
| if (kN < kNumOfBitsInRawMask) { |
| result_mask = |
| And(result_mask, MFromD<D>{static_cast<RawM>((1ULL << kN) - 1)}); |
| } |
| |
| return result_mask; |
| } |
| |
| // ------------------------------ UpperHalfOfMask (MFromD) |
| |
| #ifdef HWY_NATIVE_UPPER_HALF_OF_MASK |
| #undef HWY_NATIVE_UPPER_HALF_OF_MASK |
| #else |
| #define HWY_NATIVE_UPPER_HALF_OF_MASK |
| #endif |
| |
| template <class D, HWY_IF_LANES_D(D, 1)> |
| HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 1); |
| #else |
| const auto shifted_mask = static_cast<unsigned>(m.raw) >> 1; |
| #endif |
| |
| return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)}; |
| } |
| |
| template <class D, HWY_IF_LANES_D(D, 2)> |
| HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 2); |
| #else |
| const auto shifted_mask = static_cast<unsigned>(m.raw) >> 2; |
| #endif |
| |
| return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)}; |
| } |
| |
| template <class D, HWY_IF_LANES_D(D, 4)> |
| HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 4); |
| #else |
| const auto shifted_mask = static_cast<unsigned>(m.raw) >> 4; |
| #endif |
| |
| return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)}; |
| } |
| |
| template <class D, HWY_IF_LANES_D(D, 8)> |
| HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| const auto shifted_mask = _kshiftri_mask16(static_cast<__mmask16>(m.raw), 8); |
| #else |
| const auto shifted_mask = static_cast<unsigned>(m.raw) >> 8; |
| #endif |
| |
| return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)}; |
| } |
| |
| // ------------------------------ OrderedDemote2MasksTo (MFromD, CombineMasks) |
| |
| #ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO |
| #undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO |
| #else |
| #define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO |
| #endif |
| |
| // Generic for all vector lengths |
| template <class DTo, class DFrom, |
| HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) / 2), |
| class DTo_2 = Repartition<TFromD<DTo>, DFrom>, |
| hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DTo_2>>()>* = nullptr> |
| HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/, |
| MFromD<DFrom> a, MFromD<DFrom> b) { |
| using MH = MFromD<Half<DTo>>; |
| using RawMH = decltype(MH().raw); |
| |
| return CombineMasks(d_to, MH{static_cast<RawMH>(b.raw)}, |
| MH{static_cast<RawMH>(a.raw)}); |
| } |
| |
| // ------------------------------ Slide mask up/down |
| #ifdef HWY_NATIVE_SLIDE_MASK |
| #undef HWY_NATIVE_SLIDE_MASK |
| #else |
| #define HWY_NATIVE_SLIDE_MASK |
| #endif |
| |
| template <class D, HWY_IF_LANES_LE_D(D, 8)> |
| HWY_API MFromD<D> SlideMask1Up(D d, MFromD<D> m) { |
| using RawM = decltype(MFromD<D>().raw); |
| constexpr size_t kN = MaxLanes(d); |
| constexpr unsigned kValidLanesMask = (1u << kN) - 1u; |
| |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| MFromD<D> result_mask{ |
| static_cast<RawM>(_kshiftli_mask8(static_cast<__mmask8>(m.raw), 1))}; |
| |
| if (kN < 8) { |
| result_mask = |
| And(result_mask, MFromD<D>{static_cast<RawM>(kValidLanesMask)}); |
| } |
| #else |
| MFromD<D> result_mask{ |
| static_cast<RawM>((static_cast<unsigned>(m.raw) << 1) & kValidLanesMask)}; |
| #endif |
| |
| return result_mask; |
| } |
| |
| template <class D, HWY_IF_LANES_D(D, 16)> |
| HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) { |
| using RawM = decltype(MFromD<D>().raw); |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return MFromD<D>{ |
| static_cast<RawM>(_kshiftli_mask16(static_cast<__mmask16>(m.raw), 1))}; |
| #else |
| return MFromD<D>{static_cast<RawM>(static_cast<unsigned>(m.raw) << 1)}; |
| #endif |
| } |
| |
| template <class D, HWY_IF_LANES_LE_D(D, 8)> |
| HWY_API MFromD<D> SlideMask1Down(D d, MFromD<D> m) { |
| using RawM = decltype(MFromD<D>().raw); |
| constexpr size_t kN = MaxLanes(d); |
| constexpr unsigned kValidLanesMask = (1u << kN) - 1u; |
| |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| if (kN < 8) { |
| m = And(m, MFromD<D>{static_cast<RawM>(kValidLanesMask)}); |
| } |
| |
| return MFromD<D>{ |
| static_cast<RawM>(_kshiftri_mask8(static_cast<__mmask8>(m.raw), 1))}; |
| #else |
| return MFromD<D>{ |
| static_cast<RawM>((static_cast<unsigned>(m.raw) & kValidLanesMask) >> 1)}; |
| #endif |
| } |
| |
| template <class D, HWY_IF_LANES_D(D, 16)> |
| HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) { |
| using RawM = decltype(MFromD<D>().raw); |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return MFromD<D>{ |
| static_cast<RawM>(_kshiftri_mask16(static_cast<__mmask16>(m.raw), 1))}; |
| #else |
| return MFromD<D>{ |
| static_cast<RawM>((static_cast<unsigned>(m.raw) & 0xFFFFu) >> 1)}; |
| #endif |
| } |
| |
| // Generic for all vector lengths |
| template <class D> |
| HWY_API MFromD<D> SlideMaskUpLanes(D d, MFromD<D> m, size_t amt) { |
| using RawM = decltype(MFromD<D>().raw); |
| constexpr size_t kN = MaxLanes(d); |
| constexpr uint64_t kValidLanesMask = |
| static_cast<uint64_t>(((kN < 64) ? (1ULL << kN) : 0ULL) - 1ULL); |
| |
| return MFromD<D>{static_cast<RawM>( |
| (static_cast<uint64_t>(m.raw) << (amt & 63)) & kValidLanesMask)}; |
| } |
| |
| // Generic for all vector lengths |
| template <class D> |
| HWY_API MFromD<D> SlideMaskDownLanes(D d, MFromD<D> m, size_t amt) { |
| using RawM = decltype(MFromD<D>().raw); |
| constexpr size_t kN = MaxLanes(d); |
| constexpr uint64_t kValidLanesMask = |
| static_cast<uint64_t>(((kN < 64) ? (1ULL << kN) : 0ULL) - 1ULL); |
| |
| return MFromD<D>{static_cast<RawM>( |
| (static_cast<uint64_t>(m.raw) & kValidLanesMask) >> (amt & 63))}; |
| } |
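| |
| // Example (illustrative): for an 8-lane mask whose bits are 0b00001111, |
| // SlideMask1Up yields 0b00011110 and SlideMask1Down yields 0b00000111. |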
| |
| // ------------------------------ VecFromMask |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) { |
| return Vec128<T, N>{_mm_movm_epi8(v.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI16(T)> |
| HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) { |
| return Vec128<T, N>{_mm_movm_epi16(v.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI32(T)> |
| HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) { |
| return Vec128<T, N>{_mm_movm_epi32(v.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI64(T)> |
| HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) { |
| return Vec128<T, N>{_mm_movm_epi64(v.raw)}; |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> VecFromMask(const Mask128<float16_t, N> v) { |
| return Vec128<float16_t, N>{_mm_castsi128_ph(_mm_movm_epi16(v.raw))}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) { |
| return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) { |
| return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))}; |
| } |
| |
| // Generic for all vector lengths. |
| template <class D> |
| HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) { |
| return VecFromMask(v); |
| } |
| |
| // ------------------------------ RebindMask (MaskFromVec) |
| |
| template <typename TFrom, size_t NFrom, class DTo, HWY_IF_V_SIZE_LE_D(DTo, 16)> |
| HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) { |
| static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size"); |
| return MFromD<DTo>{m.raw}; |
| } |
| |
| // ------------------------------ IfThenElse |
| |
| namespace detail { |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<1> /* tag */, |
| Mask128<T, N> mask, Vec128<T, N> yes, |
| Vec128<T, N> no) { |
| return Vec128<T, N>{_mm_mask_blend_epi8(mask.raw, no.raw, yes.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<2> /* tag */, |
| Mask128<T, N> mask, Vec128<T, N> yes, |
| Vec128<T, N> no) { |
| return Vec128<T, N>{_mm_mask_blend_epi16(mask.raw, no.raw, yes.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<4> /* tag */, |
| Mask128<T, N> mask, Vec128<T, N> yes, |
| Vec128<T, N> no) { |
| return Vec128<T, N>{_mm_mask_blend_epi32(mask.raw, no.raw, yes.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<8> /* tag */, |
| Mask128<T, N> mask, Vec128<T, N> yes, |
| Vec128<T, N> no) { |
| return Vec128<T, N>{_mm_mask_blend_epi64(mask.raw, no.raw, yes.raw)}; |
| } |
| |
| } // namespace detail |
| |
| template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> |
| HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes, |
| Vec128<T, N> no) { |
| return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no); |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> IfThenElse(Mask128<float16_t, N> mask, |
| Vec128<float16_t, N> yes, |
| Vec128<float16_t, N> no) { |
| return Vec128<float16_t, N>{_mm_mask_blend_ph(mask.raw, no.raw, yes.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| |
| // Generic for all vector lengths. |
| template <class V, class D = DFromV<V>, HWY_X86_IF_EMULATED_D(D)> |
| HWY_API V IfThenElse(MFromD<D> mask, V yes, V no) { |
| const RebindToUnsigned<D> du; |
| return BitCast( |
| D(), IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no))); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask, |
| Vec128<float, N> yes, Vec128<float, N> no) { |
| return Vec128<float, N>{_mm_mask_blend_ps(mask.raw, no.raw, yes.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask, |
| Vec128<double, N> yes, |
| Vec128<double, N> no) { |
| return Vec128<double, N>{_mm_mask_blend_pd(mask.raw, no.raw, yes.raw)}; |
| } |
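| |
| // Example (illustrative): |
| // const Simd<float, 4, 0> d; |
| // const auto m = IsNegative(Set(d, -1.0f)); // all-true mask |
| // const auto r = IfThenElse(m, Set(d, 1.0f), Zero(d)); // all lanes are 1.0f |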
| |
| namespace detail { |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<1> /* tag */, |
| Mask128<T, N> mask, Vec128<T, N> yes) { |
| return Vec128<T, N>{_mm_maskz_mov_epi8(mask.raw, yes.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<2> /* tag */, |
| Mask128<T, N> mask, Vec128<T, N> yes) { |
| return Vec128<T, N>{_mm_maskz_mov_epi16(mask.raw, yes.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<4> /* tag */, |
| Mask128<T, N> mask, Vec128<T, N> yes) { |
| return Vec128<T, N>{_mm_maskz_mov_epi32(mask.raw, yes.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<8> /* tag */, |
| Mask128<T, N> mask, Vec128<T, N> yes) { |
| return Vec128<T, N>{_mm_maskz_mov_epi64(mask.raw, yes.raw)}; |
| } |
| |
| } // namespace detail |
| |
| template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> |
| HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) { |
| return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> IfThenElseZero(Mask128<float, N> mask, |
| Vec128<float, N> yes) { |
| return Vec128<float, N>{_mm_maskz_mov_ps(mask.raw, yes.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<double, N> IfThenElseZero(Mask128<double, N> mask, |
| Vec128<double, N> yes) { |
| return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)}; |
| } |
| |
| // Generic for all vector lengths. |
| template <class V, class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)> |
| HWY_API V IfThenElseZero(MFromD<D> mask, V yes) { |
| const RebindToUnsigned<D> du; |
| return BitCast(D(), IfThenElseZero(RebindMask(du, mask), BitCast(du, yes))); |
| } |
| |
| namespace detail { |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<1> /* tag */, |
| Mask128<T, N> mask, Vec128<T, N> no) { |
| // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16. |
| return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<2> /* tag */, |
| Mask128<T, N> mask, Vec128<T, N> no) { |
| return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<4> /* tag */, |
| Mask128<T, N> mask, Vec128<T, N> no) { |
| return Vec128<T, N>{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<8> /* tag */, |
| Mask128<T, N> mask, Vec128<T, N> no) { |
| return Vec128<T, N>{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)}; |
| } |
| |
| } // namespace detail |
| |
| template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> |
| HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) { |
| return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> IfThenZeroElse(Mask128<float, N> mask, |
| Vec128<float, N> no) { |
| return Vec128<float, N>{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask, |
| Vec128<double, N> no) { |
| return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)}; |
| } |
| |
| // Generic for all vector lengths. |
| template <class V, class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)> |
| HWY_API V IfThenZeroElse(MFromD<D> mask, V no) { |
| const RebindToUnsigned<D> du; |
| return BitCast(D(), IfThenZeroElse(RebindMask(du, mask), BitCast(du, no))); |
| } |
| |
| // ------------------------------ Mask logical |
| |
| namespace detail { |
| |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> And(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kand_mask16(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask16>(a.raw & b.raw)}; |
| #endif |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> And(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kand_mask8(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)}; |
| #endif |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> And(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kand_mask8(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)}; |
| #endif |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> And(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kand_mask8(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)}; |
| #endif |
| } |
| |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kandn_mask16(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask16>(~a.raw & b.raw)}; |
| #endif |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)}; |
| #endif |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)}; |
| #endif |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)}; |
| #endif |
| } |
| |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kor_mask16(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask16>(a.raw | b.raw)}; |
| #endif |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kor_mask8(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)}; |
| #endif |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kor_mask8(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)}; |
| #endif |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kor_mask8(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)}; |
| #endif |
| } |
| |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kxor_mask16(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask16>(a.raw ^ b.raw)}; |
| #endif |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)}; |
| #endif |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)}; |
| #endif |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)}; |
| #endif |
| } |
| |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<1> /*tag*/, |
| const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kxnor_mask16(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)}; |
| #endif |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<2> /*tag*/, |
| const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{_kxnor_mask8(a.raw, b.raw)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)}; |
| #endif |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<4> /*tag*/, |
| const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)}; |
| #endif |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<8> /*tag*/, |
| const Mask128<T, N> a, |
| const Mask128<T, N> b) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0x3)}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0x3)}; |
| #endif |
| } |
| |
| // UnmaskedNot returns ~m.raw without zeroing out any invalid bits |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_INLINE Mask128<T, N> UnmaskedNot(const Mask128<T, N> m) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{static_cast<__mmask16>(_knot_mask16(m.raw))}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask16>(~m.raw)}; |
| #endif |
| } |
| |
| template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> |
| HWY_INLINE Mask128<T, N> UnmaskedNot(const Mask128<T, N> m) { |
| #if HWY_COMPILER_HAS_MASK_INTRINSICS |
| return Mask128<T, N>{static_cast<__mmask8>(_knot_mask8(m.raw))}; |
| #else |
| return Mask128<T, N>{static_cast<__mmask8>(~m.raw)}; |
| #endif |
| } |
| |
| template <typename T> |
| HWY_INLINE Mask128<T> Not(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) { |
| // sizeof(T) == 1 and N == 16: simply return ~m as all 16 bits of m are valid |
| return UnmaskedNot(m); |
| } |
| template <typename T, size_t N, HWY_IF_LANES_LE(N, 8)> |
| HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> m) { |
| // sizeof(T) == 1 and N <= 8: need to zero out the upper bits of ~m as there |
| // are fewer than 16 valid bits in m |
| |
| // Return (~m) & ((1ull << N) - 1) |
| return AndNot(hwy::SizeTag<1>(), m, Mask128<T, N>::FromBits((1ull << N) - 1)); |
| } |
| template <typename T> |
| HWY_INLINE Mask128<T> Not(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) { |
| // sizeof(T) == 2 and N == 8: simply return ~m as all 8 bits of m are valid |
| return UnmaskedNot(m); |
| } |
| template <typename T, size_t N, HWY_IF_LANES_LE(N, 4)> |
| HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> m) { |
| // sizeof(T) == 2 and N <= 4: need to zero out the upper bits of ~m as there |
| // are fewer than 8 valid bits in m |
| |
| // Return (~m) & ((1ull << N) - 1) |
| return AndNot(hwy::SizeTag<2>(), m, Mask128<T, N>::FromBits((1ull << N) - 1)); |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> m) { |
| // sizeof(T) == 4: need to zero out the upper bits of ~m as there are at most |
| // 4 valid bits in m |
| |
| // Return (~m) & ((1ull << N) - 1) |
| return AndNot(hwy::SizeTag<4>(), m, Mask128<T, N>::FromBits((1ull << N) - 1)); |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> m) { |
| // sizeof(T) == 8: need to zero out the upper bits of ~m as there are at most |
| // 2 valid bits in m |
| |
| // Return (~m) & ((1ull << N) - 1) |
| return AndNot(hwy::SizeTag<8>(), m, Mask128<T, N>::FromBits((1ull << N) - 1)); |
| } |
| |
| } // namespace detail |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) { |
| return detail::And(hwy::SizeTag<sizeof(T)>(), a, b); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) { |
| return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) { |
| return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) { |
| return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> Not(const Mask128<T, N> m) { |
| // Flip only the valid bits |
| return detail::Not(hwy::SizeTag<sizeof(T)>(), m); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) { |
| return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b); |
| } |
| |
| #else // AVX2 or below |
| |
| // ------------------------------ Mask |
| |
| // Mask and Vec are the same (true = FF..FF). |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) { |
| return Mask128<T, N>{v.raw}; |
| } |
| |
| template <class D> |
| using MFromD = decltype(MaskFromVec(VFromD<D>())); |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) { |
| return Vec128<T, N>{v.raw}; |
| } |
| |
| // Generic for all vector lengths. |
| template <class D> |
| HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) { |
| return VecFromMask(v); |
| } |
| |
| #if HWY_TARGET >= HWY_SSSE3 |
| |
| // mask ? yes : no |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes, |
| Vec128<T, N> no) { |
| const auto vmask = VecFromMask(DFromV<decltype(no)>(), mask); |
| return Or(And(vmask, yes), AndNot(vmask, no)); |
| } |
| |
| #else // HWY_TARGET < HWY_SSSE3 |
| |
| // mask ? yes : no |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes, |
| Vec128<T, N> no) { |
| return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask, |
| Vec128<float, N> yes, Vec128<float, N> no) { |
| return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask, |
| Vec128<double, N> yes, |
| Vec128<double, N> no) { |
| return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)}; |
| } |
| |
| #endif // HWY_TARGET >= HWY_SSSE3 |
| |
| // mask ? yes : 0 |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) { |
| return yes & VecFromMask(DFromV<decltype(yes)>(), mask); |
| } |
| |
| // mask ? 0 : no |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) { |
| return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no); |
| } |
| |
| // ------------------------------ Mask logical |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> Not(const Mask128<T, N> m) { |
| const Simd<T, N, 0> d; |
| return MaskFromVec(Not(VecFromMask(d, m))); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) { |
| const Simd<T, N, 0> d; |
| return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) { |
| const Simd<T, N, 0> d; |
| return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) { |
| const Simd<T, N, 0> d; |
| return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) { |
| const Simd<T, N, 0> d; |
| return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) { |
| const Simd<T, N, 0> d; |
| return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); |
| } |
| |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| // ------------------------------ ShiftLeft |
| |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) { |
| return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)}; |
| } |
| |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) { |
| return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)}; |
| } |
| |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) { |
| return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)}; |
| } |
| |
| template <int kBits, size_t N> |
| HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) { |
| return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)}; |
| } |
| template <int kBits, size_t N> |
| HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) { |
| return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)}; |
| } |
| template <int kBits, size_t N> |
| HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) { |
| return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)}; |
| } |
| |
| #if HWY_TARGET <= HWY_AVX3_DL |
| |
| namespace detail { |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> GaloisAffine( |
| Vec128<T, N> v, VFromD<Repartition<uint64_t, Simd<T, N, 0>>> matrix) { |
| return Vec128<T, N>{_mm_gf2p8affine_epi64_epi8(v.raw, matrix.raw, 0)}; |
| } |
| } // namespace detail |
| |
| #else // HWY_TARGET > HWY_AVX3_DL |
| |
| template <int kBits, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) { |
| const DFromV<decltype(v)> d8; |
| // Use raw instead of BitCast to support N=1. |
| const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw}; |
| return kBits == 1 |
| ? (v + v) |
| : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF))); |
| } |
| |
| #endif // HWY_TARGET > HWY_AVX3_DL |
| |
| // ------------------------------ ShiftRight |
| |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) { |
| return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)}; |
| } |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) { |
| return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)}; |
| } |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) { |
| return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)}; |
| } |
| |
| template <int kBits, size_t N> |
| HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) { |
| return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)}; |
| } |
| template <int kBits, size_t N> |
| HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) { |
| return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)}; |
| } |
| |
| #if HWY_TARGET > HWY_AVX3_DL |
| |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) { |
| const DFromV<decltype(v)> d8; |
| // Use raw instead of BitCast to support N=1. |
| const Vec128<uint8_t, N> shifted{ |
| ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw}; |
  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> kBits));
| } |
| |
| template <int kBits, size_t N> |
| HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) { |
| const DFromV<decltype(v)> di; |
| const RebindToUnsigned<decltype(di)> du; |
| const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v))); |
| const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); |
| return (shifted ^ shifted_sign) - shifted_sign; |
| } |
| |
| #endif // HWY_TARGET > HWY_AVX3_DL |
| |
| // i64 is implemented after BroadcastSignBit. |
| |
| // ================================================== MEMORY (1) |
| |
// Clang static analysis claims the memory immediately after a partial vector
// store is uninitialized, and also flags the input to partial loads (at least
// for loadl_pd) as "garbage". This is a false alarm because msan does not
// raise errors. We work around it by using CopyBytes instead of intrinsics,
// but only while the analyzer is running, so as not to risk worse code
// generation in normal builds. Unfortunately __clang_analyzer__ was not
// defined for clang-tidy prior to v7.
| #ifndef HWY_SAFE_PARTIAL_LOAD_STORE |
| #if defined(__clang_analyzer__) || \ |
| (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700) |
| #define HWY_SAFE_PARTIAL_LOAD_STORE 1 |
| #else |
| #define HWY_SAFE_PARTIAL_LOAD_STORE 0 |
| #endif |
| #endif // HWY_SAFE_PARTIAL_LOAD_STORE |
| |
| // ------------------------------ Load |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)> |
| HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) { |
| return VFromD<D>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))}; |
| } |
| #if HWY_HAVE_FLOAT16 |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)> |
| HWY_API Vec128<float16_t> Load(D, const float16_t* HWY_RESTRICT aligned) { |
| return Vec128<float16_t>{_mm_load_ph(aligned)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| // Generic for all vector lengths greater than or equal to 16 bytes. |
| template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)> |
| HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) { |
| const RebindToUnsigned<decltype(d)> du; |
| return BitCast(d, Load(du, detail::U16LanePointer(aligned))); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API Vec128<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) { |
| return Vec128<float>{_mm_load_ps(aligned)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API Vec128<double> Load(D /* tag */, const double* HWY_RESTRICT aligned) { |
| return Vec128<double>{_mm_load_pd(aligned)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)> |
| HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) { |
| return VFromD<D>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))}; |
| } |
| #if HWY_HAVE_FLOAT16 |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)> |
| HWY_API Vec128<float16_t> LoadU(D, const float16_t* HWY_RESTRICT p) { |
| return Vec128<float16_t>{_mm_loadu_ph(p)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| // Generic for all vector lengths greater than or equal to 16 bytes. |
| template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)> |
| HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) { |
| const RebindToUnsigned<decltype(d)> du; |
| return BitCast(d, LoadU(du, detail::U16LanePointer(p))); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) { |
| return Vec128<float>{_mm_loadu_ps(p)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API Vec128<double> LoadU(D /* tag */, const double* HWY_RESTRICT p) { |
| return Vec128<double>{_mm_loadu_pd(p)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)> |
| HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) { |
| const RebindToUnsigned<decltype(d)> du; // for float16_t |
| #if HWY_SAFE_PARTIAL_LOAD_STORE |
| __m128i v = _mm_setzero_si128(); |
| CopyBytes<8>(p, &v); // not same size |
| #else |
| const __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p)); |
| #endif |
| return BitCast(d, VFromD<decltype(du)>{v}); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)> |
| HWY_API Vec64<float> Load(D /* tag */, const float* HWY_RESTRICT p) { |
| #if HWY_SAFE_PARTIAL_LOAD_STORE |
| __m128 v = _mm_setzero_ps(); |
| CopyBytes<8>(p, &v); // not same size |
| return Vec64<float>{v}; |
| #else |
| const __m128 hi = _mm_setzero_ps(); |
| return Vec64<float>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))}; |
| #endif |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)> |
| HWY_API Vec64<double> Load(D /* tag */, const double* HWY_RESTRICT p) { |
| #if HWY_SAFE_PARTIAL_LOAD_STORE |
| __m128d v = _mm_setzero_pd(); |
| CopyBytes<8>(p, &v); // not same size |
| return Vec64<double>{v}; |
| #else |
| return Vec64<double>{_mm_load_sd(p)}; |
| #endif |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)> |
| HWY_API Vec32<float> Load(D /* tag */, const float* HWY_RESTRICT p) { |
| #if HWY_SAFE_PARTIAL_LOAD_STORE |
| __m128 v = _mm_setzero_ps(); |
| CopyBytes<4>(p, &v); // not same size |
| return Vec32<float>{v}; |
| #else |
| return Vec32<float>{_mm_load_ss(p)}; |
| #endif |
| } |
| |
| // Any <= 32 bit except <float, 1> |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_NOT_FLOAT3264_D(D)> |
| HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) { |
| const RebindToUnsigned<decltype(d)> du; // for float16_t |
  // Clang's ArgumentPromotionPass seems to break this code: even if we
  // unpoison before SetTableIndices -> LoadU -> Load, the memory is reported
  // as poisoned again.
| detail::MaybeUnpoison(p, Lanes(d)); |
| |
| #if HWY_SAFE_PARTIAL_LOAD_STORE |
| __m128i v = Zero(Full128<TFromD<decltype(du)>>()).raw; |
| CopyBytes<d.MaxBytes()>(p, &v); // not same size as VFromD |
| #else |
| int32_t bits = 0; |
| CopyBytes<d.MaxBytes()>(p, &bits); // not same size as VFromD |
| const __m128i v = _mm_cvtsi32_si128(bits); |
| #endif |
| return BitCast(d, VFromD<decltype(du)>{v}); |
| } |
| |
| // For < 128 bit, LoadU == Load. |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) { |
| return Load(d, p); |
| } |
| |
| // 128-bit SIMD => nothing to duplicate, same as an unaligned load. |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) { |
| return LoadU(d, p); |
| } |
| |
| // ------------------------------ Store |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)> |
| HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) { |
| _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw); |
| } |
| #if HWY_HAVE_FLOAT16 |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)> |
| HWY_API void Store(Vec128<float16_t> v, D, float16_t* HWY_RESTRICT aligned) { |
| _mm_store_ph(aligned, v.raw); |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| // Generic for all vector lengths greater than or equal to 16 bytes. |
| template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)> |
| HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) { |
| const RebindToUnsigned<decltype(d)> du; |
| Store(BitCast(du, v), du, reinterpret_cast<uint16_t*>(aligned)); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API void Store(Vec128<float> v, D /* tag */, float* HWY_RESTRICT aligned) { |
| _mm_store_ps(aligned, v.raw); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API void Store(Vec128<double> v, D /* tag */, |
| double* HWY_RESTRICT aligned) { |
| _mm_store_pd(aligned, v.raw); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)> |
| HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) { |
| _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw); |
| } |
| #if HWY_HAVE_FLOAT16 |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)> |
| HWY_API void StoreU(Vec128<float16_t> v, D, float16_t* HWY_RESTRICT p) { |
| _mm_storeu_ph(p, v.raw); |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| // Generic for all vector lengths greater than or equal to 16 bytes. |
| template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)> |
| HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) { |
| const RebindToUnsigned<decltype(d)> du; |
| StoreU(BitCast(du, v), du, reinterpret_cast<uint16_t*>(p)); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API void StoreU(Vec128<float> v, D /* tag */, float* HWY_RESTRICT p) { |
| _mm_storeu_ps(p, v.raw); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API void StoreU(Vec128<double> v, D /* tag */, double* HWY_RESTRICT p) { |
| _mm_storeu_pd(p, v.raw); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)> |
| HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) { |
| #if HWY_SAFE_PARTIAL_LOAD_STORE |
| (void)d; |
| CopyBytes<8>(&v, p); // not same size |
| #else |
| const RebindToUnsigned<decltype(d)> du; // for float16_t |
| _mm_storel_epi64(reinterpret_cast<__m128i*>(p), BitCast(du, v).raw); |
| #endif |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)> |
| HWY_API void Store(Vec64<float> v, D /* tag */, float* HWY_RESTRICT p) { |
| #if HWY_SAFE_PARTIAL_LOAD_STORE |
| CopyBytes<8>(&v, p); // not same size |
| #else |
| _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw); |
| #endif |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)> |
| HWY_API void Store(Vec64<double> v, D /* tag */, double* HWY_RESTRICT p) { |
| #if HWY_SAFE_PARTIAL_LOAD_STORE |
| CopyBytes<8>(&v, p); // not same size |
| #else |
| _mm_storel_pd(p, v.raw); |
| #endif |
| } |
| |
| // Any <= 32 bit except <float, 1> |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_NOT_FLOAT3264_D(D)> |
| HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) { |
| CopyBytes<d.MaxBytes()>(&v, p); // not same size |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)> |
| HWY_API void Store(Vec32<float> v, D /* tag */, float* HWY_RESTRICT p) { |
| #if HWY_SAFE_PARTIAL_LOAD_STORE |
| CopyBytes<4>(&v, p); // not same size |
| #else |
| _mm_store_ss(p, v.raw); |
| #endif |
| } |
| |
| // For < 128 bit, StoreU == Store. |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) { |
| Store(v, d, p); |
| } |
| |
| // ================================================== SWIZZLE (1) |
| |
| // ------------------------------ TableLookupBytes |
| template <typename T, size_t N, typename TI, size_t NI> |
| HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes, |
| const Vec128<TI, NI> from) { |
| const DFromV<decltype(from)> d; |
| const Repartition<uint8_t, decltype(d)> du8; |
| |
| const DFromV<decltype(bytes)> d_bytes; |
| const Repartition<uint8_t, decltype(d_bytes)> du8_bytes; |
| #if HWY_TARGET == HWY_SSE2 |
| #if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) |
| typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(16))); |
| (void)d; |
| (void)du8; |
| (void)d_bytes; |
| (void)du8_bytes; |
| return Vec128<TI, NI>{reinterpret_cast<typename detail::Raw128<TI>::type>( |
| __builtin_shuffle(reinterpret_cast<GccU8RawVectType>(bytes.raw), |
| reinterpret_cast<GccU8RawVectType>(from.raw)))}; |
| #else |
| const Full128<uint8_t> du8_full; |
| |
| alignas(16) uint8_t result_bytes[16]; |
| alignas(16) uint8_t u8_bytes[16]; |
| alignas(16) uint8_t from_bytes[16]; |
| |
| Store(Vec128<uint8_t>{BitCast(du8_bytes, bytes).raw}, du8_full, u8_bytes); |
| Store(Vec128<uint8_t>{BitCast(du8, from).raw}, du8_full, from_bytes); |
| |
| for (int i = 0; i < 16; i++) { |
| result_bytes[i] = u8_bytes[from_bytes[i] & 15]; |
| } |
| |
| return BitCast(d, VFromD<decltype(du8)>{Load(du8_full, result_bytes).raw}); |
| #endif |
| #else // SSSE3 or newer |
| return BitCast( |
| d, VFromD<decltype(du8)>{_mm_shuffle_epi8(BitCast(du8_bytes, bytes).raw, |
| BitCast(du8, from).raw)}); |
| #endif |
| } |
| |
| // ------------------------------ TableLookupBytesOr0 |
// For all vector widths; on SSSE3/SSE4/AVX2/AVX3, x86 already zeroes the
// output byte whenever the index has its high bit (>= 0x80) set.
| template <class V, class VI> |
| HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) { |
| #if HWY_TARGET == HWY_SSE2 |
| const DFromV<decltype(from)> d; |
| const Repartition<int8_t, decltype(d)> di8; |
| |
| const auto di8_from = BitCast(di8, from); |
| return BitCast(d, IfThenZeroElse(di8_from < Zero(di8), |
| TableLookupBytes(bytes, di8_from))); |
| #else |
| return TableLookupBytes(bytes, from); |
| #endif |
| } |
| |
| // ------------------------------ Shuffles (ShiftRight, TableLookupBytes) |
| |
| // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant). |
| // Shuffle0321 rotates one lane to the right (the previous least-significant |
| // lane is now most-significant). These could also be implemented via |
| // CombineShiftRightBytes but the shuffle_abcd notation is more convenient. |
| |
| // Swap 32-bit halves in 64-bit halves. |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) { |
| static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); |
| static_assert(N == 2 || N == 4, "Does not make sense for N=1"); |
| return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0xB1)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<float, N> Shuffle2301(const Vec128<float, N> v) { |
| static_assert(N == 2 || N == 4, "Does not make sense for N=1"); |
| return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0xB1)}; |
| } |
| |
| // These are used by generic_ops-inl to implement LoadInterleaved3. As with |
| // Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output |
| // comes from the first argument. |
| namespace detail { |
| |
| template <typename T, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec32<T> ShuffleTwo2301(const Vec32<T> a, const Vec32<T> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> d2; |
| const auto ba = Combine(d2, b, a); |
| #if HWY_TARGET == HWY_SSE2 |
| Vec32<uint16_t> ba_shuffled{ |
| _mm_shufflelo_epi16(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))}; |
| return BitCast(d, Or(ShiftLeft<8>(ba_shuffled), ShiftRight<8>(ba_shuffled))); |
| #else |
| const RebindToUnsigned<decltype(d2)> d2_u; |
| const auto shuffle_idx = |
| BitCast(d2, Dup128VecFromValues(d2_u, 1, 0, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0)); |
| return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw}; |
| #endif |
| } |
| template <typename T, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec64<T> ShuffleTwo2301(const Vec64<T> a, const Vec64<T> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> d2; |
| const auto ba = Combine(d2, b, a); |
| #if HWY_TARGET == HWY_SSE2 |
| Vec64<uint32_t> ba_shuffled{ |
| _mm_shuffle_epi32(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))}; |
| return Vec64<T>{ |
| _mm_shufflelo_epi16(ba_shuffled.raw, _MM_SHUFFLE(2, 3, 0, 1))}; |
| #else |
| const RebindToUnsigned<decltype(d2)> d2_u; |
| const auto shuffle_idx = BitCast( |
| d2, |
| Dup128VecFromValues(d2_u, 0x0302, 0x0100, 0x0f0e, 0x0d0c, 0, 0, 0, 0)); |
| return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw}; |
| #endif |
| } |
| template <typename T, HWY_IF_T_SIZE(T, 4)> |
| HWY_API Vec128<T> ShuffleTwo2301(const Vec128<T> a, const Vec128<T> b) { |
| const DFromV<decltype(a)> d; |
| const RebindToFloat<decltype(d)> df; |
| constexpr int m = _MM_SHUFFLE(2, 3, 0, 1); |
| return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw, |
| BitCast(df, b).raw, m)}); |
| } |
| |
| template <typename T, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec32<T> ShuffleTwo1230(const Vec32<T> a, const Vec32<T> b) { |
| const DFromV<decltype(a)> d; |
| #if HWY_TARGET == HWY_SSE2 |
| const auto zero = Zero(d); |
| const Rebind<int16_t, decltype(d)> di16; |
| const Vec32<int16_t> a_shuffled{_mm_shufflelo_epi16( |
| _mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))}; |
| const Vec32<int16_t> b_shuffled{_mm_shufflelo_epi16( |
| _mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))}; |
| const auto ba_shuffled = Combine(di16, b_shuffled, a_shuffled); |
| return Vec32<T>{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)}; |
| #else |
| const Twice<decltype(d)> d2; |
| const auto ba = Combine(d2, b, a); |
| const RebindToUnsigned<decltype(d2)> d2_u; |
| const auto shuffle_idx = |
| BitCast(d2, Dup128VecFromValues(d2_u, 0, 3, 6, 5, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0)); |
| return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw}; |
| #endif |
| } |
| template <typename T, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec64<T> ShuffleTwo1230(const Vec64<T> a, const Vec64<T> b) { |
| const DFromV<decltype(a)> d; |
| #if HWY_TARGET == HWY_SSE2 |
| const Vec32<T> a_shuffled{ |
| _mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(3, 0, 3, 0))}; |
| const Vec32<T> b_shuffled{ |
| _mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(1, 2, 1, 2))}; |
| return Combine(d, b_shuffled, a_shuffled); |
| #else |
| const Twice<decltype(d)> d2; |
| const auto ba = Combine(d2, b, a); |
| const RebindToUnsigned<decltype(d2)> d2_u; |
| const auto shuffle_idx = BitCast( |
| d2, |
| Dup128VecFromValues(d2_u, 0x0100, 0x0706, 0x0d0c, 0x0b0a, 0, 0, 0, 0)); |
| return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw}; |
| #endif |
| } |
| template <typename T, HWY_IF_T_SIZE(T, 4)> |
| HWY_API Vec128<T> ShuffleTwo1230(const Vec128<T> a, const Vec128<T> b) { |
| const DFromV<decltype(a)> d; |
| const RebindToFloat<decltype(d)> df; |
| constexpr int m = _MM_SHUFFLE(1, 2, 3, 0); |
| return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw, |
| BitCast(df, b).raw, m)}); |
| } |
| |
| template <typename T, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec32<T> ShuffleTwo3012(const Vec32<T> a, const Vec32<T> b) { |
| const DFromV<decltype(a)> d; |
| #if HWY_TARGET == HWY_SSE2 |
| const auto zero = Zero(d); |
| const Rebind<int16_t, decltype(d)> di16; |
| const Vec32<int16_t> a_shuffled{_mm_shufflelo_epi16( |
| _mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))}; |
| const Vec32<int16_t> b_shuffled{_mm_shufflelo_epi16( |
| _mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))}; |
| const auto ba_shuffled = Combine(di16, b_shuffled, a_shuffled); |
| return Vec32<T>{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)}; |
| #else |
| const Twice<decltype(d)> d2; |
| const auto ba = Combine(d2, b, a); |
| const RebindToUnsigned<decltype(d2)> d2_u; |
| const auto shuffle_idx = |
| BitCast(d2, Dup128VecFromValues(d2_u, 2, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0)); |
| return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw}; |
| #endif |
| } |
| template <typename T, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec64<T> ShuffleTwo3012(const Vec64<T> a, const Vec64<T> b) { |
| const DFromV<decltype(a)> d; |
| #if HWY_TARGET == HWY_SSE2 |
| const Vec32<T> a_shuffled{ |
| _mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(1, 2, 1, 2))}; |
| const Vec32<T> b_shuffled{ |
| _mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(3, 0, 3, 0))}; |
| return Combine(d, b_shuffled, a_shuffled); |
| #else |
| const Twice<decltype(d)> d2; |
| const auto ba = Combine(d2, b, a); |
| const RebindToUnsigned<decltype(d2)> d2_u; |
| const auto shuffle_idx = BitCast( |
| d2, |
| Dup128VecFromValues(d2_u, 0x0504, 0x0302, 0x0908, 0x0f0e, 0, 0, 0, 0)); |
| return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw}; |
| #endif |
| } |
| template <typename T, HWY_IF_T_SIZE(T, 4)> |
| HWY_API Vec128<T> ShuffleTwo3012(const Vec128<T> a, const Vec128<T> b) { |
| const DFromV<decltype(a)> d; |
| const RebindToFloat<decltype(d)> df; |
| constexpr int m = _MM_SHUFFLE(3, 0, 1, 2); |
| return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw, |
| BitCast(df, b).raw, m)}); |
| } |
| |
| } // namespace detail |
| |
| // Swap 64-bit halves |
| HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) { |
| return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)}; |
| } |
| HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) { |
| return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)}; |
| } |
| HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) { |
| return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)}; |
| } |
| HWY_API Vec128<uint64_t> Shuffle01(const Vec128<uint64_t> v) { |
| return Vec128<uint64_t>{_mm_shuffle_epi32(v.raw, 0x4E)}; |
| } |
| HWY_API Vec128<int64_t> Shuffle01(const Vec128<int64_t> v) { |
| return Vec128<int64_t>{_mm_shuffle_epi32(v.raw, 0x4E)}; |
| } |
| HWY_API Vec128<double> Shuffle01(const Vec128<double> v) { |
| return Vec128<double>{_mm_shuffle_pd(v.raw, v.raw, 1)}; |
| } |
| |
| // Rotate right 32 bits |
| HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) { |
| return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)}; |
| } |
| HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) { |
| return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)}; |
| } |
| HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) { |
| return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)}; |
| } |
| // Rotate left 32 bits |
| HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) { |
| return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)}; |
| } |
| HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) { |
| return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)}; |
| } |
| HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) { |
| return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)}; |
| } |
| |
| // Reverse |
| HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) { |
| return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)}; |
| } |
| HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) { |
| return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)}; |
| } |
| HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) { |
| return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)}; |
| } |
| |
| // ================================================== COMPARE |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| |
| // Comparisons set a mask bit to 1 if the condition is true, else 0. |
| |
| // ------------------------------ TestBit |
| |
| namespace detail { |
| |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<1> /*tag*/, const Vec128<T, N> v, |
| const Vec128<T, N> bit) { |
| return Mask128<T, N>{_mm_test_epi8_mask(v.raw, bit.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<2> /*tag*/, const Vec128<T, N> v, |
| const Vec128<T, N> bit) { |
| return Mask128<T, N>{_mm_test_epi16_mask(v.raw, bit.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<4> /*tag*/, const Vec128<T, N> v, |
| const Vec128<T, N> bit) { |
| return Mask128<T, N>{_mm_test_epi32_mask(v.raw, bit.raw)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<8> /*tag*/, const Vec128<T, N> v, |
| const Vec128<T, N> bit) { |
| return Mask128<T, N>{_mm_test_epi64_mask(v.raw, bit.raw)}; |
| } |
| |
| } // namespace detail |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) { |
| static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); |
| return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit); |
| } |
| |
| // ------------------------------ Equality |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return Mask128<T, N>{_mm_cmpeq_epi8_mask(a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI16(T)> |
| HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI32(T)> |
| HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI64(T)> |
| HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)}; |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Mask128<float16_t, N> operator==(Vec128<float16_t, N> a, |
| Vec128<float16_t, N> b) { |
| // Work around warnings in the intrinsic definitions (passing -1 as a mask). |
| HWY_DIAGNOSTICS(push) |
| HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") |
| return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)}; |
| HWY_DIAGNOSTICS(pop) |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) { |
| return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)}; |
| } |
| |
| template <size_t N> |
| HWY_API Mask128<double, N> operator==(Vec128<double, N> a, |
| Vec128<double, N> b) { |
| return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)}; |
| } |
| |
| // ------------------------------ Inequality |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return Mask128<T, N>{_mm_cmpneq_epi8_mask(a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI16(T)> |
| HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI32(T)> |
| HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI64(T)> |
| HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)}; |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Mask128<float16_t, N> operator!=(Vec128<float16_t, N> a, |
| Vec128<float16_t, N> b) { |
| // Work around warnings in the intrinsic definitions (passing -1 as a mask). |
| HWY_DIAGNOSTICS(push) |
| HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") |
| return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; |
| HWY_DIAGNOSTICS(pop) |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) { |
| return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; |
| } |
| |
| template <size_t N> |
| HWY_API Mask128<double, N> operator!=(Vec128<double, N> a, |
| Vec128<double, N> b) { |
| return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; |
| } |
| |
| // ------------------------------ Strict inequality |
| |
| // Signed/float < |
| template <size_t N> |
| HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { |
| return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a, |
| Vec128<int16_t, N> b) { |
| return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a, |
| Vec128<int32_t, N> b) { |
| return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int64_t, N> operator>(Vec128<int64_t, N> a, |
| Vec128<int64_t, N> b) { |
| return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Mask128<uint8_t, N> operator>(Vec128<uint8_t, N> a, |
| Vec128<uint8_t, N> b) { |
| return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint16_t, N> operator>(Vec128<uint16_t, N> a, |
| Vec128<uint16_t, N> b) { |
| return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint32_t, N> operator>(Vec128<uint32_t, N> a, |
| Vec128<uint32_t, N> b) { |
| return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint64_t, N> operator>(Vec128<uint64_t, N> a, |
| Vec128<uint64_t, N> b) { |
| return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)}; |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Mask128<float16_t, N> operator>(Vec128<float16_t, N> a, |
| Vec128<float16_t, N> b) { |
| // Work around warnings in the intrinsic definitions (passing -1 as a mask). |
| HWY_DIAGNOSTICS(push) |
| HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") |
| return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)}; |
| HWY_DIAGNOSTICS(pop) |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) { |
| return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) { |
| return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)}; |
| } |
| |
| // ------------------------------ Weak inequality |
| |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Mask128<float16_t, N> operator>=(Vec128<float16_t, N> a, |
| Vec128<float16_t, N> b) { |
| // Work around warnings in the intrinsic definitions (passing -1 as a mask). |
| HWY_DIAGNOSTICS(push) |
| HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") |
| return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)}; |
| HWY_DIAGNOSTICS(pop) |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Mask128<float, N> operator>=(Vec128<float, N> a, Vec128<float, N> b) { |
| return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<double, N> operator>=(Vec128<double, N> a, |
| Vec128<double, N> b) { |
| return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)}; |
| } |
| |
| template <size_t N> |
| HWY_API Mask128<int8_t, N> operator>=(Vec128<int8_t, N> a, |
| Vec128<int8_t, N> b) { |
| return Mask128<int8_t, N>{_mm_cmpge_epi8_mask(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int16_t, N> operator>=(Vec128<int16_t, N> a, |
| Vec128<int16_t, N> b) { |
| return Mask128<int16_t, N>{_mm_cmpge_epi16_mask(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int32_t, N> operator>=(Vec128<int32_t, N> a, |
| Vec128<int32_t, N> b) { |
| return Mask128<int32_t, N>{_mm_cmpge_epi32_mask(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int64_t, N> operator>=(Vec128<int64_t, N> a, |
| Vec128<int64_t, N> b) { |
| return Mask128<int64_t, N>{_mm_cmpge_epi64_mask(a.raw, b.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Mask128<uint8_t, N> operator>=(Vec128<uint8_t, N> a, |
| Vec128<uint8_t, N> b) { |
| return Mask128<uint8_t, N>{_mm_cmpge_epu8_mask(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint16_t, N> operator>=(Vec128<uint16_t, N> a, |
| Vec128<uint16_t, N> b) { |
| return Mask128<uint16_t, N>{_mm_cmpge_epu16_mask(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint32_t, N> operator>=(Vec128<uint32_t, N> a, |
| Vec128<uint32_t, N> b) { |
| return Mask128<uint32_t, N>{_mm_cmpge_epu32_mask(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint64_t, N> operator>=(Vec128<uint64_t, N> a, |
| Vec128<uint64_t, N> b) { |
| return Mask128<uint64_t, N>{_mm_cmpge_epu64_mask(a.raw, b.raw)}; |
| } |
| |
| #else // AVX2 or below |
| |
| // Comparisons fill a lane with 1-bits if the condition is true, else 0. |
| |
| template <class DTo, typename TFrom, size_t NFrom, HWY_IF_V_SIZE_LE_D(DTo, 16)> |
| HWY_API MFromD<DTo> RebindMask(DTo dto, Mask128<TFrom, NFrom> m) { |
| static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size"); |
| const Simd<TFrom, NFrom, 0> d; |
| return MaskFromVec(BitCast(dto, VecFromMask(d, m))); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) { |
| static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); |
| return (v & bit) == bit; |
| } |
| |
| // ------------------------------ Equality |
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Mask128<uint8_t, N> operator==(Vec128<uint8_t, N> a, |
| Vec128<uint8_t, N> b) { |
| return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint16_t, N> operator==(Vec128<uint16_t, N> a, |
| Vec128<uint16_t, N> b) { |
| return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint32_t, N> operator==(Vec128<uint32_t, N> a, |
| Vec128<uint32_t, N> b) { |
| return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a, |
| const Vec128<uint64_t, N> b) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| const DFromV<decltype(a)> d64; |
| const RepartitionToNarrow<decltype(d64)> d32; |
| const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); |
| const auto cmp64 = cmp32 & Shuffle2301(cmp32); |
| return MaskFromVec(BitCast(d64, cmp64)); |
| #else |
| return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)}; |
| #endif |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Mask128<int8_t, N> operator==(Vec128<int8_t, N> a, |
| Vec128<int8_t, N> b) { |
| return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a, |
| Vec128<int16_t, N> b) { |
| return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int32_t, N> operator==(Vec128<int32_t, N> a, |
| Vec128<int32_t, N> b) { |
| return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a, |
| const Vec128<int64_t, N> b) { |
  // Signed == gives the same result as unsigned ==; reuse the latter to avoid
  // duplicating the SSSE3 workaround.
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
| return RebindMask(d, BitCast(du, a) == BitCast(du, b)); |
| } |
| |
| // Float |
| template <size_t N> |
| HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) { |
| return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<double, N> operator==(Vec128<double, N> a, |
| Vec128<double, N> b) { |
| return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ Inequality |
| |
| // This cannot have T as a template argument, otherwise it is not more |
| // specialized than rewritten operator== in C++20, leading to compile |
| // errors: https://gcc.godbolt.org/z/xsrPhPvPT. |
| template <size_t N> |
| HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a, |
| Vec128<uint8_t, N> b) { |
| return Not(a == b); |
| } |
| template <size_t N> |
| HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a, |
| Vec128<uint16_t, N> b) { |
| return Not(a == b); |
| } |
| template <size_t N> |
| HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a, |
| Vec128<uint32_t, N> b) { |
| return Not(a == b); |
| } |
| template <size_t N> |
| HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a, |
| Vec128<uint64_t, N> b) { |
| return Not(a == b); |
| } |
| template <size_t N> |
| HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a, |
| Vec128<int8_t, N> b) { |
| return Not(a == b); |
| } |
| template <size_t N> |
| HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a, |
| Vec128<int16_t, N> b) { |
| return Not(a == b); |
| } |
| template <size_t N> |
| HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a, |
| Vec128<int32_t, N> b) { |
| return Not(a == b); |
| } |
| template <size_t N> |
| HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a, |
| Vec128<int64_t, N> b) { |
| return Not(a == b); |
| } |
| |
| template <size_t N> |
| HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) { |
| return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<double, N> operator!=(Vec128<double, N> a, |
| Vec128<double, N> b) { |
| return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ Strict inequality |
| |
| namespace detail { |
| |
| template <size_t N> |
| HWY_INLINE Mask128<int8_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int8_t, N> a, |
| Vec128<int8_t, N> b) { |
| return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_INLINE Mask128<int16_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int16_t, N> a, |
| Vec128<int16_t, N> b) { |
| return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_INLINE Mask128<int32_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int32_t, N> a, |
| Vec128<int32_t, N> b) { |
| return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_INLINE Mask128<int64_t, N> Gt(hwy::SignedTag /*tag*/, |
| const Vec128<int64_t, N> a, |
| const Vec128<int64_t, N> b) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| // See https://stackoverflow.com/questions/65166174/: |
| const DFromV<decltype(a)> d; |
| const RepartitionToNarrow<decltype(d)> d32; |
| const Vec128<int64_t, N> m_eq32{Eq(BitCast(d32, a), BitCast(d32, b)).raw}; |
| const Vec128<int64_t, N> m_gt32{Gt(BitCast(d32, a), BitCast(d32, b)).raw}; |
| // If a.upper is greater, upper := true. Otherwise, if a.upper == b.upper: |
| // upper := b-a (unsigned comparison result of lower). Otherwise: upper := 0. |
| const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw; |
| // Duplicate upper to lower half. |
| return Mask128<int64_t, N>{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))}; |
| #else |
| return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)}; // SSE4.2 |
| #endif |
| } |
| |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> Gt(hwy::UnsignedTag /*tag*/, Vec128<T, N> a, |
| Vec128<T, N> b) { |
| const DFromV<decltype(a)> du; |
| const RebindToSigned<decltype(du)> di; |
| const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1); |
| const auto sa = BitCast(di, Xor(a, msb)); |
| const auto sb = BitCast(di, Xor(b, msb)); |
| return RebindMask(du, Gt(hwy::SignedTag(), sa, sb)); |
| } |
| |
| template <size_t N> |
| HWY_INLINE Mask128<float, N> Gt(hwy::FloatTag /*tag*/, Vec128<float, N> a, |
| Vec128<float, N> b) { |
| return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_INLINE Mask128<double, N> Gt(hwy::FloatTag /*tag*/, Vec128<double, N> a, |
| Vec128<double, N> b) { |
| return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)}; |
| } |
| |
| } // namespace detail |
| |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) { |
| return detail::Gt(hwy::TypeTag<T>(), a, b); |
| } |
| |
| // ------------------------------ Weak inequality |
| |
| namespace detail { |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> Ge(hwy::SignedTag tag, Vec128<T, N> a, |
| Vec128<T, N> b) { |
| return Not(Gt(tag, b, a)); |
| } |
| |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> Ge(hwy::UnsignedTag tag, Vec128<T, N> a, |
| Vec128<T, N> b) { |
| return Not(Gt(tag, b, a)); |
| } |
| |
| template <size_t N> |
| HWY_INLINE Mask128<float, N> Ge(hwy::FloatTag /*tag*/, Vec128<float, N> a, |
| Vec128<float, N> b) { |
| return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_INLINE Mask128<double, N> Ge(hwy::FloatTag /*tag*/, Vec128<double, N> a, |
| Vec128<double, N> b) { |
| return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)}; |
| } |
| |
| } // namespace detail |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) { |
| return detail::Ge(hwy::TypeTag<T>(), a, b); |
| } |
| |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| // ------------------------------ Reversed comparisons |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) { |
| return b > a; |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) { |
| return b >= a; |
| } |
| |
| // ------------------------------ Iota (Load) |
| |
| namespace detail { |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_INLINE VFromD<D> Iota0(D /*d*/) { |
| return VFromD<D>{_mm_set_epi8( |
| static_cast<char>(15), static_cast<char>(14), static_cast<char>(13), |
| static_cast<char>(12), static_cast<char>(11), static_cast<char>(10), |
| static_cast<char>(9), static_cast<char>(8), static_cast<char>(7), |
| static_cast<char>(6), static_cast<char>(5), static_cast<char>(4), |
| static_cast<char>(3), static_cast<char>(2), static_cast<char>(1), |
| static_cast<char>(0))}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)> |
| HWY_INLINE VFromD<D> Iota0(D /*d*/) { |
| return VFromD<D>{_mm_set_epi16(int16_t{7}, int16_t{6}, int16_t{5}, int16_t{4}, |
| int16_t{3}, int16_t{2}, int16_t{1}, |
| int16_t{0})}; |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)> |
| HWY_INLINE VFromD<D> Iota0(D /*d*/) { |
| return VFromD<D>{_mm_set_ph(float16_t{7}, float16_t{6}, float16_t{5}, |
| float16_t{4}, float16_t{3}, float16_t{2}, |
| float16_t{1}, float16_t{0})}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> |
| HWY_INLINE VFromD<D> Iota0(D /*d*/) { |
| return VFromD<D>{ |
| _mm_set_epi32(int32_t{3}, int32_t{2}, int32_t{1}, int32_t{0})}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> |
| HWY_INLINE VFromD<D> Iota0(D /*d*/) { |
| return VFromD<D>{_mm_set_epi64x(int64_t{1}, int64_t{0})}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_INLINE VFromD<D> Iota0(D /*d*/) { |
| return VFromD<D>{_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_INLINE VFromD<D> Iota0(D /*d*/) { |
| return VFromD<D>{_mm_set_pd(1.0, 0.0)}; |
| } |
| |
| #if HWY_COMPILER_MSVC |
| template <class V, HWY_IF_V_SIZE_V(V, 1)> |
| static HWY_INLINE V MaskOutVec128Iota(V v) { |
| const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFF)}; |
| return v & mask_out_mask; |
| } |
| template <class V, HWY_IF_V_SIZE_V(V, 2)> |
| static HWY_INLINE V MaskOutVec128Iota(V v) { |
| #if HWY_TARGET <= HWY_SSE4 |
| return V{_mm_blend_epi16(v.raw, _mm_setzero_si128(), 0xFE)}; |
| #else |
| const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFFFF)}; |
| return v & mask_out_mask; |
| #endif |
| } |
| template <class V, HWY_IF_V_SIZE_V(V, 4)> |
| static HWY_INLINE V MaskOutVec128Iota(V v) { |
| const DFromV<decltype(v)> d; |
| const Repartition<float, decltype(d)> df; |
| using VF = VFromD<decltype(df)>; |
| return BitCast(d, VF{_mm_move_ss(_mm_setzero_ps(), BitCast(df, v).raw)}); |
| } |
| template <class V, HWY_IF_V_SIZE_V(V, 8)> |
| static HWY_INLINE V MaskOutVec128Iota(V v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| using VU = VFromD<decltype(du)>; |
| return BitCast(d, VU{_mm_move_epi64(BitCast(du, v).raw)}); |
| } |
| template <class V, HWY_IF_V_SIZE_GT_V(V, 8)> |
| static HWY_INLINE V MaskOutVec128Iota(V v) { |
| return v; |
| } |
| #endif |
| |
| } // namespace detail |
| |
| template <class D, typename T2, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Iota(D d, const T2 first) { |
| const auto result_iota = |
| detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first)); |
| #if HWY_COMPILER_MSVC |
| return detail::MaskOutVec128Iota(result_iota); |
| #else |
| return result_iota; |
| #endif |
| } |
| |
| // ------------------------------ FirstN (Iota, Lt) |
| |
| template <class D, class M = MFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API M FirstN(D d, size_t num) { |
| constexpr size_t kN = MaxLanes(d); |
| // For AVX3, this ensures `num` <= 255 as required by bzhi, which only looks |
| // at the lower 8 bits; for AVX2 and below, this ensures `num` fits in TI. |
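  // For example, with kN = 4 and num = 2: all = 0b1111 below, and bzhi zeroes
  // all bits at index >= 2, so FromBits receives 0b0011 (lanes 0 and 1 set).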
| num = HWY_MIN(num, kN); |
| #if HWY_TARGET <= HWY_AVX3 |
| #if HWY_ARCH_X86_64 |
| const uint64_t all = (1ull << kN) - 1; |
| return M::FromBits(_bzhi_u64(all, num)); |
| #else |
| const uint32_t all = static_cast<uint32_t>((1ull << kN) - 1); |
| return M::FromBits(_bzhi_u32(all, static_cast<uint32_t>(num))); |
| #endif // HWY_ARCH_X86_64 |
| #else // HWY_TARGET > HWY_AVX3 |
| const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper. |
| using TI = TFromD<decltype(di)>; |
| return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(num))); |
| #endif // HWY_TARGET <= HWY_AVX3 |
| } |
| |
| // ------------------------------ InterleaveLower |
| |
| // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides |
| // the least-significant lane) and "b". To concatenate two half-width integers |
| // into one, use ZipLower/Upper instead (also works with scalar). |
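// For example, with u32x4: InterleaveLower(a, b) = {a[0], b[0], a[1], b[1]}.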
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_unpacklo_epi8(a.raw, b.raw)}; |
| } |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { |
| const DFromV<decltype(a)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| using VU = VFromD<decltype(du)>; // for float16_t |
| return BitCast( |
| d, VU{_mm_unpacklo_epi16(BitCast(du, a).raw, BitCast(du, b).raw)}); |
| } |
| template <typename T, size_t N, HWY_IF_UI32(T)> |
| HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_unpacklo_epi32(a.raw, b.raw)}; |
| } |
| template <typename T, size_t N, HWY_IF_UI64(T)> |
| HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_unpacklo_epi64(a.raw, b.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a, |
| Vec128<float, N> b) { |
| return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a, |
| Vec128<double, N> b) { |
| return Vec128<double, N>{_mm_unpacklo_pd(a.raw, b.raw)}; |
| } |
| |
| // Generic for all vector lengths. |
| template <class D> |
| HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) { |
| return InterleaveLower(a, b); |
| } |
| |
| // ================================================== MEMORY (2) |
| |
| // ------------------------------ MaskedLoad |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, |
| const TFromD<D>* HWY_RESTRICT p) { |
| return VFromD<D>{_mm_maskz_loadu_epi8(m.raw, p)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, |
| const TFromD<D>* HWY_RESTRICT p) { |
| const RebindToUnsigned<decltype(d)> du; // for float16_t |
| return BitCast(d, VFromD<decltype(du)>{_mm_maskz_loadu_epi16(m.raw, p)}); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> |
| HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, |
| const TFromD<D>* HWY_RESTRICT p) { |
| return VFromD<D>{_mm_maskz_loadu_epi32(m.raw, p)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> |
| HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, |
| const TFromD<D>* HWY_RESTRICT p) { |
| return VFromD<D>{_mm_maskz_loadu_epi64(m.raw, p)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, |
| const float* HWY_RESTRICT p) { |
| return VFromD<D>{_mm_maskz_loadu_ps(m.raw, p)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, |
| const double* HWY_RESTRICT p) { |
| return VFromD<D>{_mm_maskz_loadu_pd(m.raw, p)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */, |
| const TFromD<D>* HWY_RESTRICT p) { |
| return VFromD<D>{_mm_mask_loadu_epi8(v.raw, m.raw, p)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d, |
| const TFromD<D>* HWY_RESTRICT p) { |
| const RebindToUnsigned<decltype(d)> du; // for float16_t |
| return BitCast(d, VFromD<decltype(du)>{ |
| _mm_mask_loadu_epi16(BitCast(du, v).raw, m.raw, p)}); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> |
| HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */, |
| const TFromD<D>* HWY_RESTRICT p) { |
| return VFromD<D>{_mm_mask_loadu_epi32(v.raw, m.raw, p)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> |
| HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */, |
| const TFromD<D>* HWY_RESTRICT p) { |
| return VFromD<D>{_mm_mask_loadu_epi64(v.raw, m.raw, p)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */, |
| const float* HWY_RESTRICT p) { |
| return VFromD<D>{_mm_mask_loadu_ps(v.raw, m.raw, p)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */, |
| const double* HWY_RESTRICT p) { |
| return VFromD<D>{_mm_mask_loadu_pd(v.raw, m.raw, p)}; |
| } |
| |
| #elif HWY_TARGET == HWY_AVX2 |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> |
| HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, |
| const TFromD<D>* HWY_RESTRICT p) { |
| auto p_p = reinterpret_cast<const int*>(p); // NOLINT |
| return VFromD<D>{_mm_maskload_epi32(p_p, m.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> |
| HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, |
| const TFromD<D>* HWY_RESTRICT p) { |
| auto p_p = reinterpret_cast<const long long*>(p); // NOLINT |
| return VFromD<D>{_mm_maskload_epi64(p_p, m.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const float* HWY_RESTRICT p) { |
| const RebindToSigned<decltype(d)> di; |
| return VFromD<D>{_mm_maskload_ps(p, BitCast(di, VecFromMask(d, m)).raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const double* HWY_RESTRICT p) { |
| const RebindToSigned<decltype(d)> di; |
| return VFromD<D>{_mm_maskload_pd(p, BitCast(di, VecFromMask(d, m)).raw)}; |
| } |
| |
| // There is no maskload_epi8/16, so blend instead. |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), |
| HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> |
| HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, |
| const TFromD<D>* HWY_RESTRICT p) { |
| return IfThenElseZero(m, LoadU(d, p)); |
| } |
| |
| #else // <= SSE4 |
| |
| // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow). |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, |
| const TFromD<D>* HWY_RESTRICT p) { |
| return IfThenElseZero(m, LoadU(d, p)); |
| } |
| |
| #endif |
| |
| // ------------------------------ MaskedLoadOr |
| |
| #if HWY_TARGET > HWY_AVX3 // else: native |
| |
| // Generic for all vector lengths. |
| template <class D> |
| HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d, |
| const TFromD<D>* HWY_RESTRICT p) { |
| return IfThenElse(m, LoadU(d, p), v); |
| } |
| |
| #endif // HWY_TARGET > HWY_AVX3 |
| |
| // ------------------------------ LoadN (InterleaveLower) |
| |
| #if HWY_TARGET <= HWY_AVX2 && !HWY_MEM_OPS_MIGHT_FAULT |
| |
| #ifdef HWY_NATIVE_LOAD_N |
| #undef HWY_NATIVE_LOAD_N |
| #else |
| #define HWY_NATIVE_LOAD_N |
| #endif |
| |
| // Generic for all vector lengths. |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D( |
| D, (HWY_TARGET <= HWY_AVX3 ? ((1 << 1) | (1 << 2)) : 0) | |
| (1 << 4) | (1 << 8))> |
| HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p, |
| size_t num_lanes) { |
| const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))> |
| d_full; |
| return ResizeBitCast(d, MaskedLoad(FirstN(d_full, num_lanes), d_full, p)); |
| } |
| |
| // Generic for all vector lengths. |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D( |
| D, (HWY_TARGET <= HWY_AVX3 ? ((1 << 1) | (1 << 2)) : 0) | |
| (1 << 4) | (1 << 8))> |
| HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p, |
| size_t num_lanes) { |
| const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))> |
| d_full; |
| return ResizeBitCast(d, MaskedLoadOr(ResizeBitCast(d_full, no), |
| FirstN(d_full, num_lanes), d_full, p)); |
| } |
| |
| #if HWY_TARGET > HWY_AVX3 |
| namespace detail { |
| |
| // 'Leading' means the part that fits in 32-bit lanes. With 2-byte vectors, |
| // there are none, so return the remainder (v_trailing). |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 2)> |
| HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingN( |
| VFromD<D> /*load_mask*/, D /*d*/, const TFromD<D>* HWY_RESTRICT /*p*/, |
| VFromD<D> v_trailing) { |
| return v_trailing; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 2)> |
| HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingNOr( |
| VFromD<D> /*no*/, VFromD<D> /*load_mask*/, D /*d*/, |
| const TFromD<D>* HWY_RESTRICT /*p*/, VFromD<D> v_trailing) { |
| return v_trailing; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_GT_D(D, 2)> |
| HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingN(VFromD<D> load_mask, D d, |
| const TFromD<D>* HWY_RESTRICT p, |
| VFromD<D> v_trailing) { |
| using DI32 = Repartition<int32_t, D>; |
| const FixedTag<int32_t, HWY_MAX(HWY_MAX_LANES_D(DI32), 4)> di32_full; |
| |
  // ResizeBitCast of load_mask to di32_full is okay below even when
  // d.MaxBytes() < di32_full.MaxBytes(): any lanes of load_mask.raw past the
  // first (lowest-index) lanes have already been zeroed out by FirstN.
| return ResizeBitCast( |
| d, IfNegativeThenElse( |
| ResizeBitCast(di32_full, load_mask), |
| MaskedLoad(MaskFromVec(ResizeBitCast(di32_full, load_mask)), |
| di32_full, reinterpret_cast<const int32_t*>(p)), |
| ResizeBitCast(di32_full, v_trailing))); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_GT_D(D, 2)> |
| HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingNOr(VFromD<D> no, |
| VFromD<D> load_mask, D d, |
| const TFromD<D>* HWY_RESTRICT p, |
| VFromD<D> v_trailing) { |
| using DI32 = Repartition<int32_t, D>; |
| const FixedTag<int32_t, HWY_MAX(HWY_MAX_LANES_D(DI32), 4)> di32_full; |
| |
  // ResizeBitCast of load_mask to di32_full is okay below even when
  // d.MaxBytes() < di32_full.MaxBytes(): any lanes of load_mask.raw past the
  // first (lowest-index) lanes have already been zeroed out by FirstN.
| return ResizeBitCast( |
| d, IfNegativeThenElse( |
| ResizeBitCast(di32_full, load_mask), |
| MaskedLoadOr(ResizeBitCast(di32_full, no), |
| MaskFromVec(ResizeBitCast(di32_full, load_mask)), |
| di32_full, reinterpret_cast<const int32_t*>(p)), |
| ResizeBitCast(di32_full, v_trailing))); |
| } |
| |
| // Single lane: load or default value. |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), |
| HWY_IF_LANES_D(D, 1)> |
| HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> /*load_mask*/, D d, |
| const TFromD<D>* HWY_RESTRICT p, |
| size_t num_lanes) { |
| return (num_lanes > 0) ? LoadU(d, p) : Zero(d); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), |
| HWY_IF_LANES_D(D, 1)> |
| HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr( |
| VFromD<D> no, VFromD<D> /*load_mask*/, D d, const TFromD<D>* HWY_RESTRICT p, |
| size_t num_lanes) { |
| return (num_lanes > 0) ? LoadU(d, p) : no; |
| } |
| |
| // Two lanes: load 1, 2, or default. |
| template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_D(D, 2)> |
| HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> /*load_mask*/, D d, |
| const TFromD<D>* HWY_RESTRICT p, |
| size_t num_lanes) { |
| if (num_lanes > 1) { |
| return LoadU(d, p); |
| } else { |
| const FixedTag<TFromD<D>, 1> d1; |
| return (num_lanes == 1) ? ResizeBitCast(d, LoadU(d1, p)) : Zero(d); |
| } |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_D(D, 2)> |
| HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr( |
| VFromD<D> no, VFromD<D> /*load_mask*/, D d, const TFromD<D>* HWY_RESTRICT p, |
| size_t num_lanes) { |
| if (num_lanes > 1) { |
| return LoadU(d, p); |
| } else { |
| if (num_lanes == 0) return no; |
| // Load one, upper lane is default. |
| const FixedTag<TFromD<D>, 1> d1; |
| return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no); |
| } |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 2)> |
| HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> load_mask, D d, |
| const TFromD<D>* HWY_RESTRICT p, |
| size_t num_lanes) { |
| const size_t trailing_n = num_lanes & 3; |
| if (trailing_n == 0) return Zero(d); |
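  // Broadcast the final byte into the mask-covered lanes; full 32-bit lanes
  // are later overwritten by the leading MaskedLoad. If two of the trailing
  // bytes form a complete 16-bit pair, they are loaded as one i16 below.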
| |
| VFromD<D> v_trailing = And(load_mask, Set(d, p[num_lanes - 1])); |
| |
| if ((trailing_n & 2) != 0) { |
| const Repartition<int16_t, decltype(d)> di16; |
| int16_t i16_bits; |
| CopyBytes<sizeof(int16_t)>(p + num_lanes - trailing_n, &i16_bits); |
| v_trailing = BitCast( |
| d, IfNegativeThenElse(BitCast(di16, load_mask), Set(di16, i16_bits), |
| BitCast(di16, v_trailing))); |
| } |
| |
| return v_trailing; |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 2)> |
| HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr( |
| VFromD<D> no, VFromD<D> load_mask, D d, const TFromD<D>* HWY_RESTRICT p, |
| size_t num_lanes) { |
| const size_t trailing_n = num_lanes & 3; |
| if (trailing_n == 0) return no; |
| |
| VFromD<D> v_trailing = IfVecThenElse(load_mask, Set(d, p[num_lanes - 1]), no); |
| |
| if ((trailing_n & 2) != 0) { |
| const Repartition<int16_t, decltype(d)> di16; |
| int16_t i16_bits; |
| CopyBytes<sizeof(int16_t)>(p + num_lanes - trailing_n, &i16_bits); |
| v_trailing = BitCast( |
| d, IfNegativeThenElse(BitCast(di16, load_mask), Set(di16, i16_bits), |
| BitCast(di16, v_trailing))); |
| } |
| |
| return v_trailing; |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)> |
| HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> load_mask, D d, |
| const TFromD<D>* HWY_RESTRICT p, |
| size_t num_lanes) { |
| if ((num_lanes & 1) != 0) { |
| return And(load_mask, Set(d, p[num_lanes - 1])); |
| } else { |
| return Zero(d); |
| } |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)> |
| HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr( |
| VFromD<D> no, VFromD<D> load_mask, D d, const TFromD<D>* HWY_RESTRICT p, |
| size_t num_lanes) { |
| if ((num_lanes & 1) != 0) { |
| return IfVecThenElse(load_mask, Set(d, p[num_lanes - 1]), no); |
| } else { |
| return no; |
| } |
| } |
| |
| } // namespace detail |
| |
| // Generic for all vector lengths. |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> |
| HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p, size_t N) { |
| const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))> |
| d_full; |
| |
| const VFromD<D> load_mask = |
| ResizeBitCast(d, VecFromMask(d_full, FirstN(d_full, N))); |
| const size_t num_lanes = HWY_MIN(N, HWY_MAX_LANES_D(D)); |
| const VFromD<D> v_trailing = |
| detail::AVX2UIF8Or16LoadTrailingN(load_mask, d, p, num_lanes); |
| |
| #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD |
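  // If the compiler can prove that fewer lanes than one 32-bit chunk are
  // requested, the trailing path already covers all lanes and the leading
  // MaskedLoad can be skipped.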
| if (__builtin_constant_p(num_lanes < (4 / sizeof(TFromD<D>))) && |
| num_lanes < (4 / sizeof(TFromD<D>))) { |
| return v_trailing; |
| } |
| #endif |
| |
| return detail::AVX2UIF8Or16LoadLeadingN(load_mask, d, p, v_trailing); |
| } |
| |
| // Generic for all vector lengths. |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> |
| HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p, |
| size_t N) { |
| const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))> |
| d_full; |
| |
| const VFromD<D> load_mask = |
| ResizeBitCast(d, VecFromMask(d_full, FirstN(d_full, N))); |
| const size_t num_lanes = HWY_MIN(N, HWY_MAX_LANES_D(D)); |
| const VFromD<D> v_trailing = |
| detail::AVX2UIF8Or16LoadTrailingNOr(no, load_mask, d, p, num_lanes); |
| |
| #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD |
| if (__builtin_constant_p(num_lanes < (4 / sizeof(TFromD<D>))) && |
| num_lanes < (4 / sizeof(TFromD<D>))) { |
| return v_trailing; |
| } |
| #endif |
| |
| return detail::AVX2UIF8Or16LoadLeadingNOr(no, load_mask, d, p, v_trailing); |
| } |
| |
| #endif // HWY_TARGET > HWY_AVX3 |
| #endif // HWY_TARGET <= HWY_AVX2 && !HWY_MEM_OPS_MIGHT_FAULT |
| |
| // ------------------------------ BlendedStore |
| |
| namespace detail { |
| |
| // There is no maskload_epi8/16 with which we could safely implement |
| // BlendedStore. Manual blending is also unsafe because loading a full vector |
| // that crosses the array end causes asan faults. Resort to scalar code; the |
| // caller should instead use memcpy, assuming m is FirstN(d, n). |
| template <class D> |
| HWY_API void ScalarMaskedStore(VFromD<D> v, MFromD<D> m, D d, |
| TFromD<D>* HWY_RESTRICT p) { |
| const RebindToSigned<decltype(d)> di; // for testing mask if T=bfloat16_t. |
| using TI = TFromD<decltype(di)>; |
| alignas(16) TI buf[MaxLanes(d)]; |
| alignas(16) TI mask[MaxLanes(d)]; |
| Store(BitCast(di, v), di, buf); |
| Store(BitCast(di, VecFromMask(d, m)), di, mask); |
| for (size_t i = 0; i < MaxLanes(d); ++i) { |
| if (mask[i]) { |
| CopySameSize(buf + i, p + i); |
| } |
| } |
| } |
| } // namespace detail |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */, |
| TFromD<D>* HWY_RESTRICT p) { |
| _mm_mask_storeu_epi8(p, m.raw, v.raw); |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d, |
| TFromD<D>* HWY_RESTRICT p) { |
| const RebindToUnsigned<decltype(d)> du; // for float16_t |
| _mm_mask_storeu_epi16(reinterpret_cast<uint16_t*>(p), RebindMask(du, m).raw, |
| BitCast(du, v).raw); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> |
| HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */, |
| TFromD<D>* HWY_RESTRICT p) { |
| auto pi = reinterpret_cast<int*>(p); // NOLINT |
| _mm_mask_storeu_epi32(pi, m.raw, v.raw); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> |
| HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */, |
| TFromD<D>* HWY_RESTRICT p) { |
| auto pi = reinterpret_cast<long long*>(p); // NOLINT |
| _mm_mask_storeu_epi64(pi, m.raw, v.raw); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D, float* HWY_RESTRICT p) { |
| _mm_mask_storeu_ps(p, m.raw, v.raw); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D, double* HWY_RESTRICT p) { |
| _mm_mask_storeu_pd(p, m.raw, v.raw); |
| } |
| |
| #elif HWY_TARGET == HWY_AVX2 |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), |
| HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> |
| HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d, |
| TFromD<D>* HWY_RESTRICT p) { |
| detail::ScalarMaskedStore(v, m, d, p); |
| } |
| |
| namespace detail { |
| |
| template <class D, class V, class M, HWY_IF_UI32_D(D)> |
| HWY_INLINE void NativeBlendedStore(V v, M m, TFromD<D>* HWY_RESTRICT p) { |
| auto pi = reinterpret_cast<int*>(p); // NOLINT |
| _mm_maskstore_epi32(pi, m.raw, v.raw); |
| } |
| |
| template <class D, class V, class M, HWY_IF_UI64_D(D)> |
| HWY_INLINE void NativeBlendedStore(V v, M m, TFromD<D>* HWY_RESTRICT p) { |
| auto pi = reinterpret_cast<long long*>(p); // NOLINT |
| _mm_maskstore_epi64(pi, m.raw, v.raw); |
| } |
| |
| template <class D, class V, class M, HWY_IF_F32_D(D)> |
| HWY_INLINE void NativeBlendedStore(V v, M m, float* HWY_RESTRICT p) { |
| _mm_maskstore_ps(p, m.raw, v.raw); |
| } |
| |
| template <class D, class V, class M, HWY_IF_F64_D(D)> |
| HWY_INLINE void NativeBlendedStore(V v, M m, double* HWY_RESTRICT p) { |
| _mm_maskstore_pd(p, m.raw, v.raw); |
| } |
| |
| } // namespace detail |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), |
| HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> |
| HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d, |
| TFromD<D>* HWY_RESTRICT p) { |
| const RebindToSigned<decltype(d)> di; |
| // For partial vectors, avoid writing other lanes by zeroing their mask. |
| if (d.MaxBytes() < 16) { |
| const Full128<TFromD<D>> dfull; |
| const Mask128<TFromD<D>> mfull{m.raw}; |
| m = MFromD<D>{And(mfull, FirstN(dfull, MaxLanes(d))).raw}; |
| } |
| |
| // Float/double require, and unsigned ints tolerate, signed int masks. |
| detail::NativeBlendedStore<D>(v, RebindMask(di, m), p); |
| } |
| |
| #else // <= SSE4 |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d, |
| TFromD<D>* HWY_RESTRICT p) { |
| // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow). |
| detail::ScalarMaskedStore(v, m, d, p); |
| } |
| |
| #endif // SSE4 |
| |
| // ================================================== ARITHMETIC |
| |
| // ------------------------------ Addition |
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a, |
| const Vec128<uint64_t, N> b) { |
| return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)}; |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a, |
| const Vec128<int8_t, N> b) { |
| return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a, |
| const Vec128<int64_t, N> b) { |
| return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)}; |
| } |
| |
| // Float |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> operator+(const Vec128<float16_t, N> a, |
| const Vec128<float16_t, N> b) { |
| return Vec128<float16_t, N>{_mm_add_ph(a.raw, b.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float, N> operator+(const Vec128<float, N> a, |
| const Vec128<float, N> b) { |
| return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> operator+(const Vec128<double, N> a, |
| const Vec128<double, N> b) { |
| return Vec128<double, N>{_mm_add_pd(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ Subtraction |
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a, |
| Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a, |
| const Vec128<uint64_t, N> b) { |
| return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)}; |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a, |
| const Vec128<int8_t, N> b) { |
| return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a, |
| const Vec128<int64_t, N> b) { |
| return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)}; |
| } |
| |
| // Float |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> operator-(const Vec128<float16_t, N> a, |
| const Vec128<float16_t, N> b) { |
| return Vec128<float16_t, N>{_mm_sub_ph(a.raw, b.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float, N> operator-(const Vec128<float, N> a, |
| const Vec128<float, N> b) { |
| return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> operator-(const Vec128<double, N> a, |
| const Vec128<double, N> b) { |
| return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ AddSub |
| |
| #if HWY_TARGET <= HWY_SSSE3 |
| |
| #undef HWY_IF_ADDSUB_V |
| #define HWY_IF_ADDSUB_V(V) \ |
| HWY_IF_V_SIZE_GT_V( \ |
| V, ((hwy::IsFloat3264<TFromV<V>>()) ? 32 : sizeof(TFromV<V>))) |
| |
| template <size_t N, HWY_IF_LANES_GT(N, 1)> |
| HWY_API Vec128<float, N> AddSub(Vec128<float, N> a, Vec128<float, N> b) { |
| return Vec128<float, N>{_mm_addsub_ps(a.raw, b.raw)}; |
| } |
| HWY_API Vec128<double> AddSub(Vec128<double> a, Vec128<double> b) { |
| return Vec128<double>{_mm_addsub_pd(a.raw, b.raw)}; |
| } |
| #endif // HWY_TARGET <= HWY_SSSE3 |
| |
| // ------------------------------ PairwiseAdd128/PairwiseSub128 |
| |
// Need to use the default implementation of PairwiseAdd128/PairwiseSub128 in
// generic_ops-inl.h for U8/I8/F16/I64/U64 vectors and 64-byte vectors.
| |
| #if HWY_TARGET <= HWY_SSSE3 |
| |
| #undef HWY_IF_PAIRWISE_ADD_128_D |
| #undef HWY_IF_PAIRWISE_SUB_128_D |
| #define HWY_IF_PAIRWISE_ADD_128_D(D) \ |
| hwy::EnableIf<( \ |
| HWY_MAX_LANES_D(D) > (32 / sizeof(hwy::HWY_NAMESPACE::TFromD<D>)) || \ |
| (HWY_MAX_LANES_D(D) > (8 / sizeof(hwy::HWY_NAMESPACE::TFromD<D>)) && \ |
| !(hwy::IsSameEither<hwy::HWY_NAMESPACE::TFromD<D>, int16_t, \ |
| uint16_t>() || \ |
| sizeof(hwy::HWY_NAMESPACE::TFromD<D>) == 4 || \ |
| hwy::IsSame<hwy::HWY_NAMESPACE::TFromD<D>, double>())))>* = nullptr |
| #define HWY_IF_PAIRWISE_SUB_128_D(D) HWY_IF_PAIRWISE_ADD_128_D(D) |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI16_D(D)> |
| HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{_mm_hadd_epi16(a.raw, b.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI16_D(D)> |
| HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| const DFromV<decltype(a)> d; |
| const RebindToSigned<decltype(d)> di; |
| return BitCast(d, Neg(BitCast(di, VFromD<D>{_mm_hsub_epi16(a.raw, b.raw)}))); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)> |
| HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{_mm_hadd_epi32(a.raw, b.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)> |
| HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| const DFromV<decltype(a)> d; |
| const RebindToSigned<decltype(d)> di; |
| return BitCast(d, Neg(BitCast(di, VFromD<D>{_mm_hsub_epi32(a.raw, b.raw)}))); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{_mm_hadd_ps(a.raw, b.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| return Neg(VFromD<D>{_mm_hsub_ps(a.raw, b.raw)}); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{_mm_hadd_pd(a.raw, b.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| return Neg(VFromD<D>{_mm_hsub_pd(a.raw, b.raw)}); |
| } |
| |
| #endif // HWY_TARGET <= HWY_SSSE3 |
| |
| // ------------------------------ SumsOf8 |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) { |
| return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())}; |
| } |
| |
| // Generic for all vector lengths |
| template <class V, HWY_IF_I8_D(DFromV<V>)> |
| HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const Repartition<int64_t, decltype(d)> di64; |
| |
  // Adjust the values of v to be in the 0..255 range by adding 128 to each
  // lane of v (which is the same as a bitwise XOR of each i8 lane with 128)
  // and then bitcasting the Xor result to a u8 vector.
| const auto v_adj = BitCast(du, Xor(v, SignBit(d))); |
| |
| // Need to add -1024 to each i64 lane of the result of the SumsOf8(v_adj) |
| // operation to account for the adjustment made above. |
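  // (Each of the 8 lanes feeding a sum was biased by +128, i.e. by
  // 8 * 128 = 1024 per u64 sum.)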
| return BitCast(di64, SumsOf8(v_adj)) + Set(di64, int64_t{-1024}); |
| } |
| |
| #ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF |
| #undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF |
| #else |
| #define HWY_NATIVE_SUMS_OF_8_ABS_DIFF |
| #endif |
| |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N / 8> SumsOf8AbsDiff(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Vec128<uint64_t, N / 8>{_mm_sad_epu8(a.raw, b.raw)}; |
| } |
| |
| // Generic for all vector lengths |
| template <class V, HWY_IF_I8_D(DFromV<V>)> |
| HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8AbsDiff(V a, V b) { |
| const DFromV<V> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const RepartitionToWideX3<decltype(d)> di64; |
| |
  // Adjust the values of a and b to be in the 0..255 range by adding 128 to
  // each lane of a and b (which is the same as a bitwise XOR of each i8 lane
  // with 128) and then bitcasting the results of the Xor operations to u8
  // vectors.
| const auto i8_msb = SignBit(d); |
| const auto a_adj = BitCast(du, Xor(a, i8_msb)); |
| const auto b_adj = BitCast(du, Xor(b, i8_msb)); |
| |
| // The result of SumsOf8AbsDiff(a_adj, b_adj) can simply be bitcasted to an |
| // i64 vector as |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true |
| return BitCast(di64, SumsOf8AbsDiff(a_adj, b_adj)); |
| } |
| |
| // ------------------------------ SumsOf4 |
| #if HWY_TARGET <= HWY_AVX3 |
| namespace detail { |
| |
| template <size_t N> |
| HWY_INLINE Vec128<uint32_t, (N + 3) / 4> SumsOf4( |
| hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, |
| Vec128<uint8_t, N> v) { |
| const DFromV<decltype(v)> d; |
| |
  // _mm_maskz_dbsad_epu8 is used below because the sums of 4 consecutive u8
  // lanes already land in the even uint16_t lanes of its result; the 0x55
  // zero-mask clears the odd uint16_t lanes so each u32 lane holds one sum.
| return Vec128<uint32_t, (N + 3) / 4>{ |
| _mm_maskz_dbsad_epu8(static_cast<__mmask8>(0x55), v.raw, Zero(d).raw, 0)}; |
| } |
| |
| // detail::SumsOf4 for Vec128<int8_t, N> on AVX3 is implemented in x86_512-inl.h |
| |
| } // namespace detail |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| // ------------------------------ SumsOfAdjQuadAbsDiff |
| |
| #if HWY_TARGET <= HWY_SSE4 |
| #ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF |
| #undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF |
| #else |
| #define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF |
| #endif |
| |
| template <int kAOffset, int kBOffset, size_t N> |
| HWY_API Vec128<uint16_t, (N + 1) / 2> SumsOfAdjQuadAbsDiff( |
| Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { |
| static_assert(0 <= kAOffset && kAOffset <= 1, |
| "kAOffset must be between 0 and 1"); |
| static_assert(0 <= kBOffset && kBOffset <= 3, |
| "kBOffset must be between 0 and 3"); |
| return Vec128<uint16_t, (N + 1) / 2>{ |
| _mm_mpsadbw_epu8(a.raw, b.raw, (kAOffset << 2) | kBOffset)}; |
| } |
| |
| // Generic for all vector lengths |
| template <int kAOffset, int kBOffset, class V, HWY_IF_I8_D(DFromV<V>)> |
| HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOfAdjQuadAbsDiff(V a, V b) { |
| const DFromV<decltype(a)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const RepartitionToWide<decltype(d)> dw; |
| |
  // Adjust the values of a and b to be in the 0..255 range by adding 128 to
  // each lane of a and b (which is the same as a bitwise XOR of each i8 lane
  // with 128) and then bitcasting the results of the Xor operations to u8
  // vectors.
| const auto i8_msb = SignBit(d); |
| const auto a_adj = BitCast(du, Xor(a, i8_msb)); |
| const auto b_adj = BitCast(du, Xor(b, i8_msb)); |
| |
| // The result of SumsOfAdjQuadAbsDiff<kAOffset, kBOffset>(a_adj, b_adj) can |
| // simply be bitcasted to an i16 vector as |
| // |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true. |
| return BitCast(dw, SumsOfAdjQuadAbsDiff<kAOffset, kBOffset>(a_adj, b_adj)); |
| } |
| #endif |
| |
| // ------------------------------ SumsOfShuffledQuadAbsDiff |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| #ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF |
| #undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF |
| #else |
| #define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF |
| #endif |
| |
| template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, size_t N> |
| HWY_API Vec128<uint16_t, (N + 1) / 2> SumsOfShuffledQuadAbsDiff( |
| Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { |
| static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3"); |
| static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3"); |
| static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3"); |
| static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3"); |
| return Vec128<uint16_t, (N + 1) / 2>{ |
| _mm_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))}; |
| } |
| |
| // Generic for all vector lengths |
| template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V, |
| HWY_IF_I8_D(DFromV<V>)> |
| HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOfShuffledQuadAbsDiff(V a, |
| V b) { |
| const DFromV<decltype(a)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const RepartitionToWide<decltype(d)> dw; |
| |
  // Adjust the values of a and b to be in the 0..255 range by adding 128 to
  // each lane of a and b (which is the same as a bitwise XOR of each i8 lane
  // with 128) and then bitcasting the results of the Xor operations to u8
  // vectors.
| const auto i8_msb = SignBit(d); |
| const auto a_adj = BitCast(du, Xor(a, i8_msb)); |
| const auto b_adj = BitCast(du, Xor(b, i8_msb)); |
| |
| // The result of |
| // SumsOfShuffledQuadAbsDiff<kIdx3, kIdx2, kIdx1, kIdx0>(a_adj, b_adj) can |
| // simply be bitcasted to an i16 vector as |
| // |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true. |
| return BitCast( |
| dw, SumsOfShuffledQuadAbsDiff<kIdx3, kIdx2, kIdx1, kIdx0>(a_adj, b_adj)); |
| } |
| #endif |
| |
| // ------------------------------ SaturatedAdd |
| |
| // Returns a + b clamped to the destination range. |
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)}; |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a, |
| const Vec128<int8_t, N> b) { |
| return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)}; |
| } |
| |
| #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN |
| #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB |
| #undef HWY_NATIVE_I32_SATURATED_ADDSUB |
| #else |
| #define HWY_NATIVE_I32_SATURATED_ADDSUB |
| #endif |
| |
| #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB |
| #undef HWY_NATIVE_I64_SATURATED_ADDSUB |
| #else |
| #define HWY_NATIVE_I64_SATURATED_ADDSUB |
| #endif |
| |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> SaturatedAdd(Vec128<int32_t, N> a, |
| Vec128<int32_t, N> b) { |
| const DFromV<decltype(a)> d; |
| const auto sum = a + b; |
| const auto overflow_mask = MaskFromVec( |
| Vec128<int32_t, N>{_mm_ternarylogic_epi32(a.raw, b.raw, sum.raw, 0x42)}); |
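  // Truth table 0x42 = (~a & ~b & sum) | (a & b & ~sum); applied to the sign
  // bits, this detects signed overflow (a and b share a sign that differs
  // from the sign of the sum).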
| const auto i32_max = Set(d, LimitsMax<int32_t>()); |
| const Vec128<int32_t, N> overflow_result{_mm_mask_ternarylogic_epi32( |
| i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; |
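  // 0x55 = ~C, so lanes where a is negative become LimitsMin() and the
  // remaining lanes stay LimitsMax().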
| return IfThenElse(overflow_mask, overflow_result, sum); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> SaturatedAdd(Vec128<int64_t, N> a, |
| Vec128<int64_t, N> b) { |
| const DFromV<decltype(a)> d; |
| const auto sum = a + b; |
| const auto overflow_mask = MaskFromVec( |
| Vec128<int64_t, N>{_mm_ternarylogic_epi64(a.raw, b.raw, sum.raw, 0x42)}); |
| const auto i64_max = Set(d, LimitsMax<int64_t>()); |
| const Vec128<int64_t, N> overflow_result{_mm_mask_ternarylogic_epi64( |
| i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; |
| return IfThenElse(overflow_mask, overflow_result, sum); |
| } |
| #endif // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN |
| |
| // ------------------------------ SaturatedSub |
| |
| // Returns a - b clamped to the destination range. |
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)}; |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a, |
| const Vec128<int8_t, N> b) { |
| return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)}; |
| } |
| |
| #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> SaturatedSub(Vec128<int32_t, N> a, |
| Vec128<int32_t, N> b) { |
| const DFromV<decltype(a)> d; |
| const auto diff = a - b; |
| const auto overflow_mask = MaskFromVec( |
| Vec128<int32_t, N>{_mm_ternarylogic_epi32(a.raw, b.raw, diff.raw, 0x18)}); |
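  // Truth table 0x18 = (~a & b & diff) | (a & ~b & ~diff); applied to the
  // sign bits, this detects signed overflow (a and b have different signs and
  // the difference takes the sign of b).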
| const auto i32_max = Set(d, LimitsMax<int32_t>()); |
| const Vec128<int32_t, N> overflow_result{_mm_mask_ternarylogic_epi32( |
| i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; |
| return IfThenElse(overflow_mask, overflow_result, diff); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> SaturatedSub(Vec128<int64_t, N> a, |
| Vec128<int64_t, N> b) { |
| const DFromV<decltype(a)> d; |
| const auto diff = a - b; |
| const auto overflow_mask = MaskFromVec( |
| Vec128<int64_t, N>{_mm_ternarylogic_epi64(a.raw, b.raw, diff.raw, 0x18)}); |
| const auto i64_max = Set(d, LimitsMax<int64_t>()); |
| const Vec128<int64_t, N> overflow_result{_mm_mask_ternarylogic_epi64( |
| i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; |
| return IfThenElse(overflow_mask, overflow_result, diff); |
| } |
| #endif // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN |
| |
| // ------------------------------ AverageRound |
| |
| // Returns (a + b + 1) / 2 |
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)}; |
| } |
| |
| // I8/I16 AverageRound is generic for all vector lengths |
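// Xor with the sign bit adds a bias of 2^(bits-1) when reinterpreted as
// unsigned; the unsigned rounded average of the biased inputs equals
// (a + b + 1) / 2 plus that same bias, which the final Xor removes.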
| template <class V, HWY_IF_SIGNED_V(V), |
| HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> |
| HWY_API V AverageRound(V a, V b) { |
| const DFromV<decltype(a)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const V sign_bit = SignBit(d); |
| return Xor(BitCast(d, AverageRound(BitCast(du, Xor(a, sign_bit)), |
| BitCast(du, Xor(b, sign_bit)))), |
| sign_bit); |
| } |
| |
| // ------------------------------ Integer multiplication |
| |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)}; |
| } |
| |
| // Returns the upper sizeof(T)*8 bits of a * b in each lane. |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)}; |
| } |
| |
| template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4)), |
| HWY_IF_LANES_D(DFromV<V>, 1)> |
| HWY_API V MulHigh(V a, V b) { |
| const DFromV<decltype(a)> d; |
| const Full128<TFromD<decltype(d)>> d_full; |
| return ResizeBitCast( |
| d, Slide1Down(d_full, ResizeBitCast(d_full, MulEven(a, b)))); |
| } |
| |
| // I8/U8/I32/U32 MulHigh is generic for all vector lengths >= 2 lanes |
| template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4)), |
| HWY_IF_LANES_GT_D(DFromV<V>, 1)> |
| HWY_API V MulHigh(V a, V b) { |
| const DFromV<decltype(a)> d; |
| |
| const auto p_even = BitCast(d, MulEven(a, b)); |
| const auto p_odd = BitCast(d, MulOdd(a, b)); |
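  // Each product's upper half lands in the odd narrow lane of its pair, so
  // InterleaveOdd gathers exactly the high halves into the result.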
| return InterleaveOdd(d, p_even, p_odd); |
| } |
| |
// Multiplies even lanes (0, 2, ..) and places the lower half of the
// double-wide result into the even lane and the upper half into its odd
// neighbor lane.
| template <class V, HWY_IF_U8_D(DFromV<V>)> |
| HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulEven(V a, V b) { |
| const DFromV<decltype(a)> d; |
| const RepartitionToWide<decltype(d)> dw; |
| const auto lo8_mask = Set(dw, uint16_t{0x00FF}); |
| return And(ResizeBitCast(dw, a), lo8_mask) * |
| And(ResizeBitCast(dw, b), lo8_mask); |
| } |
| |
| template <class V, HWY_IF_I8_D(DFromV<V>)> |
| HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulEven(V a, V b) { |
| const DFromV<decltype(a)> d; |
| const RepartitionToWide<decltype(d)> dw; |
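  // ShiftLeft<8> followed by arithmetic ShiftRight<8> sign-extends the even
  // (low) i8 lane of each i16 pair before the 16-bit multiply.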
| return ShiftRight<8>(ShiftLeft<8>(ResizeBitCast(dw, a))) * |
| ShiftRight<8>(ShiftLeft<8>(ResizeBitCast(dw, b))); |
| } |
| |
| template <class V, HWY_IF_UI16_D(DFromV<V>)> |
| HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulEven(V a, V b) { |
| const DFromV<decltype(a)> d; |
| const RepartitionToWide<decltype(d)> dw; |
| const RepartitionToNarrow<decltype(dw)> dw_as_d16; |
| |
| const auto lo = ResizeBitCast(dw, a * b); |
| const auto hi = ShiftLeft<16>(ResizeBitCast(dw, MulHigh(a, b))); |
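  // OddEven assembles each 32-bit product from the low 16 bits (even d16
  // lanes of lo) and the high 16 bits (odd d16 lanes of hi).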
| return BitCast(dw, OddEven(BitCast(dw_as_d16, hi), BitCast(dw_as_d16, lo))); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| const DFromV<decltype(a)> d; |
| const RepartitionToWide<decltype(d)> dw; |
| const RebindToUnsigned<decltype(d)> du; |
| |
  // p[i] = (((a[i] >> 31) * (b[i] >> 31)) << 64) +
  //        (((a[i] >> 31) * b[i]) << 32) +
  //        (((b[i] >> 31) * a[i]) << 32) +
  //        ((a[i] & int64_t{0xFFFFFFFF}) * (b[i] & int64_t{0xFFFFFFFF}))

  // (((a[i] >> 31) * (b[i] >> 31)) << 64) does not need to be computed because
  // its lower 64 bits are zero.
| |
| // (((a[i] >> 31) * b[i]) << 32) + (((b[i] >> 31) * a[i]) << 32) == |
| // -((((a[i] >> 31) & b[i]) + ((b[i] >> 31) & a[i])) << 32) |
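  // This holds because (a[i] >> 31) is either 0 or all ones, so multiplying
  // by it is equivalent to negating the And with it.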
| |
| // ((a[i] & int64_t{0xFFFFFFFF}) * (b[i] & int64_t{0xFFFFFFFF})) can be |
| // computed using MulEven(BitCast(du, a), BitCast(du, b)) |
| |
| const auto neg_p_hi = ShiftLeft<32>( |
| ResizeBitCast(dw, And(ShiftRight<31>(a), b) + And(ShiftRight<31>(b), a))); |
| const auto p_lo = BitCast(dw, MulEven(BitCast(du, a), BitCast(du, b))); |
| return p_lo - neg_p_hi; |
| #else |
| return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)}; |
| #endif |
| } |
| |
| template <class V, HWY_IF_T_SIZE_V(V, 1)> |
| HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulOdd(V a, V b) { |
| const DFromV<decltype(a)> d; |
| const RepartitionToWide<decltype(d)> dw; |
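  // ShiftRight<8> moves the odd byte of each 16-bit pair into its low byte,
  // zero-extending for u8 and sign-extending for i8.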
| return ShiftRight<8>(ResizeBitCast(dw, a)) * |
| ShiftRight<8>(ResizeBitCast(dw, b)); |
| } |
| |
| template <class V, HWY_IF_UI16_D(DFromV<V>)> |
| HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulOdd(V a, V b) { |
| const DFromV<decltype(a)> d; |
| const RepartitionToWide<decltype(d)> dw; |
| const RebindToUnsigned<decltype(dw)> dw_u; |
| const RepartitionToNarrow<decltype(dw)> dw_as_d16; |
| |
| const auto lo = ShiftRight<16>(BitCast(dw_u, ResizeBitCast(dw, a * b))); |
| const auto hi = ResizeBitCast(dw, MulHigh(a, b)); |
| return BitCast(dw, OddEven(BitCast(dw_as_d16, hi), BitCast(dw_as_d16, lo))); |
| } |
| |
| template <class V, HWY_IF_UI32_D(DFromV<V>)> |
| HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulOdd(V a, V b) { |
| return MulEven(DupOdd(a), DupOdd(b)); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| // Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency. |
| // 64-bit right shift would also work but also needs port 5, so no benefit. |
| // Notation: x=don't care, z=0. |
| const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1)); |
| const auto mullo_x2x0 = MulEven(a, b); |
| const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1)); |
| const auto mullo_x3x1 = |
| MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1}); |
| // We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating |
| // the latter requires one more instruction or a constant. |
| const __m128i mul_20 = |
| _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0)); |
| const __m128i mul_31 = |
| _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0)); |
| return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)}; |
| #else |
| return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)}; |
| #endif |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| // Same as unsigned; avoid duplicating the SSSE3 code. |
| const DFromV<decltype(a)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| return BitCast(d, BitCast(du, a) * BitCast(du, b)); |
| } |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| // Per-target flag to prevent generic_ops-inl.h from defining 64-bit operator*. |
| #ifdef HWY_NATIVE_MUL_64 |
| #undef HWY_NATIVE_MUL_64 |
| #else |
| #define HWY_NATIVE_MUL_64 |
| #endif |
| |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> operator*(Vec128<uint64_t, N> a, |
| Vec128<uint64_t, N> b) { |
| return Vec128<uint64_t, N>{_mm_mullo_epi64(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> operator*(Vec128<int64_t, N> a, |
| Vec128<int64_t, N> b) { |
| return Vec128<int64_t, N>{_mm_mullo_epi64(a.raw, b.raw)}; |
| } |
| #endif |
| |
| // ------------------------------ RotateRight (ShiftRight, Or) |
| |
// The U8 RotateRight implementation for AVX3_DL is now in x86_512-inl.h
// because U8 RotateRight uses detail::GaloisAffine on AVX3_DL.
| |
| #if HWY_TARGET > HWY_AVX3_DL |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint8_t, N> RotateRight(const Vec128<uint8_t, N> v) { |
| static_assert(0 <= kBits && kBits < 8, "Invalid shift count"); |
| if (kBits == 0) return v; |
| // AVX3 does not support 8-bit. |
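  // HWY_MIN avoids an out-of-range shift count of 8 when kBits == 0; that
  // case already returned above, but the expression must still compile.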
| return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v)); |
| } |
| #endif |
| |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint16_t, N> RotateRight(const Vec128<uint16_t, N> v) { |
| static_assert(0 <= kBits && kBits < 16, "Invalid shift count"); |
| if (kBits == 0) return v; |
| #if HWY_TARGET <= HWY_AVX3_DL |
| return Vec128<uint16_t, N>{_mm_shrdi_epi16(v.raw, v.raw, kBits)}; |
| #else |
| // AVX3 does not support 16-bit. |
| return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v)); |
| #endif |
| } |
| |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) { |
| static_assert(0 <= kBits && kBits < 32, "Invalid shift count"); |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<uint32_t, N>{_mm_ror_epi32(v.raw, kBits)}; |
| #else |
| if (kBits == 0) return v; |
| return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v)); |
| #endif |
| } |
| |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) { |
| static_assert(0 <= kBits && kBits < 64, "Invalid shift count"); |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<uint64_t, N>{_mm_ror_epi64(v.raw, kBits)}; |
| #else |
| if (kBits == 0) return v; |
| return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v)); |
| #endif |
| } |
| |
| // I8/I16/I32/I64 RotateRight is generic for all vector lengths |
| template <int kBits, class V, HWY_IF_SIGNED_V(V)> |
| HWY_API V RotateRight(V v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| return BitCast(d, RotateRight<kBits>(BitCast(du, v))); |
| } |
| |
| // ------------------------------ Rol/Ror |
| #if HWY_TARGET <= HWY_AVX3_DL |
| #ifdef HWY_NATIVE_ROL_ROR_16 |
| #undef HWY_NATIVE_ROL_ROR_16 |
| #else |
| #define HWY_NATIVE_ROL_ROR_16 |
| #endif |
| |
| template <class T, size_t N, HWY_IF_UI16(T)> |
| HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_shrdv_epi16(a.raw, a.raw, b.raw)}; |
| } |
| |
| // U16/I16 Rol is generic for all vector lengths on AVX3_DL |
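// Rotating left by b equals rotating right by -b (mod the lane width).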
| template <class V, HWY_IF_UI16(TFromV<V>)> |
| HWY_API V Rol(V a, V b) { |
| const DFromV<decltype(a)> d; |
| const RebindToSigned<decltype(d)> di; |
| return Ror(a, BitCast(d, Neg(BitCast(di, b)))); |
| } |
| |
| #endif // HWY_TARGET <= HWY_AVX3_DL |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| |
| #ifdef HWY_NATIVE_ROL_ROR_32_64 |
| #undef HWY_NATIVE_ROL_ROR_32_64 |
| #else |
| #define HWY_NATIVE_ROL_ROR_32_64 |
| #endif |
| |
| template <class T, size_t N, HWY_IF_UI32(T)> |
| HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_rolv_epi32(a.raw, b.raw)}; |
| } |
| |
| template <class T, size_t N, HWY_IF_UI32(T)> |
| HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_rorv_epi32(a.raw, b.raw)}; |
| } |
| |
| template <class T, size_t N, HWY_IF_UI64(T)> |
| HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_rolv_epi64(a.raw, b.raw)}; |
| } |
| |
| template <class T, size_t N, HWY_IF_UI64(T)> |
| HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_rorv_epi64(a.raw, b.raw)}; |
| } |
| |
| #endif |
| |
| // ------------------------------ RotateLeftSame/RotateRightSame |
| |
| #if HWY_TARGET <= HWY_AVX3_DL |
| |
| #ifdef HWY_NATIVE_ROL_ROR_SAME_16 |
| #undef HWY_NATIVE_ROL_ROR_SAME_16 |
| #else |
| #define HWY_NATIVE_ROL_ROR_SAME_16 |
| #endif |
| |
| // Generic for all vector lengths |
| template <class V, HWY_IF_UI16(TFromV<V>)> |
| HWY_API V RotateLeftSame(V v, int bits) { |
| const DFromV<decltype(v)> d; |
| return Ror(v, |
| Set(d, static_cast<TFromV<V>>(0u - static_cast<unsigned>(bits)))); |
| } |
| |
| template <class V, HWY_IF_UI16(TFromV<V>)> |
| HWY_API V RotateRightSame(V v, int bits) { |
| const DFromV<decltype(v)> d; |
| return Ror(v, Set(d, static_cast<TFromV<V>>(bits))); |
| } |
| #endif // HWY_TARGET <= HWY_AVX3_DL |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| |
| #ifdef HWY_NATIVE_ROL_ROR_SAME_32_64 |
| #undef HWY_NATIVE_ROL_ROR_SAME_32_64 |
| #else |
| #define HWY_NATIVE_ROL_ROR_SAME_32_64 |
| #endif |
| |
| // Generic for all vector lengths |
| template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), |
| HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))> |
| HWY_API V RotateLeftSame(V v, int bits) { |
| const DFromV<decltype(v)> d; |
| return Rol(v, Set(d, static_cast<TFromV<V>>(static_cast<unsigned>(bits)))); |
| } |
| |
| template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), |
| HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))> |
| HWY_API V RotateRightSame(V v, int bits) { |
| const DFromV<decltype(v)> d; |
| return Ror(v, Set(d, static_cast<TFromV<V>>(static_cast<unsigned>(bits)))); |
| } |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) |
| |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) { |
| const DFromV<decltype(v)> d; |
| return VecFromMask(v < Zero(d)); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> BroadcastSignBit(const Vec128<int16_t, N> v) { |
| return ShiftRight<15>(v); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> BroadcastSignBit(const Vec128<int32_t, N> v) { |
| return ShiftRight<31>(v); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> BroadcastSignBit(const Vec128<int64_t, N> v) { |
| const DFromV<decltype(v)> d; |
| #if HWY_TARGET <= HWY_AVX3 |
| (void)d; |
| return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, 63)}; |
| #elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4 |
| return VecFromMask(v < Zero(d)); |
| #else |
| // Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift |
| // avoids generating a zero. |
| const RepartitionToNarrow<decltype(d)> d32; |
| const auto sign = ShiftRight<31>(BitCast(d32, v)); |
| return Vec128<int64_t, N>{ |
| _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))}; |
| #endif |
| } |
| |
| // ------------------------------ Integer Abs |
| |
| // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) { |
| #if HWY_COMPILER_MSVC || HWY_TARGET == HWY_SSE2 |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const auto zero = Zero(du); |
| const auto v_as_u8 = BitCast(du, v); |
| return BitCast(d, Min(v_as_u8, zero - v_as_u8)); |
| #else |
| return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)}; |
| #endif |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) { |
| #if HWY_TARGET == HWY_SSE2 |
| const auto zero = Zero(DFromV<decltype(v)>()); |
| return Max(v, zero - v); |
| #else |
| return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)}; |
| #endif |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) { |
| #if HWY_TARGET <= HWY_SSSE3 |
| return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)}; |
| #else |
| const auto zero = Zero(DFromV<decltype(v)>()); |
| return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); |
| #endif |
| } |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) { |
| return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)}; |
| } |
| #else |
| // I64 Abs is generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 |
| template <class V, HWY_IF_I64(TFromV<V>)> |
| HWY_API V Abs(V v) { |
| const auto zero = Zero(DFromV<decltype(v)>()); |
| return IfNegativeThenElse(v, zero - v, v); |
| } |
| #endif |
| |
| #ifdef HWY_NATIVE_SATURATED_ABS |
| #undef HWY_NATIVE_SATURATED_ABS |
| #else |
| #define HWY_NATIVE_SATURATED_ABS |
| #endif |
| |
| // Generic for all vector lengths |
| template <class V, HWY_IF_I8(TFromV<V>)> |
| HWY_API V SaturatedAbs(V v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| return BitCast(d, Min(BitCast(du, v), BitCast(du, SaturatedSub(Zero(d), v)))); |
| } |
| |
| // Generic for all vector lengths |
| template <class V, HWY_IF_I16(TFromV<V>)> |
| HWY_API V SaturatedAbs(V v) { |
| return Max(v, SaturatedSub(Zero(DFromV<V>()), v)); |
| } |
| |
| // Generic for all vector lengths |
| template <class V, HWY_IF_I32(TFromV<V>)> |
| HWY_API V SaturatedAbs(V v) { |
| const auto abs_v = Abs(v); |
| |
| #if HWY_TARGET <= HWY_SSE4 |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| return BitCast(d, Min(BitCast(du, abs_v), |
| Set(du, static_cast<uint32_t>(LimitsMax<int32_t>())))); |
| #else |
| return Add(abs_v, BroadcastSignBit(abs_v)); |
| #endif |
| } |
| |
| // Generic for all vector lengths |
| template <class V, HWY_IF_I64(TFromV<V>)> |
| HWY_API V SaturatedAbs(V v) { |
| const auto abs_v = Abs(v); |
| return Add(abs_v, BroadcastSignBit(abs_v)); |
| } |
| |
| // GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512VL |
| // srli_epi64: the count should be unsigned int. Note that this is not the same |
| // as the Shift3264Count in x86_512-inl.h (GCC also requires int). |
| #if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \ |
| (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) |
| using Shift64Count = int; |
| #else |
| // Assume documented behavior. Clang 12, GCC 14 and MSVC 14.28.29910 match this. |
| using Shift64Count = unsigned int; |
| #endif |
| |
| template <int kBits, size_t N> |
| HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<int64_t, N>{ |
| _mm_srai_epi64(v.raw, static_cast<Shift64Count>(kBits))}; |
| #else |
| const DFromV<decltype(v)> di; |
| const RebindToUnsigned<decltype(di)> du; |
| const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v))); |
| const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v)); |
| return right | sign; |
| #endif |
| } |
| |
| // ------------------------------ IfNegativeThenElse |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> IfNegativeThenElse(const Vec128<int8_t, N> v, |
| const Vec128<int8_t, N> yes, |
| const Vec128<int8_t, N> no) { |
| // int8: IfThenElse only looks at the MSB on SSE4 or newer |
| #if HWY_TARGET <= HWY_SSE4 |
| const auto mask = MaskFromVec(v); |
| #else |
| const DFromV<decltype(v)> d; |
| const RebindToSigned<decltype(d)> di; |
| const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); |
| #endif |
| |
| return IfThenElse(mask, yes, no); |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes, |
| Vec128<T, N> no) { |
| static_assert(IsSigned<T>(), "Only works for signed/float"); |
| |
  // 16-bit: there is no 16-bit blendv on AVX2 or earlier, so broadcast the
  // sign so the byte-wise blend also sees it in the lower byte's MSB.
| #if HWY_TARGET <= HWY_AVX3 |
| const auto mask = MaskFromVec(v); |
| #else |
| const DFromV<decltype(v)> d; |
| const RebindToSigned<decltype(d)> di; |
| const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); |
| #endif |
| |
| return IfThenElse(mask, yes, no); |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))> |
| HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes, |
| Vec128<T, N> no) { |
| static_assert(IsSigned<T>(), "Only works for signed/float"); |
| const DFromV<decltype(v)> d; |
| |
| #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4 |
| // 32/64-bit: use float IfThenElse on SSE4/AVX2, which only looks at the MSB |
| // on SSE4 or later. |
| const RebindToFloat<decltype(d)> df; |
| const auto mask = MaskFromVec(BitCast(df, v)); |
| return BitCast(d, IfThenElse(mask, BitCast(df, yes), BitCast(df, no))); |
| #else // SSE2, SSSE3, or AVX3 |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| // No need to cast to float or broadcast sign bit on AVX3 as IfThenElse only |
| // looks at the MSB on AVX3 |
| (void)d; |
| const auto mask = MaskFromVec(v); |
| #else |
| const RebindToSigned<decltype(d)> di; |
| const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); |
| #endif |
| |
| return IfThenElse(mask, yes, no); |
| #endif |
| } |
| |
| #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4 |
| |
| #ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO |
| #undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO |
| #else |
| #define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO |
| #endif |
| |
| #ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE |
| #undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE |
| #else |
| #define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE |
| #endif |
| |
| // SSE4/AVX2 IfNegativeThenElseZero/IfNegativeThenZeroElse is generic for all |
| // vector lengths |
| template <class V, HWY_IF_NOT_UNSIGNED_V(V), |
| HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4) | (1 << 8))> |
| HWY_API V IfNegativeThenElseZero(V v, V yes) { |
| const DFromV<decltype(v)> d; |
| return IfNegativeThenElse(v, yes, Zero(d)); |
| } |
| |
| template <class V, HWY_IF_NOT_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 2)> |
| HWY_API V IfNegativeThenElseZero(V v, V yes) { |
| return IfThenElseZero(IsNegative(v), yes); |
| } |
| |
| template <class V, HWY_IF_NOT_UNSIGNED_V(V), |
| HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4) | (1 << 8))> |
| HWY_API V IfNegativeThenZeroElse(V v, V no) { |
| const DFromV<decltype(v)> d; |
| return IfNegativeThenElse(v, Zero(d), no); |
| } |
| |
| template <class V, HWY_IF_NOT_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 2)> |
| HWY_API V IfNegativeThenZeroElse(V v, V no) { |
| return IfThenZeroElse(IsNegative(v), no); |
| } |
| |
| #endif // HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4 |
| |
| // ------------------------------ IfNegativeThenNegOrUndefIfZero |
| |
| #if HWY_TARGET <= HWY_SSSE3 |
| |
| #ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG |
| #undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG |
| #else |
| #define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG |
| #endif |
| |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> IfNegativeThenNegOrUndefIfZero(Vec128<int8_t, N> mask, |
| Vec128<int8_t, N> v) { |
| return Vec128<int8_t, N>{_mm_sign_epi8(v.raw, mask.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> IfNegativeThenNegOrUndefIfZero( |
| Vec128<int16_t, N> mask, Vec128<int16_t, N> v) { |
| return Vec128<int16_t, N>{_mm_sign_epi16(v.raw, mask.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> IfNegativeThenNegOrUndefIfZero( |
| Vec128<int32_t, N> mask, Vec128<int32_t, N> v) { |
| return Vec128<int32_t, N>{_mm_sign_epi32(v.raw, mask.raw)}; |
| } |
| |
| // Generic for all vector lengths |
| template <class V, HWY_IF_I64_D(DFromV<V>)> |
| HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) { |
| #if HWY_TARGET <= HWY_AVX3 |
| // MaskedSubOr is more efficient than IfNegativeThenElse on AVX3 |
| const DFromV<decltype(v)> d; |
| return MaskedSubOr(v, MaskFromVec(mask), Zero(d), v); |
| #else |
| // IfNegativeThenElse is more efficient than MaskedSubOr on SSE4/AVX2 |
| return IfNegativeThenElse(mask, Neg(v), v); |
| #endif |
| } |
| |
| #endif // HWY_TARGET <= HWY_SSSE3 |
| |
| // ------------------------------ ShiftLeftSame |
| |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v, |
| const int bits) { |
| #if HWY_COMPILER_GCC |
| if (__builtin_constant_p(bits)) { |
| return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, bits)}; |
| } |
| #endif |
| return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v, |
| const int bits) { |
| #if HWY_COMPILER_GCC |
| if (__builtin_constant_p(bits)) { |
| return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, bits)}; |
| } |
| #endif |
| return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v, |
| const int bits) { |
| #if HWY_COMPILER_GCC |
| if (__builtin_constant_p(bits)) { |
| return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, bits)}; |
| } |
| #endif |
| return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v, |
| const int bits) { |
| #if HWY_COMPILER_GCC |
| if (__builtin_constant_p(bits)) { |
| return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, bits)}; |
| } |
| #endif |
| return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v, |
| const int bits) { |
| #if HWY_COMPILER_GCC |
| if (__builtin_constant_p(bits)) { |
| return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, bits)}; |
| } |
| #endif |
| return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v, |
| const int bits) { |
| #if HWY_COMPILER_GCC |
| if (__builtin_constant_p(bits)) { |
| return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, bits)}; |
| } |
| #endif |
| return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) { |
| const DFromV<decltype(v)> d8; |
| // Use raw instead of BitCast to support N=1. |
| const Vec128<T, N> shifted{ |
| ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw}; |
| return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF)); |
| } |
| |
| // ------------------------------ ShiftRightSame (BroadcastSignBit) |
| |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v, |
| const int bits) { |
| #if HWY_COMPILER_GCC |
| if (__builtin_constant_p(bits)) { |
| return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, bits)}; |
| } |
| #endif |
| return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v, |
| const int bits) { |
| #if HWY_COMPILER_GCC |
| if (__builtin_constant_p(bits)) { |
| return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, bits)}; |
| } |
| #endif |
| return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v, |
| const int bits) { |
| #if HWY_COMPILER_GCC |
| if (__builtin_constant_p(bits)) { |
| return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, bits)}; |
| } |
| #endif |
| return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v, |
| const int bits) { |
| const DFromV<decltype(v)> d8; |
| // Use raw instead of BitCast to support N=1. |
| const Vec128<uint8_t, N> shifted{ |
| ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw}; |
| return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits)); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v, |
| const int bits) { |
| #if HWY_COMPILER_GCC |
| if (__builtin_constant_p(bits)) { |
| return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, bits)}; |
| } |
| #endif |
| return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v, |
| const int bits) { |
| #if HWY_COMPILER_GCC |
| if (__builtin_constant_p(bits)) { |
| return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, bits)}; |
| } |
| #endif |
| return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v, |
| const int bits) { |
| #if HWY_TARGET <= HWY_AVX3 |
| #if HWY_COMPILER_GCC |
| if (__builtin_constant_p(bits)) { |
| return Vec128<int64_t, N>{ |
| _mm_srai_epi64(v.raw, static_cast<Shift64Count>(bits))}; |
| } |
| #endif |
| return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; |
| #else |
| const DFromV<decltype(v)> di; |
| const RebindToUnsigned<decltype(di)> du; |
| const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); |
| const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits); |
| return right | sign; |
| #endif |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) { |
| const DFromV<decltype(v)> di; |
| const RebindToUnsigned<decltype(di)> du; |
| const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); |
| const auto shifted_sign = |
| BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits))); |
| return (shifted ^ shifted_sign) - shifted_sign; |
| } |
| |
| // ------------------------------ Floating-point mul / div |
| |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> operator*(Vec128<float16_t, N> a, |
| Vec128<float16_t, N> b) { |
| return Vec128<float16_t, N>{_mm_mul_ph(a.raw, b.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) { |
| return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)}; |
| } |
| HWY_API Vec128<float, 1> operator*(const Vec128<float, 1> a, |
| const Vec128<float, 1> b) { |
| return Vec128<float, 1>{_mm_mul_ss(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> operator*(const Vec128<double, N> a, |
| const Vec128<double, N> b) { |
| return Vec128<double, N>{_mm_mul_pd(a.raw, b.raw)}; |
| } |
| HWY_API Vec64<double> operator*(const Vec64<double> a, const Vec64<double> b) { |
| return Vec64<double>{_mm_mul_sd(a.raw, b.raw)}; |
| } |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| |
| #ifdef HWY_NATIVE_MUL_BY_POW2 |
| #undef HWY_NATIVE_MUL_BY_POW2 |
| #else |
| #define HWY_NATIVE_MUL_BY_POW2 |
| #endif |
| |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> MulByFloorPow2(Vec128<float16_t, N> a, |
| Vec128<float16_t, N> b) { |
| return Vec128<float16_t, N>{_mm_scalef_ph(a.raw, b.raw)}; |
| } |
| #endif |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> MulByFloorPow2(Vec128<float, N> a, |
| Vec128<float, N> b) { |
| return Vec128<float, N>{_mm_scalef_ps(a.raw, b.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<double, N> MulByFloorPow2(Vec128<double, N> a, |
| Vec128<double, N> b) { |
| return Vec128<double, N>{_mm_scalef_pd(a.raw, b.raw)}; |
| } |
| |
| // MulByPow2 is generic for all vector lengths on AVX3 |
| template <class V, HWY_IF_FLOAT_V(V)> |
| HWY_API V MulByPow2(V v, VFromD<RebindToSigned<DFromV<V>>> exp) { |
| const DFromV<decltype(v)> d; |
| return MulByFloorPow2(v, ConvertTo(d, exp)); |
| } |
| |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> operator/(const Vec128<float16_t, N> a, |
| const Vec128<float16_t, N> b) { |
| return Vec128<float16_t, N>{_mm_div_ph(a.raw, b.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float, N> operator/(const Vec128<float, N> a, |
| const Vec128<float, N> b) { |
| return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)}; |
| } |
| HWY_API Vec128<float, 1> operator/(const Vec128<float, 1> a, |
| const Vec128<float, 1> b) { |
| return Vec128<float, 1>{_mm_div_ss(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> operator/(const Vec128<double, N> a, |
| const Vec128<double, N> b) { |
| return Vec128<double, N>{_mm_div_pd(a.raw, b.raw)}; |
| } |
| HWY_API Vec64<double> operator/(const Vec64<double> a, const Vec64<double> b) { |
| return Vec64<double>{_mm_div_sd(a.raw, b.raw)}; |
| } |
| |
| // Approximate reciprocal |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> ApproximateReciprocal( |
| const Vec128<float16_t, N> v) { |
| return Vec128<float16_t, N>{_mm_rcp_ph(v.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) { |
| return Vec128<float, N>{_mm_rcp_ps(v.raw)}; |
| } |
| HWY_API Vec128<float, 1> ApproximateReciprocal(const Vec128<float, 1> v) { |
| return Vec128<float, 1>{_mm_rcp_ss(v.raw)}; |
| } |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| #ifdef HWY_NATIVE_F64_APPROX_RECIP |
| #undef HWY_NATIVE_F64_APPROX_RECIP |
| #else |
| #define HWY_NATIVE_F64_APPROX_RECIP |
| #endif |
| |
| HWY_API Vec128<double> ApproximateReciprocal(Vec128<double> v) { |
| return Vec128<double>{_mm_rcp14_pd(v.raw)}; |
| } |
| HWY_API Vec64<double> ApproximateReciprocal(Vec64<double> v) { |
| return Vec64<double>{_mm_rcp14_sd(v.raw, v.raw)}; |
| } |
| #endif |
| |
| // Generic for all vector lengths. |
| template <class V, HWY_IF_FLOAT_V(V)> |
| HWY_API V AbsDiff(V a, V b) { |
| return Abs(a - b); |
| } |
| |
| // ------------------------------ GetExponent |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| |
| #ifdef HWY_NATIVE_GET_EXPONENT |
| #undef HWY_NATIVE_GET_EXPONENT |
| #else |
| #define HWY_NATIVE_GET_EXPONENT |
| #endif |
| |
| #if HWY_HAVE_FLOAT16 |
| template <class V, HWY_IF_F16(TFromV<V>), HWY_IF_V_SIZE_LE_V(V, 16)> |
| HWY_API V GetExponent(V v) { |
| return V{_mm_getexp_ph(v.raw)}; |
| } |
| #endif |
| template <class V, HWY_IF_F32(TFromV<V>), HWY_IF_V_SIZE_LE_V(V, 16)> |
| HWY_API V GetExponent(V v) { |
| return V{_mm_getexp_ps(v.raw)}; |
| } |
| template <class V, HWY_IF_F64(TFromV<V>), HWY_IF_V_SIZE_LE_V(V, 16)> |
| HWY_API V GetExponent(V v) { |
| return V{_mm_getexp_pd(v.raw)}; |
| } |
| |
| #endif |
| |
| // ------------------------------ MaskedMinOr |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| |
| #ifdef HWY_NATIVE_MASKED_ARITH |
| #undef HWY_NATIVE_MASKED_ARITH |
| #else |
| #define HWY_NATIVE_MASKED_ARITH |
| #endif |
| |
| template <typename T, size_t N, HWY_IF_U8(T)> |
| HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| template <typename T, size_t N, HWY_IF_I8(T)> |
| HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_U16(T)> |
| HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| template <typename T, size_t N, HWY_IF_I16(T)> |
| HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_U32(T)> |
| HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| template <typename T, size_t N, HWY_IF_I32(T)> |
| HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_U64(T)> |
| HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| template <typename T, size_t N, HWY_IF_I64(T)> |
| HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_F32(T)> |
| HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_min_ps(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_F64(T)> |
| HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_min_pd(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| template <typename T, size_t N, HWY_IF_F16(T)> |
| HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_min_ph(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| |
| // ------------------------------ MaskedMaxOr |
| |
| template <typename T, size_t N, HWY_IF_U8(T)> |
| HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| template <typename T, size_t N, HWY_IF_I8(T)> |
| HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_U16(T)> |
| HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| template <typename T, size_t N, HWY_IF_I16(T)> |
| HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_U32(T)> |
| HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| template <typename T, size_t N, HWY_IF_I32(T)> |
| HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_U64(T)> |
| HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| template <typename T, size_t N, HWY_IF_I64(T)> |
| HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_F32(T)> |
| HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_max_ps(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_F64(T)> |
| HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_max_pd(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| template <typename T, size_t N, HWY_IF_F16(T)> |
| HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_max_ph(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| |
| // ------------------------------ MaskedAddOr |
| |
| template <typename T, size_t N, HWY_IF_UI8(T)> |
| HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI16(T)> |
| HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI32(T)> |
| HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI64(T)> |
| HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_F32(T)> |
| HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_add_ps(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_F64(T)> |
| HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_add_pd(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| template <typename T, size_t N, HWY_IF_F16(T)> |
| HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_add_ph(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| |
| // ------------------------------ MaskedSubOr |
| |
| template <typename T, size_t N, HWY_IF_UI8(T)> |
| HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI16(T)> |
| HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI32(T)> |
| HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI64(T)> |
| HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_F32(T)> |
| HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_F64(T)> |
| HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| template <typename T, size_t N, HWY_IF_F16(T)> |
| HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| |
| // ------------------------------ MaskedMulOr |
| |
// There are no elementwise integer mask_mul intrinsics, so emulate via
// IfThenElse. Generic for all vector lengths.
| template <class V, class M> |
| HWY_API V MaskedMulOr(V no, M m, V a, V b) { |
| return IfThenElse(m, a * b, no); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> MaskedMulOr(Vec128<float, N> no, Mask128<float, N> m, |
| Vec128<float, N> a, Vec128<float, N> b) { |
| return Vec128<float, N>{_mm_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<double, N> MaskedMulOr(Vec128<double, N> no, |
| Mask128<double, N> m, Vec128<double, N> a, |
| Vec128<double, N> b) { |
| return Vec128<double, N>{_mm_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> MaskedMulOr(Vec128<float16_t, N> no, |
| Mask128<float16_t, N> m, |
| Vec128<float16_t, N> a, |
| Vec128<float16_t, N> b) { |
| return Vec128<float16_t, N>{_mm_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| |
| // ------------------------------ MaskedDivOr |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> MaskedDivOr(Vec128<float, N> no, Mask128<float, N> m, |
| Vec128<float, N> a, Vec128<float, N> b) { |
| return Vec128<float, N>{_mm_mask_div_ps(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<double, N> MaskedDivOr(Vec128<double, N> no, |
| Mask128<double, N> m, Vec128<double, N> a, |
| Vec128<double, N> b) { |
| return Vec128<double, N>{_mm_mask_div_pd(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> MaskedDivOr(Vec128<float16_t, N> no, |
| Mask128<float16_t, N> m, |
| Vec128<float16_t, N> a, |
| Vec128<float16_t, N> b) { |
| return Vec128<float16_t, N>{_mm_mask_div_ph(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| |
| // Generic for all vector lengths |
| template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> |
| HWY_API V MaskedDivOr(V no, MFromD<DFromV<V>> m, V a, V b) { |
| return IfThenElse(m, Div(a, b), no); |
| } |
| |
| // ------------------------------ MaskedModOr |
| // Generic for all vector lengths |
| template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> |
| HWY_API V MaskedModOr(V no, MFromD<DFromV<V>> m, V a, V b) { |
| return IfThenElse(m, Mod(a, b), no); |
| } |
| |
| // ------------------------------ MaskedSatAddOr |
| |
| template <typename T, size_t N, HWY_IF_I8(T)> |
| HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_U8(T)> |
| HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_I16(T)> |
| HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_U16(T)> |
| HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ MaskedSatSubOr |
| |
| template <typename T, size_t N, HWY_IF_I8(T)> |
| HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_U8(T)> |
| HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_I16(T)> |
| HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_U16(T)> |
| HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m, |
| Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{_mm_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)}; |
| } |
| |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| // ------------------------------ Floating-point multiply-add variants |
| |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> MulAdd(Vec128<float16_t, N> mul, |
| Vec128<float16_t, N> x, |
| Vec128<float16_t, N> add) { |
| return Vec128<float16_t, N>{_mm_fmadd_ph(mul.raw, x.raw, add.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> NegMulAdd(Vec128<float16_t, N> mul, |
| Vec128<float16_t, N> x, |
| Vec128<float16_t, N> add) { |
| return Vec128<float16_t, N>{_mm_fnmadd_ph(mul.raw, x.raw, add.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> MulSub(Vec128<float16_t, N> mul, |
| Vec128<float16_t, N> x, |
| Vec128<float16_t, N> sub) { |
| return Vec128<float16_t, N>{_mm_fmsub_ph(mul.raw, x.raw, sub.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> NegMulSub(Vec128<float16_t, N> mul, |
| Vec128<float16_t, N> x, |
| Vec128<float16_t, N> sub) { |
| return Vec128<float16_t, N>{_mm_fnmsub_ph(mul.raw, x.raw, sub.raw)}; |
| } |
| |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x, |
| Vec128<float, N> add) { |
| #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) |
| return mul * x + add; |
| #else |
| return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)}; |
| #endif |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> MulAdd(Vec128<double, N> mul, Vec128<double, N> x, |
| Vec128<double, N> add) { |
| #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) |
| return mul * x + add; |
| #else |
| return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)}; |
| #endif |
| } |
| |
| // Returns add - mul * x |
| template <size_t N> |
| HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> mul, Vec128<float, N> x, |
| Vec128<float, N> add) { |
| #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) |
| return add - mul * x; |
| #else |
| return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)}; |
| #endif |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> NegMulAdd(Vec128<double, N> mul, Vec128<double, N> x, |
| Vec128<double, N> add) { |
| #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) |
| return add - mul * x; |
| #else |
| return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)}; |
| #endif |
| } |
| |
| // Returns mul * x - sub |
| template <size_t N> |
| HWY_API Vec128<float, N> MulSub(Vec128<float, N> mul, Vec128<float, N> x, |
| Vec128<float, N> sub) { |
| #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) |
| return mul * x - sub; |
| #else |
| return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)}; |
| #endif |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> MulSub(Vec128<double, N> mul, Vec128<double, N> x, |
| Vec128<double, N> sub) { |
| #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) |
| return mul * x - sub; |
| #else |
| return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)}; |
| #endif |
| } |
| |
| // Returns -mul * x - sub |
| template <size_t N> |
| HWY_API Vec128<float, N> NegMulSub(Vec128<float, N> mul, Vec128<float, N> x, |
| Vec128<float, N> sub) { |
| #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) |
| return Neg(mul) * x - sub; |
| #else |
| return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)}; |
| #endif |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> NegMulSub(Vec128<double, N> mul, Vec128<double, N> x, |
| Vec128<double, N> sub) { |
| #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) |
| return Neg(mul) * x - sub; |
| #else |
| return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)}; |
| #endif |
| } |
| |
| #if HWY_TARGET <= HWY_SSSE3 |
| |
| #undef HWY_IF_MULADDSUB_V |
| #define HWY_IF_MULADDSUB_V(V) \ |
| HWY_IF_LANES_GT_D(DFromV<V>, 1), \ |
| HWY_IF_T_SIZE_ONE_OF_V( \ |
| V, (1 << 1) | ((hwy::IsFloat<TFromV<V>>()) \ |
| ? 0 \ |
| : ((1 << 2) | (1 << 4) | (1 << 8)))) |
| |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N, HWY_IF_LANES_GT(N, 1)> |
| HWY_API Vec128<float16_t, N> MulAddSub(Vec128<float16_t, N> mul, |
| Vec128<float16_t, N> x, |
| Vec128<float16_t, N> sub_or_add) { |
| return Vec128<float16_t, N>{_mm_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| |
| template <size_t N, HWY_IF_LANES_GT(N, 1)> |
| HWY_API Vec128<float, N> MulAddSub(Vec128<float, N> mul, Vec128<float, N> x, |
| Vec128<float, N> sub_or_add) { |
| #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) |
| return AddSub(mul * x, sub_or_add); |
| #else |
| return Vec128<float, N>{_mm_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)}; |
| #endif |
| } |
| |
| HWY_API Vec128<double> MulAddSub(Vec128<double> mul, Vec128<double> x, |
| Vec128<double> sub_or_add) { |
| #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) |
| return AddSub(mul * x, sub_or_add); |
| #else |
| return Vec128<double>{_mm_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)}; |
| #endif |
| } |
| |
| #endif // HWY_TARGET <= HWY_SSSE3 |
| |
| // ------------------------------ Floating-point square root |
| |
| // Full precision square root |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> Sqrt(Vec128<float16_t, N> v) { |
| return Vec128<float16_t, N>{_mm_sqrt_ph(v.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float, N> Sqrt(Vec128<float, N> v) { |
| return Vec128<float, N>{_mm_sqrt_ps(v.raw)}; |
| } |
| HWY_API Vec128<float, 1> Sqrt(Vec128<float, 1> v) { |
| return Vec128<float, 1>{_mm_sqrt_ss(v.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Sqrt(Vec128<double, N> v) { |
| return Vec128<double, N>{_mm_sqrt_pd(v.raw)}; |
| } |
| HWY_API Vec64<double> Sqrt(Vec64<double> v) { |
| return Vec64<double>{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)}; |
| } |
| |
| // Approximate reciprocal square root |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> ApproximateReciprocalSqrt(Vec128<float16_t, N> v) { |
| return Vec128<float16_t, N>{_mm_rsqrt_ph(v.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) { |
| return Vec128<float, N>{_mm_rsqrt_ps(v.raw)}; |
| } |
| HWY_API Vec128<float, 1> ApproximateReciprocalSqrt(Vec128<float, 1> v) { |
| return Vec128<float, 1>{_mm_rsqrt_ss(v.raw)}; |
| } |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| #ifdef HWY_NATIVE_F64_APPROX_RSQRT |
| #undef HWY_NATIVE_F64_APPROX_RSQRT |
| #else |
| #define HWY_NATIVE_F64_APPROX_RSQRT |
| #endif |
| |
| HWY_API Vec64<double> ApproximateReciprocalSqrt(Vec64<double> v) { |
| return Vec64<double>{_mm_rsqrt14_sd(v.raw, v.raw)}; |
| } |
| HWY_API Vec128<double> ApproximateReciprocalSqrt(Vec128<double> v) { |
| #if HWY_COMPILER_MSVC |
| const DFromV<decltype(v)> d; |
| return Vec128<double>{_mm_mask_rsqrt14_pd( |
| Undefined(d).raw, static_cast<__mmask8>(0xFF), v.raw)}; |
| #else |
| return Vec128<double>{_mm_rsqrt14_pd(v.raw)}; |
| #endif |
| } |
| #endif |
| |
| // ------------------------------ Min (Gt, IfThenElse) |
| |
| namespace detail { |
| |
| template <typename T, size_t N> |
| HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> MinU(const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| const DFromV<decltype(a)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const RebindToSigned<decltype(d)> di; |
| const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1))); |
| const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); |
| return IfThenElse(gt, b, a); |
| } |
| |
| } // namespace detail |
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { |
| return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| return Vec128<uint16_t, N>{ |
| _mm_sub_epi16(a.raw, _mm_subs_epu16(a.raw, b.raw))}; |
| #else |
| return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)}; |
| #endif |
| } |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| return detail::MinU(a, b); |
| #else |
| return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)}; |
| #endif |
| } |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)}; |
| #else |
| return detail::MinU(a, b); |
| #endif |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| return IfThenElse(a < b, a, b); |
| #else |
| return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)}; |
| #endif |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| return IfThenElse(a < b, a, b); |
| #else |
| return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)}; |
| #endif |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)}; |
| #else |
| return IfThenElse(a < b, a, b); |
| #endif |
| } |
| |
| // Float |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> Min(Vec128<float16_t, N> a, |
| Vec128<float16_t, N> b) { |
| return Vec128<float16_t, N>{_mm_min_ph(a.raw, b.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) { |
| return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Min(Vec128<double, N> a, Vec128<double, N> b) { |
| return Vec128<double, N>{_mm_min_pd(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ Max (Gt, IfThenElse) |
| |
| namespace detail { |
| template <typename T, size_t N> |
| HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> MaxU(const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| const DFromV<decltype(a)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const RebindToSigned<decltype(d)> di; |
| const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1))); |
| const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); |
| return IfThenElse(gt, a, b); |
| } |
| |
| } // namespace detail |
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { |
| return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| return Vec128<uint16_t, N>{ |
| _mm_add_epi16(a.raw, _mm_subs_epu16(b.raw, a.raw))}; |
| #else |
| return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)}; |
| #endif |
| } |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| return detail::MaxU(a, b); |
| #else |
| return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)}; |
| #endif |
| } |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)}; |
| #else |
| return detail::MaxU(a, b); |
| #endif |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| return IfThenElse(a < b, b, a); |
| #else |
| return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)}; |
| #endif |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| return IfThenElse(a < b, b, a); |
| #else |
| return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)}; |
| #endif |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)}; |
| #else |
| return IfThenElse(a < b, b, a); |
| #endif |
| } |
| |
| // Float |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> Max(Vec128<float16_t, N> a, |
| Vec128<float16_t, N> b) { |
| return Vec128<float16_t, N>{_mm_max_ph(a.raw, b.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) { |
| return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) { |
| return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)}; |
| } |
| |
| // ================================================== MEMORY (3) |
| |
| // ------------------------------ Non-temporal stores |
| |
| // On clang6, we see incorrect code generated for _mm_stream_pi, so |
| // round even partial vectors up to 16 bytes. |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)> |
| HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) { |
| const RebindToUnsigned<decltype(d)> du; // for float16_t |
| _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), BitCast(du, v).raw); |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API void Stream(VFromD<D> v, D /* tag */, float* HWY_RESTRICT aligned) { |
| _mm_stream_ps(aligned, v.raw); |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API void Stream(VFromD<D> v, D /* tag */, double* HWY_RESTRICT aligned) { |
| _mm_stream_pd(aligned, v.raw); |
| } |
| |
| // ------------------------------ Scatter |
| |
| // Work around warnings in the intrinsic definitions (passing -1 as a mask). |
| HWY_DIAGNOSTICS(push) |
| HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") |
| |
| // Unfortunately the GCC/Clang intrinsics do not accept int64_t*. |
| using GatherIndex64 = long long int; // NOLINT(runtime/int) |
| static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type"); |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| |
| #ifdef HWY_NATIVE_SCATTER |
| #undef HWY_NATIVE_SCATTER |
| #else |
| #define HWY_NATIVE_SCATTER |
| #endif |
| |
| namespace detail { |
| |
| template <int kScale, class D, class VI, HWY_IF_UI32_D(D)> |
| HWY_INLINE void NativeScatter128(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base, |
| VI index) { |
| if (d.MaxBytes() == 16) { |
| _mm_i32scatter_epi32(base, index.raw, v.raw, kScale); |
| } else { |
| const __mmask8 mask = (1u << MaxLanes(d)) - 1; |
| _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, kScale); |
| } |
| } |
| |
| template <int kScale, class D, class VI, HWY_IF_UI64_D(D)> |
| HWY_INLINE void NativeScatter128(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base, |
| VI index) { |
| if (d.MaxBytes() == 16) { |
| _mm_i64scatter_epi64(base, index.raw, v.raw, kScale); |
| } else { |
| const __mmask8 mask = (1u << MaxLanes(d)) - 1; |
| _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, kScale); |
| } |
| } |
| |
| template <int kScale, class D, class VI, HWY_IF_F32_D(D)> |
| HWY_INLINE void NativeScatter128(VFromD<D> v, D d, float* HWY_RESTRICT base, |
| VI index) { |
| if (d.MaxBytes() == 16) { |
| _mm_i32scatter_ps(base, index.raw, v.raw, kScale); |
| } else { |
| const __mmask8 mask = (1u << MaxLanes(d)) - 1; |
| _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, kScale); |
| } |
| } |
| |
| template <int kScale, class D, class VI, HWY_IF_F64_D(D)> |
| HWY_INLINE void NativeScatter128(VFromD<D> v, D d, double* HWY_RESTRICT base, |
| VI index) { |
| if (d.MaxBytes() == 16) { |
| _mm_i64scatter_pd(base, index.raw, v.raw, kScale); |
| } else { |
| const __mmask8 mask = (1u << MaxLanes(d)) - 1; |
| _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, kScale); |
| } |
| } |
| |
| template <int kScale, class D, class VI, HWY_IF_UI32_D(D)> |
| HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d, |
| TFromD<D>* HWY_RESTRICT base, VI index) { |
| // For partial vectors, ensure upper mask lanes are zero to prevent faults. |
| if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); |
| _mm_mask_i32scatter_epi32(base, m.raw, index.raw, v.raw, kScale); |
| } |
| |
| template <int kScale, class D, class VI, HWY_IF_UI64_D(D)> |
| HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d, |
| TFromD<D>* HWY_RESTRICT base, VI index) { |
| // For partial vectors, ensure upper mask lanes are zero to prevent faults. |
| if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); |
| _mm_mask_i64scatter_epi64(base, m.raw, index.raw, v.raw, kScale); |
| } |
| |
| template <int kScale, class D, class VI, HWY_IF_F32_D(D)> |
| HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d, |
| float* HWY_RESTRICT base, VI index) { |
| // For partial vectors, ensure upper mask lanes are zero to prevent faults. |
| if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); |
| _mm_mask_i32scatter_ps(base, m.raw, index.raw, v.raw, kScale); |
| } |
| |
| template <int kScale, class D, class VI, HWY_IF_F64_D(D)> |
| HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d, |
| double* HWY_RESTRICT base, VI index) { |
| // For partial vectors, ensure upper mask lanes are zero to prevent faults. |
| if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); |
| _mm_mask_i64scatter_pd(base, m.raw, index.raw, v.raw, kScale); |
| } |
| |
| } // namespace detail |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API void ScatterOffset(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base, |
| VFromD<RebindToSigned<D>> offset) { |
| return detail::NativeScatter128<1>(v, d, base, offset); |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base, |
| VFromD<RebindToSigned<D>> index) { |
| return detail::NativeScatter128<sizeof(TFromD<D>)>(v, d, base, index); |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D d, |
| TFromD<D>* HWY_RESTRICT base, |
| VFromD<RebindToSigned<D>> index) { |
| return detail::NativeMaskedScatter128<sizeof(TFromD<D>)>(v, m, d, base, |
| index); |
| } |
| |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| // ------------------------------ Gather (Load/Store) |
| |
| #if HWY_TARGET <= HWY_AVX2 |
| |
| #ifdef HWY_NATIVE_GATHER |
| #undef HWY_NATIVE_GATHER |
| #else |
| #define HWY_NATIVE_GATHER |
| #endif |
| |
| namespace detail { |
| |
| template <int kScale, typename T, size_t N, HWY_IF_UI32(T)> |
| HWY_INLINE Vec128<T, N> NativeGather128(const T* HWY_RESTRICT base, |
| Vec128<int32_t, N> indices) { |
| return Vec128<T, N>{_mm_i32gather_epi32( |
| reinterpret_cast<const int32_t*>(base), indices.raw, kScale)}; |
| } |
| |
| template <int kScale, typename T, size_t N, HWY_IF_UI64(T)> |
| HWY_INLINE Vec128<T, N> NativeGather128(const T* HWY_RESTRICT base, |
| Vec128<int64_t, N> indices) { |
| return Vec128<T, N>{_mm_i64gather_epi64( |
| reinterpret_cast<const GatherIndex64*>(base), indices.raw, kScale)}; |
| } |
| |
| template <int kScale, size_t N> |
| HWY_INLINE Vec128<float, N> NativeGather128(const float* HWY_RESTRICT base, |
| Vec128<int32_t, N> indices) { |
| return Vec128<float, N>{_mm_i32gather_ps(base, indices.raw, kScale)}; |
| } |
| |
| template <int kScale, size_t N> |
| HWY_INLINE Vec128<double, N> NativeGather128(const double* HWY_RESTRICT base, |
| Vec128<int64_t, N> indices) { |
| return Vec128<double, N>{_mm_i64gather_pd(base, indices.raw, kScale)}; |
| } |
| |
| template <int kScale, typename T, size_t N, HWY_IF_UI32(T)> |
| HWY_INLINE Vec128<T, N> NativeMaskedGatherOr128(Vec128<T, N> no, |
| Mask128<T, N> m, |
| const T* HWY_RESTRICT base, |
| Vec128<int32_t, N> indices) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<T, N>{_mm_mmask_i32gather_epi32( |
| no.raw, m.raw, indices.raw, reinterpret_cast<const int32_t*>(base), |
| kScale)}; |
| #else |
| return Vec128<T, N>{ |
| _mm_mask_i32gather_epi32(no.raw, reinterpret_cast<const int32_t*>(base), |
| indices.raw, m.raw, kScale)}; |
| #endif |
| } |
| |
| template <int kScale, typename T, size_t N, HWY_IF_UI64(T)> |
| HWY_INLINE Vec128<T, N> NativeMaskedGatherOr128(Vec128<T, N> no, |
| Mask128<T, N> m, |
| const T* HWY_RESTRICT base, |
| Vec128<int64_t, N> indices) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<T, N>{_mm_mmask_i64gather_epi64( |
| no.raw, m.raw, indices.raw, reinterpret_cast<const GatherIndex64*>(base), |
| kScale)}; |
| #else |
| return Vec128<T, N>{_mm_mask_i64gather_epi64( |
| no.raw, reinterpret_cast<const GatherIndex64*>(base), indices.raw, m.raw, |
| kScale)}; |
| #endif |
| } |
| |
| template <int kScale, size_t N> |
| HWY_INLINE Vec128<float, N> NativeMaskedGatherOr128( |
| Vec128<float, N> no, Mask128<float, N> m, const float* HWY_RESTRICT base, |
| Vec128<int32_t, N> indices) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<float, N>{ |
| _mm_mmask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)}; |
| #else |
| return Vec128<float, N>{ |
| _mm_mask_i32gather_ps(no.raw, base, indices.raw, m.raw, kScale)}; |
| #endif |
| } |
| |
| template <int kScale, size_t N> |
| HWY_INLINE Vec128<double, N> NativeMaskedGatherOr128( |
| Vec128<double, N> no, Mask128<double, N> m, const double* HWY_RESTRICT base, |
| Vec128<int64_t, N> indices) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<double, N>{ |
| _mm_mmask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)}; |
| #else |
| return Vec128<double, N>{ |
| _mm_mask_i64gather_pd(no.raw, base, indices.raw, m.raw, kScale)}; |
| #endif |
| } |
| |
| } // namespace detail |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base, |
| VFromD<RebindToSigned<D>> offsets) { |
| return detail::NativeGather128<1>(base, offsets); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>> |
| HWY_API VFromD<D> GatherIndex(D /*d*/, const T* HWY_RESTRICT base, |
| VFromD<RebindToSigned<D>> indices) { |
| return detail::NativeGather128<sizeof(T)>(base, indices); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>> |
| HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d, |
| const T* HWY_RESTRICT base, |
| VFromD<RebindToSigned<D>> indices) { |
| // For partial vectors, ensure upper mask lanes are zero to prevent faults. |
| if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); |
| |
| return detail::NativeMaskedGatherOr128<sizeof(T)>(no, m, base, indices); |
| } |
| |
| // Generic for all vector lengths. |
| template <class D> |
| HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d, |
| const TFromD<D>* HWY_RESTRICT base, |
| VFromD<RebindToSigned<D>> indices) { |
| return MaskedGatherIndexOr(Zero(d), m, d, base, indices); |
| } |
| |
| #endif // HWY_TARGET <= HWY_AVX2 |
| |
| HWY_DIAGNOSTICS(pop) |
| |
| // ================================================== SWIZZLE (2) |
| |
| // ------------------------------ LowerHalf |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) { |
| return VFromD<D>{v.raw}; |
| } |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) { |
| return Vec128<T, N / 2>{v.raw}; |
| } |
| |
| // ------------------------------ ShiftLeftBytes |
| |
| template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) { |
| static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); |
| const RebindToUnsigned<decltype(d)> du; |
| return BitCast( |
| d, VFromD<decltype(du)>{_mm_slli_si128(BitCast(du, v).raw, kBytes)}); |
| } |
| |
| // Generic for all vector lengths. |
| template <int kBytes, class V> |
| HWY_API V ShiftLeftBytes(const V v) { |
| return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v); |
| } |
| |
| // ------------------------------ ShiftLeftLanes |
| |
| // Generic for all vector lengths. |
| template <int kLanes, class D> |
| HWY_API VFromD<D> ShiftLeftLanes(D d, const VFromD<D> v) { |
| const Repartition<uint8_t, decltype(d)> d8; |
| return BitCast(d, ShiftLeftBytes<kLanes * sizeof(TFromD<D>)>(BitCast(d8, v))); |
| } |
| |
| // Generic for all vector lengths. |
| template <int kLanes, class V> |
| HWY_API V ShiftLeftLanes(const V v) { |
| return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v); |
| } |
| |
| // ------------------------------ ShiftRightBytes |
| template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) { |
| static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); |
| const RebindToUnsigned<decltype(d)> du; |
| // For partial vectors, clear upper lanes so we shift in zeros. |
| if (d.MaxBytes() != 16) { |
| const Full128<TFromD<D>> dfull; |
| const VFromD<decltype(dfull)> vfull{v.raw}; |
| v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw}; |
| } |
| return BitCast( |
| d, VFromD<decltype(du)>{_mm_srli_si128(BitCast(du, v).raw, kBytes)}); |
| } |
| |
| // ------------------------------ ShiftRightLanes |
| // Generic for all vector lengths. |
| template <int kLanes, class D> |
| HWY_API VFromD<D> ShiftRightLanes(D d, const VFromD<D> v) { |
| const Repartition<uint8_t, decltype(d)> d8; |
| constexpr size_t kBytes = kLanes * sizeof(TFromD<D>); |
| return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v))); |
| } |
| |
| // ------------------------------ UpperHalf (ShiftRightBytes) |
| |
| // Full input: copy hi into lo (smaller instruction encoding than shifts). |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)> |
| HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) { |
| const Twice<RebindToUnsigned<decltype(d)>> dut; |
| using VUT = VFromD<decltype(dut)>; // for float16_t |
| const VUT vut = BitCast(dut, v); |
| return BitCast(d, LowerHalf(VUT{_mm_unpackhi_epi64(vut.raw, vut.raw)})); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)> |
| HWY_API Vec64<float> UpperHalf(D /* tag */, Vec128<float> v) { |
| return Vec64<float>{_mm_movehl_ps(v.raw, v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)> |
| HWY_API Vec64<double> UpperHalf(D /* tag */, Vec128<double> v) { |
| return Vec64<double>{_mm_unpackhi_pd(v.raw, v.raw)}; |
| } |
| |
| // Partial |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4)> |
| HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) { |
| return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v)); |
| } |
| |
| // ------------------------------ ExtractLane (UpperHalf) |
| |
| namespace detail { |
| |
| template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_INLINE T ExtractLane(const Vec128<T, N> v) { |
| static_assert(kLane < N, "Lane index out of bounds"); |
| #if HWY_TARGET >= HWY_SSSE3 |
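  // SSE2/SSSE3 lack _mm_extract_epi8; extract the containing 16-bit pair and
  // select the requested byte.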
| const int pair = _mm_extract_epi16(v.raw, kLane / 2); |
| constexpr int kShift = kLane & 1 ? 8 : 0; |
| return static_cast<T>((pair >> kShift) & 0xFF); |
| #else |
| return static_cast<T>(_mm_extract_epi8(v.raw, kLane) & 0xFF); |
| #endif |
| } |
| |
| template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)> |
| HWY_INLINE T ExtractLane(const Vec128<T, N> v) { |
| static_assert(kLane < N, "Lane index out of bounds"); |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const uint16_t lane = static_cast<uint16_t>( |
| _mm_extract_epi16(BitCast(du, v).raw, kLane) & 0xFFFF); |
| return BitCastScalar<T>(lane); |
| } |
| |
| template <size_t kLane, typename T, size_t N, HWY_IF_UI32(T)> |
| HWY_INLINE T ExtractLane(const Vec128<T, N> v) { |
| static_assert(kLane < N, "Lane index out of bounds"); |
| #if HWY_TARGET >= HWY_SSSE3 |
| return static_cast<T>(_mm_cvtsi128_si32( |
| (kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, kLane))); |
| #else |
| return static_cast<T>(_mm_extract_epi32(v.raw, kLane)); |
| #endif |
| } |
| |
| template <size_t kLane, typename T, size_t N, HWY_IF_UI64(T)> |
| HWY_INLINE T ExtractLane(const Vec128<T, N> v) { |
| static_assert(kLane < N, "Lane index out of bounds"); |
| #if HWY_ARCH_X86_32 |
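  // 32-bit x86 lacks 64-bit GPR extracts, so round-trip through memory.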
| alignas(16) T lanes[2]; |
| Store(v, DFromV<decltype(v)>(), lanes); |
| return lanes[kLane]; |
| #elif HWY_TARGET >= HWY_SSSE3 |
| return static_cast<T>( |
| _mm_cvtsi128_si64((kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, 0xEE))); |
| #else |
| return static_cast<T>(_mm_extract_epi64(v.raw, kLane)); |
| #endif |
| } |
| |
| template <size_t kLane, size_t N> |
| HWY_INLINE float ExtractLane(const Vec128<float, N> v) { |
| static_assert(kLane < N, "Lane index out of bounds"); |
| #if HWY_TARGET >= HWY_SSSE3 |
| return _mm_cvtss_f32((kLane == 0) ? v.raw |
| : _mm_shuffle_ps(v.raw, v.raw, kLane)); |
| #else |
  // _mm_extract_ps returns the lane's bit pattern as int rather than float.
| const int32_t bits = _mm_extract_ps(v.raw, kLane); |
| return BitCastScalar<float>(bits); |
| #endif |
| } |
| |
| // There is no extract_pd; two overloads because there is no UpperHalf for N=1. |
| template <size_t kLane> |
| HWY_INLINE double ExtractLane(const Vec64<double> v) { |
| static_assert(kLane == 0, "Lane index out of bounds"); |
| return GetLane(v); |
| } |
| |
| template <size_t kLane> |
| HWY_INLINE double ExtractLane(const Vec128<double> v) { |
| static_assert(kLane < 2, "Lane index out of bounds"); |
| const Half<DFromV<decltype(v)>> dh; |
| return kLane == 0 ? GetLane(v) : GetLane(UpperHalf(dh, v)); |
| } |
| |
| } // namespace detail |
| |
| // Requires one overload per vector length because ExtractLane<3> may be a |
| // compile error if it calls _mm_extract_epi64. |
| template <typename T> |
| HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) { |
| HWY_DASSERT(i == 0); |
| (void)i; |
| return GetLane(v); |
| } |
| |
| template <typename T> |
| HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(i)) { |
| switch (i) { |
| case 0: |
| return detail::ExtractLane<0>(v); |
| case 1: |
| return detail::ExtractLane<1>(v); |
| } |
| } |
| #endif |
| alignas(16) T lanes[2]; |
| Store(v, DFromV<decltype(v)>(), lanes); |
| return lanes[i]; |
| } |
| |
| template <typename T> |
| HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(i)) { |
| switch (i) { |
| case 0: |
| return detail::ExtractLane<0>(v); |
| case 1: |
| return detail::ExtractLane<1>(v); |
| case 2: |
| return detail::ExtractLane<2>(v); |
| case 3: |
| return detail::ExtractLane<3>(v); |
| } |
| } |
| #endif |
| alignas(16) T lanes[4]; |
| Store(v, DFromV<decltype(v)>(), lanes); |
| return lanes[i]; |
| } |
| |
| template <typename T> |
| HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(i)) { |
| switch (i) { |
| case 0: |
| return detail::ExtractLane<0>(v); |
| case 1: |
| return detail::ExtractLane<1>(v); |
| case 2: |
| return detail::ExtractLane<2>(v); |
| case 3: |
| return detail::ExtractLane<3>(v); |
| case 4: |
| return detail::ExtractLane<4>(v); |
| case 5: |
| return detail::ExtractLane<5>(v); |
| case 6: |
| return detail::ExtractLane<6>(v); |
| case 7: |
| return detail::ExtractLane<7>(v); |
| } |
| } |
| #endif |
| alignas(16) T lanes[8]; |
| Store(v, DFromV<decltype(v)>(), lanes); |
| return lanes[i]; |
| } |
| |
| template <typename T> |
| HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(i)) { |
| switch (i) { |
| case 0: |
| return detail::ExtractLane<0>(v); |
| case 1: |
| return detail::ExtractLane<1>(v); |
| case 2: |
| return detail::ExtractLane<2>(v); |
| case 3: |
| return detail::ExtractLane<3>(v); |
| case 4: |
| return detail::ExtractLane<4>(v); |
| case 5: |
| return detail::ExtractLane<5>(v); |
| case 6: |
| return detail::ExtractLane<6>(v); |
| case 7: |
| return detail::ExtractLane<7>(v); |
| case 8: |
| return detail::ExtractLane<8>(v); |
| case 9: |
| return detail::ExtractLane<9>(v); |
| case 10: |
| return detail::ExtractLane<10>(v); |
| case 11: |
| return detail::ExtractLane<11>(v); |
| case 12: |
| return detail::ExtractLane<12>(v); |
| case 13: |
| return detail::ExtractLane<13>(v); |
| case 14: |
| return detail::ExtractLane<14>(v); |
| case 15: |
| return detail::ExtractLane<15>(v); |
| } |
| } |
| #endif |
| alignas(16) T lanes[16]; |
| Store(v, DFromV<decltype(v)>(), lanes); |
| return lanes[i]; |
| } |
| |
| // ------------------------------ InsertLane (UpperHalf) |
| |
| namespace detail { |
| |
| template <class V> |
| HWY_INLINE V InsertLaneUsingBroadcastAndBlend(V v, size_t i, TFromV<V> t) { |
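  // Build a mask that is true only for lane i, then blend Set(d, t) into v.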
| const DFromV<decltype(v)> d; |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| using RawMask = decltype(MaskFromVec(VFromD<decltype(d)>()).raw); |
| const auto mask = MFromD<decltype(d)>{static_cast<RawMask>(uint64_t{1} << i)}; |
| #else |
| const RebindToUnsigned<decltype(d)> du; |
| using TU = TFromD<decltype(du)>; |
| const auto mask = RebindMask(d, Iota(du, 0) == Set(du, static_cast<TU>(i))); |
| #endif |
| |
| return IfThenElse(mask, Set(d, t), v); |
| } |
| |
| template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { |
| static_assert(kLane < N, "Lane index out of bounds"); |
| #if HWY_TARGET >= HWY_SSSE3 |
| return InsertLaneUsingBroadcastAndBlend(v, kLane, t); |
| #else |
| return Vec128<T, N>{_mm_insert_epi8(v.raw, t, kLane)}; |
| #endif |
| } |
| |
| template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)> |
| HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { |
| static_assert(kLane < N, "Lane index out of bounds"); |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const uint16_t bits = BitCastScalar<uint16_t>(t); |
| return BitCast(d, VFromD<decltype(du)>{ |
| _mm_insert_epi16(BitCast(du, v).raw, bits, kLane)}); |
| } |
| |
| template <size_t kLane, typename T, size_t N, HWY_IF_UI32(T)> |
| HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { |
| static_assert(kLane < N, "Lane index out of bounds"); |
| #if HWY_TARGET >= HWY_SSSE3 |
| return InsertLaneUsingBroadcastAndBlend(v, kLane, t); |
| #else |
| const MakeSigned<T> ti = BitCastScalar<MakeSigned<T>>(t); |
| return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)}; |
| #endif |
| } |
| |
| template <size_t kLane, typename T, size_t N, HWY_IF_UI64(T)> |
| HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { |
| static_assert(kLane < N, "Lane index out of bounds"); |
| #if HWY_TARGET >= HWY_SSSE3 || HWY_ARCH_X86_32 |
| const DFromV<decltype(v)> d; |
| const RebindToFloat<decltype(d)> df; |
| const auto vt = BitCast(df, Set(d, t)); |
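  // _mm_shuffle_pd imm8: bit 0 selects the lane taken from the first operand
  // (output lane 0), bit 1 the lane taken from the second (output lane 1).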
| if (kLane == 0) { |
| return BitCast( |
| d, Vec128<double, N>{_mm_shuffle_pd(vt.raw, BitCast(df, v).raw, 2)}); |
| } |
| return BitCast( |
| d, Vec128<double, N>{_mm_shuffle_pd(BitCast(df, v).raw, vt.raw, 0)}); |
| #else |
| const MakeSigned<T> ti = BitCastScalar<MakeSigned<T>>(t); |
| return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)}; |
| #endif |
| } |
| |
| template <size_t kLane, size_t N> |
| HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) { |
| static_assert(kLane < N, "Lane index out of bounds"); |
| #if HWY_TARGET >= HWY_SSSE3 |
| return InsertLaneUsingBroadcastAndBlend(v, kLane, t); |
| #else |
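  // insertps imm8: bits [5:4] select the destination lane (hence kLane << 4);
  // the source lane (bits [7:6]) and zero mask (bits [3:0]) are zero.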
| return Vec128<float, N>{_mm_insert_ps(v.raw, _mm_set_ss(t), kLane << 4)}; |
| #endif |
| } |
| |
| // There is no insert_pd; two overloads because there is no UpperHalf for N=1. |
| template <size_t kLane> |
| HWY_INLINE Vec128<double, 1> InsertLane(const Vec128<double, 1> v, double t) { |
| static_assert(kLane == 0, "Lane index out of bounds"); |
| return Set(DFromV<decltype(v)>(), t); |
| } |
| |
| template <size_t kLane> |
| HWY_INLINE Vec128<double> InsertLane(const Vec128<double> v, double t) { |
| static_assert(kLane < 2, "Lane index out of bounds"); |
| const DFromV<decltype(v)> d; |
| const Vec128<double> vt = Set(d, t); |
| if (kLane == 0) { |
| return Vec128<double>{_mm_shuffle_pd(vt.raw, v.raw, 2)}; |
| } |
| return Vec128<double>{_mm_shuffle_pd(v.raw, vt.raw, 0)}; |
| } |
| |
| } // namespace detail |
| |
| // Requires one overload per vector length because InsertLane<3> may be a |
| // compile error if it calls _mm_insert_epi64. |
| |
| template <typename T> |
| HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) { |
| HWY_DASSERT(i == 0); |
| (void)i; |
| return Set(DFromV<decltype(v)>(), t); |
| } |
| |
| template <typename T> |
| HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(i)) { |
| switch (i) { |
| case 0: |
| return detail::InsertLane<0>(v, t); |
| case 1: |
| return detail::InsertLane<1>(v, t); |
| } |
| } |
| #endif |
| return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); |
| } |
| |
| template <typename T> |
| HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(i)) { |
| switch (i) { |
| case 0: |
| return detail::InsertLane<0>(v, t); |
| case 1: |
| return detail::InsertLane<1>(v, t); |
| case 2: |
| return detail::InsertLane<2>(v, t); |
| case 3: |
| return detail::InsertLane<3>(v, t); |
| } |
| } |
| #endif |
| return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); |
| } |
| |
| template <typename T> |
| HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(i)) { |
| switch (i) { |
| case 0: |
| return detail::InsertLane<0>(v, t); |
| case 1: |
| return detail::InsertLane<1>(v, t); |
| case 2: |
| return detail::InsertLane<2>(v, t); |
| case 3: |
| return detail::InsertLane<3>(v, t); |
| case 4: |
| return detail::InsertLane<4>(v, t); |
| case 5: |
| return detail::InsertLane<5>(v, t); |
| case 6: |
| return detail::InsertLane<6>(v, t); |
| case 7: |
| return detail::InsertLane<7>(v, t); |
| } |
| } |
| #endif |
| return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); |
| } |
| |
| template <typename T> |
| HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(i)) { |
| switch (i) { |
| case 0: |
| return detail::InsertLane<0>(v, t); |
| case 1: |
| return detail::InsertLane<1>(v, t); |
| case 2: |
| return detail::InsertLane<2>(v, t); |
| case 3: |
| return detail::InsertLane<3>(v, t); |
| case 4: |
| return detail::InsertLane<4>(v, t); |
| case 5: |
| return detail::InsertLane<5>(v, t); |
| case 6: |
| return detail::InsertLane<6>(v, t); |
| case 7: |
| return detail::InsertLane<7>(v, t); |
| case 8: |
| return detail::InsertLane<8>(v, t); |
| case 9: |
| return detail::InsertLane<9>(v, t); |
| case 10: |
| return detail::InsertLane<10>(v, t); |
| case 11: |
| return detail::InsertLane<11>(v, t); |
| case 12: |
| return detail::InsertLane<12>(v, t); |
| case 13: |
| return detail::InsertLane<13>(v, t); |
| case 14: |
| return detail::InsertLane<14>(v, t); |
| case 15: |
| return detail::InsertLane<15>(v, t); |
| } |
| } |
| #endif |
| return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); |
| } |
| |
| // ------------------------------ CombineShiftRightBytes |
| |
| #if HWY_TARGET == HWY_SSE2 |
| template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16)> |
| HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) { |
| static_assert(0 < kBytes && kBytes < 16, "kBytes invalid"); |
| return Or(ShiftRightBytes<kBytes>(d, lo), ShiftLeftBytes<16 - kBytes>(d, hi)); |
| } |
| template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) { |
| constexpr size_t kSize = d.MaxBytes(); |
| static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); |
| |
| const Twice<decltype(d)> dt; |
| return VFromD<D>{ShiftRightBytes<kBytes>(dt, Combine(dt, hi, lo)).raw}; |
| } |
| #else |
| template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16)> |
| HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) { |
| const Repartition<uint8_t, decltype(d)> d8; |
| return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8( |
| BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)}); |
| } |
| |
| template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) { |
| constexpr size_t kSize = d.MaxBytes(); |
| static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); |
| const Repartition<uint8_t, decltype(d)> d8; |
| using V8 = Vec128<uint8_t>; |
| const DFromV<V8> dfull8; |
| const Repartition<TFromD<D>, decltype(dfull8)> dfull; |
| const V8 hi8{BitCast(d8, hi).raw}; |
| // Move into most-significant bytes |
| const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw}); |
| const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8); |
| return VFromD<D>{BitCast(dfull, r).raw}; |
| } |
| #endif |
| |
| // ------------------------------ Broadcast/splat any lane |
| |
| template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| using VU = VFromD<decltype(du)>; |
| const VU vu = BitCast(du, v); // for float16_t |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
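  // 0x55 * kLane replicates the 2-bit lane index into all four fields of the
  // shuffle control (e.g. kLane=2 -> 0xAA), broadcasting that word.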
| if (kLane < 4) { |
| const __m128i lo = _mm_shufflelo_epi16(vu.raw, (0x55 * kLane) & 0xFF); |
| return BitCast(d, VU{_mm_unpacklo_epi64(lo, lo)}); |
| } else { |
| const __m128i hi = _mm_shufflehi_epi16(vu.raw, (0x55 * (kLane - 4)) & 0xFF); |
| return BitCast(d, VU{_mm_unpackhi_epi64(hi, hi)}); |
| } |
| } |
| |
| template <int kLane, typename T, size_t N, HWY_IF_UI32(T)> |
| HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)}; |
| } |
| |
| template <int kLane, typename T, size_t N, HWY_IF_UI64(T)> |
| HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<T, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)}; |
| } |
| |
| template <int kLane, size_t N> |
| HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)}; |
| } |
| |
| template <int kLane, size_t N> |
| HWY_API Vec128<double, N> Broadcast(const Vec128<double, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<double, N>{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)}; |
| } |
| |
| // ------------------------------ TableLookupLanes (Shuffle01) |
| |
| // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. |
| template <typename T, size_t N = 16 / sizeof(T)> |
| struct Indices128 { |
| __m128i raw; |
| }; |
| |
| template <class D, typename T = TFromD<D>, typename TI, size_t kN, |
| HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 1)> |
| HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) { |
| static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); |
| #if HWY_IS_DEBUG_BUILD |
| const Rebind<TI, decltype(d)> di; |
| HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && |
              AllTrue(di, Lt(vec, Set(di, static_cast<TI>(kN * 2)))));
| #endif |
| |
| // No change as byte indices are always used for 8-bit lane types |
| (void)d; |
| return Indices128<T, kN>{vec.raw}; |
| } |
| |
| template <class D, typename T = TFromD<D>, typename TI, size_t kN, |
| HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 2)> |
| HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) { |
| static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); |
| #if HWY_IS_DEBUG_BUILD |
| const Rebind<TI, decltype(d)> di; |
| HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && |
              AllTrue(di, Lt(vec, Set(di, static_cast<TI>(kN * 2)))));
| #endif |
| |
| #if HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2 |
| (void)d; |
| return Indices128<T, kN>{vec.raw}; |
| #else // SSSE3, SSE4, or AVX2 |
| const Repartition<uint8_t, decltype(d)> d8; |
| using V8 = VFromD<decltype(d8)>; |
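  // Per-byte offset within each 16-bit lane, added to the doubled lane
  // indices below to form byte shuffle indices.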
| alignas(16) static constexpr uint8_t kByteOffsets[16] = { |
| 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; |
| |
  // Broadcast each lane index to both bytes of T
| alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { |
| 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; |
| const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes)); |
| |
| // Shift to bytes |
| const Repartition<uint16_t, decltype(d)> d16; |
| const V8 byte_indices = BitCast(d8, ShiftLeft<1>(BitCast(d16, lane_indices))); |
| |
| return Indices128<T, kN>{Add(byte_indices, Load(d8, kByteOffsets)).raw}; |
| #endif // HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2 |
| } |
| |
| template <class D, typename T = TFromD<D>, typename TI, size_t kN, |
| HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 4)> |
| HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) { |
| static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); |
| #if HWY_IS_DEBUG_BUILD |
| const Rebind<TI, decltype(d)> di; |
| HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && |
              AllTrue(di, Lt(vec, Set(di, static_cast<TI>(kN * 2)))));
| #endif |
| |
| #if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 |
| (void)d; |
| return Indices128<T, kN>{vec.raw}; |
| #else |
| const Repartition<uint8_t, decltype(d)> d8; |
| using V8 = VFromD<decltype(d8)>; |
| alignas(16) static constexpr uint8_t kByteOffsets[16] = { |
| 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; |
| |
| // Broadcast each lane index to all 4 bytes of T |
| alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { |
| 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; |
| const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes)); |
| |
| // Shift to bytes |
| const Repartition<uint16_t, decltype(d)> d16; |
| const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices))); |
| |
| return Indices128<T, kN>{Add(byte_indices, Load(d8, kByteOffsets)).raw}; |
| #endif |
| } |
| |
| template <class D, typename T = TFromD<D>, typename TI, size_t kN, |
| HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 8)> |
| HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) { |
| static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); |
| #if HWY_IS_DEBUG_BUILD |
| const Rebind<TI, decltype(d)> di; |
| HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && |
| AllTrue(di, Lt(vec, Set(di, static_cast<TI>(kN * 2))))); |
| #else |
| (void)d; |
| #endif |
| |
| // No change - even without AVX3, we can shuffle+blend. |
| return Indices128<T, kN>{vec.raw}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename TI> |
| HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices( |
| D d, const TI* idx) { |
| static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane"); |
| const Rebind<TI, decltype(d)> di; |
| return IndicesFromVec(d, LoadU(di, idx)); |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) { |
| return TableLookupBytes(v, Vec128<T, N>{idx.raw}); |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI16(T)> |
| HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return {_mm_permutexvar_epi16(idx.raw, v.raw)}; |
| #elif HWY_TARGET == HWY_SSE2 |
| #if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) |
| typedef uint16_t GccU16RawVectType __attribute__((__vector_size__(16))); |
| return Vec128<T, N>{reinterpret_cast<typename detail::Raw128<T>::type>( |
| __builtin_shuffle(reinterpret_cast<GccU16RawVectType>(v.raw), |
| reinterpret_cast<GccU16RawVectType>(idx.raw)))}; |
| #else |
| const Full128<T> d_full; |
| alignas(16) T src_lanes[8]; |
| alignas(16) uint16_t indices[8]; |
| alignas(16) T result_lanes[8]; |
| |
| Store(Vec128<T>{v.raw}, d_full, src_lanes); |
| _mm_store_si128(reinterpret_cast<__m128i*>(indices), idx.raw); |
| |
| for (int i = 0; i < 8; i++) { |
| result_lanes[i] = src_lanes[indices[i] & 7u]; |
| } |
| |
| return Vec128<T, N>{Load(d_full, result_lanes).raw}; |
| #endif // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) |
| #else |
| return TableLookupBytes(v, Vec128<T, N>{idx.raw}); |
| #endif |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N, HWY_IF_V_SIZE_GT(float16_t, N, 2)> |
| HWY_API Vec128<float16_t, N> TableLookupLanes(Vec128<float16_t, N> v, |
| Indices128<float16_t, N> idx) { |
| return {_mm_permutexvar_ph(idx.raw, v.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> |
| HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) { |
| #if HWY_TARGET <= HWY_AVX2 |
| const DFromV<decltype(v)> d; |
| const RebindToFloat<decltype(d)> df; |
| const Vec128<float, N> perm{_mm_permutevar_ps(BitCast(df, v).raw, idx.raw)}; |
| return BitCast(d, perm); |
| #elif HWY_TARGET == HWY_SSE2 |
| #if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) |
| typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16))); |
| return Vec128<T, N>{reinterpret_cast<typename detail::Raw128<T>::type>( |
| __builtin_shuffle(reinterpret_cast<GccU32RawVectType>(v.raw), |
| reinterpret_cast<GccU32RawVectType>(idx.raw)))}; |
| #else |
| const Full128<T> d_full; |
| alignas(16) T src_lanes[4]; |
| alignas(16) uint32_t indices[4]; |
| alignas(16) T result_lanes[4]; |
| |
| Store(Vec128<T>{v.raw}, d_full, src_lanes); |
| _mm_store_si128(reinterpret_cast<__m128i*>(indices), idx.raw); |
| |
| for (int i = 0; i < 4; i++) { |
| result_lanes[i] = src_lanes[indices[i] & 3u]; |
| } |
| |
| return Vec128<T, N>{Load(d_full, result_lanes).raw}; |
| #endif // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) |
| #else // SSSE3 or SSE4 |
| return TableLookupBytes(v, Vec128<T, N>{idx.raw}); |
| #endif |
| } |
| |
| #if HWY_TARGET <= HWY_SSSE3 |
| template <size_t N, HWY_IF_V_SIZE_GT(float, N, 4)> |
| HWY_API Vec128<float, N> TableLookupLanes(Vec128<float, N> v, |
| Indices128<float, N> idx) { |
| #if HWY_TARGET <= HWY_AVX2 |
| return Vec128<float, N>{_mm_permutevar_ps(v.raw, idx.raw)}; |
| #else // SSSE3 or SSE4 |
| const DFromV<decltype(v)> df; |
| const RebindToSigned<decltype(df)> di; |
| return BitCast(df, |
| TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw})); |
| #endif // HWY_TARGET <= HWY_AVX2 |
| } |
| #endif // HWY_TARGET <= HWY_SSSE3 |
| |
| // Single lane: no change |
| template <typename T> |
| HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v, |
| Indices128<T, 1> /* idx */) { |
| return v; |
| } |
| |
| template <typename T, HWY_IF_UI64(T)> |
| HWY_API Vec128<T> TableLookupLanes(Vec128<T> v, Indices128<T> idx) { |
| const DFromV<decltype(v)> d; |
| Vec128<int64_t> vidx{idx.raw}; |
| #if HWY_TARGET <= HWY_AVX2 |
| // There is no _mm_permute[x]var_epi64. |
| vidx += vidx; // bit1 is the decider (unusual) |
| const RebindToFloat<decltype(d)> df; |
| return BitCast( |
| d, Vec128<double>{_mm_permutevar_pd(BitCast(df, v).raw, vidx.raw)}); |
| #else |
  // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
  // comparison (expensive on SSSE3), XOR with the iota so lanes equal to their
  // own index become zero, then subtract 1 to obtain an all-ones (keep v) or
  // all-zeros (use the swapped lanes) mask.
| const RebindToSigned<decltype(d)> di; |
| const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1); |
| const Mask128<T> mask_same = RebindMask(d, MaskFromVec(same)); |
| return IfThenElse(mask_same, v, Shuffle01(v)); |
| #endif |
| } |
| |
| HWY_API Vec128<double> TableLookupLanes(Vec128<double> v, |
| Indices128<double> idx) { |
| Vec128<int64_t> vidx{idx.raw}; |
| #if HWY_TARGET <= HWY_AVX2 |
| vidx += vidx; // bit1 is the decider (unusual) |
| return Vec128<double>{_mm_permutevar_pd(v.raw, vidx.raw)}; |
| #else |
  // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
  // comparison (expensive on SSSE3), XOR with the iota so lanes equal to their
  // own index become zero, then subtract 1 to obtain an all-ones (keep v) or
  // all-zeros (use the swapped lanes) mask.
| const DFromV<decltype(v)> d; |
| const RebindToSigned<decltype(d)> di; |
| const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1); |
| const Mask128<double> mask_same = RebindMask(d, MaskFromVec(same)); |
| return IfThenElse(mask_same, v, Shuffle01(v)); |
| #endif |
| } |
| |
| // ------------------------------ ReverseBlocks |
| |
| // Single block: no change |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) { |
| return v; |
| } |
| |
| // ------------------------------ Reverse (Shuffle0123, Shuffle2301) |
| |
| // Single lane: no change |
| template <class D, HWY_IF_LANES_D(D, 1)> |
| HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) { |
| return v; |
| } |
| |
| // 32-bit x2: shuffle |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 4)> |
| HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) { |
| return VFromD<D>{Shuffle2301(Vec128<TFromD<D>>{v.raw}).raw}; |
| } |
| |
| // 64-bit x2: shuffle |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)> |
| HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) { |
| return Shuffle01(v); |
| } |
| |
| // 32-bit x4: shuffle |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)> |
| HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) { |
| return Shuffle0123(v); |
| } |
| |
| // 16-bit |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2), |
| HWY_IF_LANES_GT_D(D, 1)> |
| HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) { |
| const RebindToUnsigned<decltype(d)> du; |
| using VU = VFromD<decltype(du)>; |
| const VU vu = BitCast(du, v); // for float16_t |
| constexpr size_t kN = MaxLanes(d); |
| if (kN == 1) return v; |
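  // For kN <= 4, all lanes are within the lower 64 bits, so a single
  // shufflelo reverses them; the upper half of the result is ignored.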
| if (kN == 2) { |
| return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 0, 1))}); |
| } |
| if (kN == 4) { |
| return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3))}); |
| } |
| |
| #if HWY_TARGET == HWY_SSE2 |
| const VU rev4{ |
| _mm_shufflehi_epi16(_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3)), |
| _MM_SHUFFLE(0, 1, 2, 3))}; |
| return BitCast(d, VU{_mm_shuffle_epi32(rev4.raw, _MM_SHUFFLE(1, 0, 3, 2))}); |
| #else |
| const RebindToSigned<decltype(d)> di; |
| const VFromD<decltype(di)> shuffle = Dup128VecFromValues( |
| di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); |
| return BitCast(d, TableLookupBytes(v, shuffle)); |
| #endif |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1), |
| HWY_IF_LANES_GT_D(D, 1)> |
| HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) { |
| constexpr int kN = static_cast<int>(MaxLanes(d)); |
| if (kN == 1) return v; |
| #if HWY_TARGET <= HWY_SSSE3 |
| // NOTE: Lanes with negative shuffle control mask values are set to zero. |
| alignas(16) static constexpr int8_t kReverse[16] = { |
| kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8, |
| kN - 9, kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16}; |
| const RebindToSigned<decltype(d)> di; |
| const VFromD<decltype(di)> idx = Load(di, kReverse); |
| return VFromD<D>{_mm_shuffle_epi8(BitCast(di, v).raw, idx.raw)}; |
| #else |
| const RepartitionToWide<decltype(d)> d16; |
| return BitCast(d, Reverse(d16, RotateRight<8>(BitCast(d16, v)))); |
| #endif |
| } |
| |
| // ------------------------------ Reverse2 |
| |
| // Single lane: no change |
| template <class D, HWY_IF_LANES_D(D, 1)> |
| HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) { |
| return v; |
| } |
| |
| // Generic for all vector lengths (128-bit sufficient if SSE2). |
| template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)> |
| HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) { |
| #if HWY_TARGET <= HWY_AVX3 |
| const Repartition<uint32_t, decltype(d)> du32; |
| return BitCast(d, RotateRight<16>(BitCast(du32, v))); |
| #elif HWY_TARGET == HWY_SSE2 |
| const RebindToUnsigned<decltype(d)> du; |
| using VU = VFromD<decltype(du)>; |
| const VU vu = BitCast(du, v); // for float16_t |
| constexpr size_t kN = MaxLanes(d); |
| __m128i shuf_result = _mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(2, 3, 0, 1)); |
| if (kN > 4) { |
| shuf_result = _mm_shufflehi_epi16(shuf_result, _MM_SHUFFLE(2, 3, 0, 1)); |
| } |
| return BitCast(d, VU{shuf_result}); |
| #else |
| const RebindToSigned<decltype(d)> di; |
| const VFromD<decltype(di)> shuffle = Dup128VecFromValues( |
| di, 0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C); |
| return BitCast(d, TableLookupBytes(v, shuffle)); |
| #endif |
| } |
| |
| // Generic for all vector lengths. |
| template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_GT_D(D, 1)> |
| HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) { |
| return Shuffle2301(v); |
| } |
| |
| // Generic for all vector lengths. |
| template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)> |
| HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) { |
| return Shuffle01(v); |
| } |
| |
| // ------------------------------ Reverse4 |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) { |
| const RebindToUnsigned<decltype(d)> du; |
| using VU = VFromD<decltype(du)>; |
| const VU vu = BitCast(du, v); // for float16_t |
| // 4x 16-bit: a single shufflelo suffices. |
| constexpr size_t kN = MaxLanes(d); |
| if (kN <= 4) { |
| return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3))}); |
| } |
| |
| #if HWY_TARGET == HWY_SSE2 |
| return BitCast(d, VU{_mm_shufflehi_epi16( |
| _mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3)), |
| _MM_SHUFFLE(0, 1, 2, 3))}); |
| #else |
| const RebindToSigned<decltype(d)> di; |
| const VFromD<decltype(di)> shuffle = Dup128VecFromValues( |
| di, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908); |
| return BitCast(d, TableLookupBytes(v, shuffle)); |
| #endif |
| } |
| |
| // Generic for all vector lengths. |
| template <class D, HWY_IF_T_SIZE_D(D, 4)> |
| HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) { |
| return Shuffle0123(v); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)> |
| HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> /* v */) { |
| HWY_ASSERT(0); // don't have 4 u64 lanes |
| } |
| |
| // ------------------------------ Reverse8 |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) { |
| #if HWY_TARGET == HWY_SSE2 |
| const RepartitionToWide<decltype(d)> dw; |
| return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v)))); |
| #else |
| const RebindToSigned<decltype(d)> di; |
| const VFromD<decltype(di)> shuffle = Dup128VecFromValues( |
| di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); |
| return BitCast(d, TableLookupBytes(v, shuffle)); |
| #endif |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), |
| HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> |
| HWY_API VFromD<D> Reverse8(D /* tag */, VFromD<D> /* v */) { |
| HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit |
| } |
| |
| // ------------------------------ ReverseBits in x86_512 |
| |
| // ------------------------------ InterleaveUpper (UpperHalf) |
| |
| // Full |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{_mm_unpackhi_epi8(a.raw, b.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) { |
| const DFromV<decltype(a)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| using VU = VFromD<decltype(du)>; // for float16_t |
| return BitCast( |
| d, VU{_mm_unpackhi_epi16(BitCast(du, a).raw, BitCast(du, b).raw)}); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)> |
| HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{_mm_unpackhi_epi32(a.raw, b.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)> |
| HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{_mm_unpackhi_epi64(a.raw, b.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{_mm_unpackhi_ps(a.raw, b.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{_mm_unpackhi_pd(a.raw, b.raw)}; |
| } |
| |
| // Partial |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) { |
| const Half<decltype(d)> d2; |
| return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw}, |
| VFromD<D>{UpperHalf(d2, b).raw}); |
| } |
| |
| // -------------------------- I8/U8 Broadcast (InterleaveLower, InterleaveUpper) |
| |
| template <int kLane, class T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| const DFromV<decltype(v)> d; |
| |
| #if HWY_TARGET == HWY_SSE2 |
| const Full128<T> d_full; |
| const Vec128<T> v_full{v.raw}; |
| const auto v_interleaved = (kLane < 8) |
| ? InterleaveLower(d_full, v_full, v_full) |
| : InterleaveUpper(d_full, v_full, v_full); |
| return ResizeBitCast( |
| d, Broadcast<kLane & 7>(BitCast(Full128<uint16_t>(), v_interleaved))); |
| #else |
| return TableLookupBytes(v, Set(d, static_cast<T>(kLane))); |
| #endif |
| } |
| |
| // ------------------------------ ZipLower/ZipUpper (InterleaveLower) |
| |
| // Same as Interleave*, except that the return lanes are double-width integers; |
| // this is necessary because the single-lane scalar cannot return two values. |
| // Generic for all vector lengths. |
| template <class V, class DW = RepartitionToWide<DFromV<V>>> |
| HWY_API VFromD<DW> ZipLower(V a, V b) { |
| return BitCast(DW(), InterleaveLower(a, b)); |
| } |
| template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> |
| HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) { |
| return BitCast(dw, InterleaveLower(D(), a, b)); |
| } |
| |
| template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> |
| HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) { |
| return BitCast(dw, InterleaveUpper(D(), a, b)); |
| } |
| |
| // ================================================== CONVERT (1) |
| |
| // ------------------------------ PromoteTo unsigned (TableLookupBytesOr0) |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| const __m128i zero = _mm_setzero_si128(); |
| return VFromD<D>{_mm_unpacklo_epi8(v.raw, zero)}; |
| #else |
| return VFromD<D>{_mm_cvtepu8_epi16(v.raw)}; |
| #endif |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| return VFromD<D>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())}; |
| #else |
| return VFromD<D>{_mm_cvtepu16_epi32(v.raw)}; |
| #endif |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| return VFromD<D>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())}; |
| #else |
| return VFromD<D>{_mm_cvtepu32_epi64(v.raw)}; |
| #endif |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero); |
| return VFromD<D>{_mm_unpacklo_epi16(u16, zero)}; |
| #else |
| return VFromD<D>{_mm_cvtepu8_epi32(v.raw)}; |
| #endif |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) { |
| #if HWY_TARGET > HWY_SSSE3 |
| const Rebind<uint32_t, decltype(d)> du32; |
| return PromoteTo(d, PromoteTo(du32, v)); |
| #elif HWY_TARGET == HWY_SSSE3 |
| alignas(16) static constexpr int8_t kShuffle[16] = { |
| 0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1}; |
| const Repartition<int8_t, decltype(d)> di8; |
| return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle))); |
| #else |
| (void)d; |
| return VFromD<D>{_mm_cvtepu8_epi64(v.raw)}; |
| #endif |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) { |
| #if HWY_TARGET > HWY_SSSE3 |
| const Rebind<uint32_t, decltype(d)> du32; |
| return PromoteTo(d, PromoteTo(du32, v)); |
| #elif HWY_TARGET == HWY_SSSE3 |
| alignas(16) static constexpr int8_t kShuffle[16] = { |
| 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1}; |
| const Repartition<int8_t, decltype(d)> di8; |
| return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle))); |
| #else |
| (void)d; |
| return VFromD<D>{_mm_cvtepu16_epi64(v.raw)}; |
| #endif |
| } |
| |
| // Unsigned to signed: same plus cast. |
| template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V), |
| HWY_IF_LANES_GT(sizeof(TFromD<D>), sizeof(TFromV<V>)), |
| HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V))> |
| HWY_API VFromD<D> PromoteTo(D di, V v) { |
| const RebindToUnsigned<decltype(di)> du; |
| return BitCast(di, PromoteTo(du, v)); |
| } |
| |
| // ------------------------------ PromoteTo signed (ShiftRight, ZipLower) |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) { |
| #if HWY_TARGET >= HWY_SSSE3 |
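  // Without _mm_cvtepi8_epi16: duplicate each byte into a 16-bit lane, then
  // arithmetic-shift right by 8 to sign-extend.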
| return ShiftRight<8>(VFromD<D>{_mm_unpacklo_epi8(v.raw, v.raw)}); |
| #else |
| return VFromD<D>{_mm_cvtepi8_epi16(v.raw)}; |
| #endif |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| return ShiftRight<16>(VFromD<D>{_mm_unpacklo_epi16(v.raw, v.raw)}); |
| #else |
| return VFromD<D>{_mm_cvtepi16_epi32(v.raw)}; |
| #endif |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| return ShiftRight<32>(VFromD<D>{_mm_unpacklo_epi32(v.raw, v.raw)}); |
| #else |
| return VFromD<D>{_mm_cvtepi32_epi64(v.raw)}; |
| #endif |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw); |
| const __m128i x4 = _mm_unpacklo_epi16(x2, x2); |
| return ShiftRight<24>(VFromD<D>{x4}); |
| #else |
| return VFromD<D>{_mm_cvtepi8_epi32(v.raw)}; |
| #endif |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int8_t, D>> v) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| const Repartition<int32_t, decltype(d)> di32; |
| const Half<decltype(di32)> dh_i32; |
| const VFromD<decltype(di32)> x4{PromoteTo(dh_i32, v).raw}; |
| const VFromD<decltype(di32)> s4{ |
| _mm_shufflelo_epi16(x4.raw, _MM_SHUFFLE(3, 3, 1, 1))}; |
| return ZipLower(d, x4, s4); |
| #else |
| (void)d; |
| return VFromD<D>{_mm_cvtepi8_epi64(v.raw)}; |
| #endif |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int16_t, D>> v) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| const Repartition<int32_t, decltype(d)> di32; |
| const Half<decltype(di32)> dh_i32; |
| const VFromD<decltype(di32)> x2{PromoteTo(dh_i32, v).raw}; |
| const VFromD<decltype(di32)> s2{ |
| _mm_shufflelo_epi16(x2.raw, _MM_SHUFFLE(3, 3, 1, 1))}; |
| return ZipLower(d, x2, s2); |
| #else |
| (void)d; |
| return VFromD<D>{_mm_cvtepi16_epi64(v.raw)}; |
| #endif |
| } |
| |
| // -------------------- PromoteTo float (ShiftLeft, IfNegativeThenElse) |
| #if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C) |
| |
| // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions. |
| #ifdef HWY_NATIVE_F16C |
| #undef HWY_NATIVE_F16C |
| #else |
| #define HWY_NATIVE_F16C |
| #endif |
| |
| // Workaround for origin tracking bug in Clang msan prior to 11.0 |
| // (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid") |
| #if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100) |
| #define HWY_INLINE_F16 HWY_NOINLINE |
| #else |
| #define HWY_INLINE_F16 HWY_INLINE |
| #endif |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_INLINE_F16 VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) { |
| #if HWY_HAVE_FLOAT16 |
| const RebindToUnsigned<DFromV<decltype(v)>> du16; |
| return VFromD<D>{_mm_cvtph_ps(BitCast(du16, v).raw)}; |
| #else |
| return VFromD<D>{_mm_cvtph_ps(v.raw)}; |
| #endif |
| } |
| |
#endif  // HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C)
| |
| #if HWY_HAVE_FLOAT16 |
| |
| #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64 |
| #undef HWY_NATIVE_PROMOTE_F16_TO_F64 |
| #else |
| #define HWY_NATIVE_PROMOTE_F16_TO_F64 |
| #endif |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) { |
| return VFromD<D>{_mm_cvtph_pd(v.raw)}; |
| } |
| |
| #endif // HWY_HAVE_FLOAT16 |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) { |
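  // bf16 is the upper 16 bits of a binary32: zero-extend to 32 bits and shift
  // the payload into the high half.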
| const Rebind<uint16_t, decltype(df32)> du16; |
| const RebindToSigned<decltype(df32)> di32; |
| return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) { |
| return VFromD<D>{_mm_cvtps_pd(v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { |
| return VFromD<D>{_mm_cvtepi32_pd(v.raw)}; |
| } |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /*df64*/, VFromD<Rebind<uint32_t, D>> v) { |
| return VFromD<D>{_mm_cvtepu32_pd(v.raw)}; |
| } |
| #else |
| // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 |
| template <class D, HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) { |
| const Rebind<int32_t, decltype(df64)> di32; |
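  // Convert as signed i32; inputs >= 2^31 come out negative, so add 2^32 to
  // those results to correct them.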
| const auto i32_to_f64_result = PromoteTo(df64, BitCast(di32, v)); |
| return i32_to_f64_result + IfNegativeThenElse(i32_to_f64_result, |
| Set(df64, 4294967296.0), |
| Zero(df64)); |
| } |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| // ------------------------------ Per4LaneBlockShuffle |
| namespace detail { |
| |
| #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 |
| #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 |
| #else |
| #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32 |
| #endif |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, |
| const uint32_t x2, |
| const uint32_t x1, |
| const uint32_t x0) { |
| return ResizeBitCast( |
| d, Vec128<uint32_t>{_mm_set_epi32( |
| static_cast<int32_t>(x3), static_cast<int32_t>(x2), |
| static_cast<int32_t>(x1), static_cast<int32_t>(x0))}); |
| } |
| |
| template <size_t kIdx3210, class V> |
| HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/, |
| hwy::SizeTag<2> /*lane_size_tag*/, |
| hwy::SizeTag<8> /*vect_size_tag*/, V v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; // for float16_t |
| return BitCast(d, |
| VFromD<decltype(du)>{_mm_shufflelo_epi16( |
| BitCast(du, v).raw, static_cast<int>(kIdx3210 & 0xFF))}); |
| } |
| |
| #if HWY_TARGET == HWY_SSE2 |
| template <size_t kIdx3210, class V> |
| HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/, |
| hwy::SizeTag<2> /*lane_size_tag*/, |
| hwy::SizeTag<16> /*vect_size_tag*/, V v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; // for float16_t |
| constexpr int kShuffle = static_cast<int>(kIdx3210 & 0xFF); |
| return BitCast( |
| d, VFromD<decltype(du)>{_mm_shufflehi_epi16( |
| _mm_shufflelo_epi16(BitCast(du, v).raw, kShuffle), kShuffle)}); |
| } |
| |
| template <size_t kIdx3210, size_t kVectSize, class V, |
| hwy::EnableIf<(kVectSize == 4 || kVectSize == 8)>* = nullptr> |
| HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag, |
| hwy::SizeTag<1> /*lane_size_tag*/, |
| hwy::SizeTag<kVectSize> /*vect_size_tag*/, |
| V v) { |
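  // Widen the bytes to u16 lanes, reuse the 16-bit shuffle, then demote back.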
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const Rebind<uint16_t, decltype(d)> du16; |
| const RebindToSigned<decltype(du16)> di16; |
| |
| const auto vu16 = PromoteTo(du16, BitCast(du, v)); |
| const auto shuf16_result = Per4LaneBlockShuffle( |
| idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<kVectSize * 2>(), vu16); |
| return BitCast(d, DemoteTo(du, BitCast(di16, shuf16_result))); |
| } |
| |
| template <size_t kIdx3210, size_t kVectSize, class V> |
| HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag, |
| hwy::SizeTag<1> /*lane_size_tag*/, |
| hwy::SizeTag<16> /*vect_size_tag*/, V v) { |
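  // Zero-extend to u16 in two halves, shuffle each half, then demote and
  // concatenate them back in the original order.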
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const Repartition<uint16_t, decltype(d)> du16; |
| const RebindToSigned<decltype(du16)> di16; |
| |
| const auto zero = Zero(d); |
| const auto v_lo16 = BitCast(du16, InterleaveLower(d, v, zero)); |
| const auto v_hi16 = BitCast(du16, InterleaveUpper(d, v, zero)); |
| |
| const auto lo_shuf_result = Per4LaneBlockShuffle( |
| idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<16>(), v_lo16); |
| const auto hi_shuf_result = Per4LaneBlockShuffle( |
| idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<16>(), v_hi16); |
| |
| return BitCast(d, OrderedDemote2To(du, BitCast(di16, lo_shuf_result), |
| BitCast(di16, hi_shuf_result))); |
| } |
| #endif |
| |
| template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)> |
| HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/, |
| hwy::SizeTag<4> /*lane_size_tag*/, |
| hwy::SizeTag<16> /*vect_size_tag*/, V v) { |
| return V{_mm_shuffle_epi32(v.raw, static_cast<int>(kIdx3210 & 0xFF))}; |
| } |
| |
| template <size_t kIdx3210, class V, HWY_IF_FLOAT(TFromV<V>)> |
| HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/, |
| hwy::SizeTag<4> /*lane_size_tag*/, |
| hwy::SizeTag<16> /*vect_size_tag*/, V v) { |
| return V{_mm_shuffle_ps(v.raw, v.raw, static_cast<int>(kIdx3210 & 0xFF))}; |
| } |
| |
| } // namespace detail |
| |
| // ------------------------------ SlideUpLanes |
| |
| namespace detail { |
| |
| template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> |
| HWY_INLINE V SlideUpLanes(V v, size_t amt) { |
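  // The whole vector (at most 8 bytes) fits in a single u64 lane, so one
  // 64-bit left shift slides all lanes at once.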
| const DFromV<decltype(v)> d; |
| const Full64<uint64_t> du64; |
| const auto vu64 = ResizeBitCast(du64, v); |
| return ResizeBitCast( |
| d, ShiftLeftSame(vu64, static_cast<int>(amt * sizeof(TFromV<V>) * 8))); |
| } |
| |
| #if HWY_TARGET <= HWY_SSSE3 |
| template <class V, HWY_IF_V_SIZE_V(V, 16)> |
| HWY_INLINE V SlideUpLanes(V v, size_t amt) { |
| const DFromV<decltype(v)> d; |
| const Repartition<uint8_t, decltype(d)> du8; |
| const auto idx = |
| Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromV<V>))); |
| return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx)); |
| } |
| #else |
| template <class V, HWY_IF_V_SIZE_V(V, 16)> |
| HWY_INLINE V SlideUpLanes(V v, size_t amt) { |
| const DFromV<decltype(v)> d; |
| const Repartition<int32_t, decltype(d)> di32; |
| const Repartition<uint64_t, decltype(d)> du64; |
| constexpr size_t kNumOfLanesPerU64 = 8 / sizeof(TFromV<V>); |
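  // If sliding by at least one u64 lane, first shift the whole vector left by
  // 8 bytes; the remaining sub-64-bit shift is done per u64 lane, with bits
  // crossing the 64-bit boundary carried in from the lower lane (v_lo) by the
  // final Or. A zero shl_amt yields a 64-bit right-shift count of 64, which
  // x86 vector shifts treat as shifting out all bits.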
| |
| const auto vu64 = BitCast(du64, v); |
| const auto v_hi = IfVecThenElse( |
| BitCast(du64, Set(di32, -static_cast<int32_t>(amt >= kNumOfLanesPerU64))), |
| BitCast(du64, ShiftLeftBytes<8>(du64, vu64)), vu64); |
| const auto v_lo = ShiftLeftBytes<8>(du64, v_hi); |
| |
| const int shl_amt = static_cast<int>((amt * sizeof(TFromV<V>) * 8) & 63); |
| return BitCast( |
| d, Or(ShiftLeftSame(v_hi, shl_amt), ShiftRightSame(v_lo, 64 - shl_amt))); |
| } |
| #endif |
| |
| } // namespace detail |
| |
| template <class D, HWY_IF_LANES_D(D, 1)> |
| HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) { |
| return v; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)> |
| HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(amt)) { |
| switch (amt) { |
| case 0: |
| return v; |
| case 1: |
| return ShiftLeftLanes<1>(d, v); |
| } |
| } |
| #else |
| (void)d; |
| #endif |
| |
| return detail::SlideUpLanes(v, amt); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)> |
| HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(amt)) { |
| switch (amt) { |
| case 0: |
| return v; |
| case 1: |
| return ShiftLeftLanes<1>(d, v); |
| case 2: |
| return ShiftLeftLanes<2>(d, v); |
| case 3: |
| return ShiftLeftLanes<3>(d, v); |
| } |
| } |
| #else |
| (void)d; |
| #endif |
| |
| return detail::SlideUpLanes(v, amt); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)> |
| HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(amt)) { |
| switch (amt) { |
| case 0: |
| return v; |
| case 1: |
| return ShiftLeftLanes<1>(d, v); |
| case 2: |
| return ShiftLeftLanes<2>(d, v); |
| case 3: |
| return ShiftLeftLanes<3>(d, v); |
| case 4: |
| return ShiftLeftLanes<4>(d, v); |
| case 5: |
| return ShiftLeftLanes<5>(d, v); |
| case 6: |
| return ShiftLeftLanes<6>(d, v); |
| case 7: |
| return ShiftLeftLanes<7>(d, v); |
| } |
| } |
| #else |
| (void)d; |
| #endif |
| |
| return detail::SlideUpLanes(v, amt); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)> |
| HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(amt)) { |
| switch (amt) { |
| case 0: |
| return v; |
| case 1: |
| return ShiftLeftLanes<1>(d, v); |
| case 2: |
| return ShiftLeftLanes<2>(d, v); |
| case 3: |
| return ShiftLeftLanes<3>(d, v); |
| case 4: |
| return ShiftLeftLanes<4>(d, v); |
| case 5: |
| return ShiftLeftLanes<5>(d, v); |
| case 6: |
| return ShiftLeftLanes<6>(d, v); |
| case 7: |
| return ShiftLeftLanes<7>(d, v); |
| case 8: |
| return ShiftLeftLanes<8>(d, v); |
| case 9: |
| return ShiftLeftLanes<9>(d, v); |
| case 10: |
| return ShiftLeftLanes<10>(d, v); |
| case 11: |
| return ShiftLeftLanes<11>(d, v); |
| case 12: |
| return ShiftLeftLanes<12>(d, v); |
| case 13: |
| return ShiftLeftLanes<13>(d, v); |
| case 14: |
| return ShiftLeftLanes<14>(d, v); |
| case 15: |
| return ShiftLeftLanes<15>(d, v); |
| } |
| } |
| #else |
| (void)d; |
| #endif |
| |
| return detail::SlideUpLanes(v, amt); |
| } |
| |
| // ------------------------------ SlideDownLanes |
| |
| namespace detail { |
| |
| template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> |
| HWY_INLINE V SlideDownLanes(V v, size_t amt) { |
| const DFromV<decltype(v)> d; |
| const Repartition<UnsignedFromSize<d.MaxBytes()>, decltype(d)> dv; |
| return BitCast(d, |
| ShiftRightSame(BitCast(dv, v), |
| static_cast<int>(amt * sizeof(TFromV<V>) * 8))); |
| } |
| |
| #if HWY_TARGET <= HWY_SSSE3 |
| template <class V, HWY_IF_V_SIZE_V(V, 16)> |
| HWY_INLINE V SlideDownLanes(V v, size_t amt) { |
| const DFromV<decltype(v)> d; |
| const Repartition<int8_t, decltype(d)> di8; |
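  // Byte indices start at amt * sizeof(T); indices beyond the last input byte
  // are forced to 0xFF by OR-ing with the comparison mask, so
  // TableLookupBytesOr0 zeroes the lanes slid in from the top.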
| auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>))); |
| idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15}))); |
| return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx)); |
| } |
| #else |
| template <class V, HWY_IF_V_SIZE_V(V, 16)> |
| HWY_INLINE V SlideDownLanes(V v, size_t amt) { |
| const DFromV<decltype(v)> d; |
| const Repartition<int32_t, decltype(d)> di32; |
| const Repartition<uint64_t, decltype(d)> du64; |
| constexpr size_t kNumOfLanesPerU64 = 8 / sizeof(TFromV<V>); |
| |
| const auto vu64 = BitCast(du64, v); |
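  // Mirror of SlideUpLanes: if amt covers at least one u64 lane, first move
  // the upper u64 into the lower one (v_lo). Then shift v_lo right by the
  // remaining bit count and OR in the bits crossing the 64-bit boundary from
  // v_hi (v_lo shifted right by 8 bytes) shifted left by 64 - shr_amt.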
| const auto v_lo = IfVecThenElse( |
| BitCast(du64, Set(di32, -static_cast<int32_t>(amt >= kNumOfLanesPerU64))), |
| BitCast(du64, ShiftRightBytes<8>(du64, vu64)), vu64); |
| const auto v_hi = ShiftRightBytes<8>(du64, v_lo); |
| |
| const int shr_amt = static_cast<int>((amt * sizeof(TFromV<V>) * 8) & 63); |
| return BitCast( |
| d, Or(ShiftRightSame(v_lo, shr_amt), ShiftLeftSame(v_hi, 64 - shr_amt))); |
| } |
| #endif |
| |
| } // namespace detail |
| |
| template <class D, HWY_IF_LANES_D(D, 1)> |
| HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) { |
| return v; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)> |
| HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(amt)) { |
| switch (amt) { |
| case 0: |
| return v; |
| case 1: |
| return ShiftRightLanes<1>(d, v); |
| } |
| } |
| #else |
| (void)d; |
| #endif |
| |
| return detail::SlideDownLanes(v, amt); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)> |
| HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(amt)) { |
| switch (amt) { |
| case 0: |
| return v; |
| case 1: |
| return ShiftRightLanes<1>(d, v); |
| case 2: |
| return ShiftRightLanes<2>(d, v); |
| case 3: |
| return ShiftRightLanes<3>(d, v); |
| } |
| } |
| #else |
| (void)d; |
| #endif |
| |
| return detail::SlideDownLanes(v, amt); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)> |
| HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(amt)) { |
| switch (amt) { |
| case 0: |
| return v; |
| case 1: |
| return ShiftRightLanes<1>(d, v); |
| case 2: |
| return ShiftRightLanes<2>(d, v); |
| case 3: |
| return ShiftRightLanes<3>(d, v); |
| case 4: |
| return ShiftRightLanes<4>(d, v); |
| case 5: |
| return ShiftRightLanes<5>(d, v); |
| case 6: |
| return ShiftRightLanes<6>(d, v); |
| case 7: |
| return ShiftRightLanes<7>(d, v); |
| } |
| } |
| #else |
| (void)d; |
| #endif |
| |
| return detail::SlideDownLanes(v, amt); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)> |
| HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(amt)) { |
| switch (amt) { |
| case 0: |
| return v; |
| case 1: |
| return ShiftRightLanes<1>(d, v); |
| case 2: |
| return ShiftRightLanes<2>(d, v); |
| case 3: |
| return ShiftRightLanes<3>(d, v); |
| case 4: |
| return ShiftRightLanes<4>(d, v); |
| case 5: |
| return ShiftRightLanes<5>(d, v); |
| case 6: |
| return ShiftRightLanes<6>(d, v); |
| case 7: |
| return ShiftRightLanes<7>(d, v); |
| case 8: |
| return ShiftRightLanes<8>(d, v); |
| case 9: |
| return ShiftRightLanes<9>(d, v); |
| case 10: |
| return ShiftRightLanes<10>(d, v); |
| case 11: |
| return ShiftRightLanes<11>(d, v); |
| case 12: |
| return ShiftRightLanes<12>(d, v); |
| case 13: |
| return ShiftRightLanes<13>(d, v); |
| case 14: |
| return ShiftRightLanes<14>(d, v); |
| case 15: |
| return ShiftRightLanes<15>(d, v); |
| } |
| } |
| #else |
| (void)d; |
| #endif |
| |
| return detail::SlideDownLanes(v, amt); |
| } |
| |
| // ================================================== MEMORY (4) |
| |
| // ------------------------------ StoreN (ExtractLane) |
| |
| #if HWY_TARGET <= HWY_AVX2 |
| |
| #ifdef HWY_NATIVE_STORE_N |
| #undef HWY_NATIVE_STORE_N |
| #else |
| #define HWY_NATIVE_STORE_N |
| #endif |
| |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D( |
| D, (HWY_TARGET <= HWY_AVX3 ? ((1 << 1) | (1 << 2)) : 0) | |
| (1 << 4) | (1 << 8))> |
| HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p, |
| size_t max_lanes_to_store) { |
| const size_t num_lanes_to_store = |
| HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)); |
| |
| #if HWY_COMPILER_MSVC |
| // Work around MSVC compiler bug by using a HWY_FENCE before the BlendedStore |
| HWY_FENCE; |
| #endif |
| |
| BlendedStore(v, FirstN(d, num_lanes_to_store), d, p); |
| |
| #if HWY_COMPILER_MSVC |
| // Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore |
| HWY_FENCE; |
| #endif |
| |
| detail::MaybeUnpoison(p, num_lanes_to_store); |
| } |
| |
| #if HWY_TARGET > HWY_AVX3 |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), |
| HWY_IF_LANES_D(D, 1)> |
| HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p, |
| size_t max_lanes_to_store) { |
| if (max_lanes_to_store > 0) { |
| StoreU(v, d, p); |
| } |
| } |
| |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), |
| HWY_IF_LANES_D(D, 2)> |
| HWY_API void StoreN(VFromD<D> v, D /*d*/, TFromD<D>* HWY_RESTRICT p, |
| size_t max_lanes_to_store) { |
| if (max_lanes_to_store >= 1) { |
| p[static_cast<size_t>(max_lanes_to_store > 1)] = detail::ExtractLane<1>(v); |
| p[0] = GetLane(v); |
| } |
| } |
| |
| namespace detail { |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD<D> v_trailing, D /*d*/, |
| TFromD<D>* HWY_RESTRICT p, |
| size_t num_lanes_to_store) { |
  // AVX2UIF8Or16StoreTrailingN should only be called for an I8/U8 vector if
  // (num_lanes_to_store & 3) != 0.
| const auto v_full128 = ResizeBitCast(Full128<TFromD<D>>(), v_trailing); |
| if ((num_lanes_to_store & 2) != 0) { |
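    // 2 or 3 trailing lanes: store lane 2 to p[num_lanes_to_store - 1] first.
    // If there are only 2 trailing lanes, that position actually belongs to
    // lane 1, but the two-byte CopyBytes below overwrites it with the correct
    // value, so the extra store is harmless.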
| const uint16_t u16_bits = GetLane(BitCast(Full128<uint16_t>(), v_full128)); |
| p[num_lanes_to_store - 1] = detail::ExtractLane<2>(v_full128); |
| CopyBytes<sizeof(uint16_t)>(&u16_bits, |
| p + (num_lanes_to_store & ~size_t{3})); |
| } else { |
| p[num_lanes_to_store - 1] = GetLane(v_full128); |
| } |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD<D> v_trailing, D /*d*/, |
| TFromD<D>* p, |
| size_t num_lanes_to_store) { |
  // AVX2UIF8Or16StoreTrailingN should only be called for an I16/U16/F16/BF16
  // vector if (num_lanes_to_store & 1) != 0.
| p[num_lanes_to_store - 1] = GetLane(v_trailing); |
| } |
| |
| } // namespace detail |
| |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), |
| HWY_IF_LANES_GT_D(D, 2)> |
| HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* p, size_t max_lanes_to_store) { |
| const size_t num_lanes_to_store = |
| HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)); |
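  // Store the whole 32-bit chunks with a blended i32 store; the 1-3 trailing
  // u8 lanes (or single trailing u16 lane) that do not fill an i32 are
  // handled separately below via AVX2UIF8Or16StoreTrailingN.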
| |
| const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))> |
| d_full; |
| const RebindToUnsigned<decltype(d_full)> du_full; |
| const Repartition<int32_t, decltype(d_full)> di32_full; |
| |
| const auto i32_store_mask = BitCast( |
| di32_full, VecFromMask(du_full, FirstN(du_full, num_lanes_to_store))); |
| const auto vi32 = ResizeBitCast(di32_full, v); |
| |
| #if HWY_COMPILER_MSVC |
| // Work around MSVC compiler bug by using a HWY_FENCE before the BlendedStore |
| HWY_FENCE; |
| #endif |
| |
| BlendedStore(vi32, MaskFromVec(i32_store_mask), di32_full, |
| reinterpret_cast<int32_t*>(p)); |
| |
| constexpr size_t kNumOfLanesPerI32 = 4 / sizeof(TFromD<D>); |
| constexpr size_t kTrailingLenMask = kNumOfLanesPerI32 - 1; |
| const size_t trailing_n = (num_lanes_to_store & kTrailingLenMask); |
| |
| if (trailing_n != 0) { |
| const VFromD<D> v_trailing = ResizeBitCast( |
| d, SlideDownLanes(di32_full, vi32, |
| num_lanes_to_store / kNumOfLanesPerI32)); |
| detail::AVX2UIF8Or16StoreTrailingN(v_trailing, d, p, num_lanes_to_store); |
| } |
| |
| #if HWY_COMPILER_MSVC |
| // Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore |
| HWY_FENCE; |
| #endif |
| |
| detail::MaybeUnpoison(p, num_lanes_to_store); |
| } |
| #endif // HWY_TARGET > HWY_AVX3 |
| #endif // HWY_TARGET <= HWY_AVX2 |
| |
| // ================================================== COMBINE |
| |
| // ------------------------------ Combine (InterleaveLower) |
| |
| // N = N/2 + N/2 (upper half undefined) |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>> |
| HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) { |
| const Half<decltype(d)> dh; |
| const RebindToUnsigned<decltype(dh)> duh; |
| // Treat half-width input as one lane, and expand to two lanes. |
| using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>; |
| const VU lo{BitCast(duh, lo_half).raw}; |
| const VU hi{BitCast(duh, hi_half).raw}; |
| return BitCast(d, InterleaveLower(lo, hi)); |
| } |
| |
| // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero) |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_SPECIAL_FLOAT_D(D)> |
| HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) { |
| const RebindToUnsigned<decltype(d)> du; |
| const Half<decltype(du)> duh; |
| return BitCast(d, VFromD<decltype(du)>{_mm_move_epi64(BitCast(duh, lo).raw)}); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_NOT_SPECIAL_FLOAT_D(D)> |
| HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) { |
| const Half<D> dh; |
| return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw}); |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)> |
| HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) { |
| const RebindToUnsigned<decltype(d)> du; |
| const Half<decltype(du)> duh; |
| return BitCast(d, ZeroExtendVector(du, BitCast(duh, lo))); |
| } |
| #endif |
| |
| // Generic for all vector lengths. |
| template <class D, HWY_X86_IF_EMULATED_D(D)> |
| HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) { |
| const RebindToUnsigned<decltype(d)> du; |
| const Half<decltype(du)> duh; |
| return BitCast(d, ZeroExtendVector(du, BitCast(duh, lo))); |
| } |
| |
| // ------------------------------ Concat full (InterleaveLower) |
| |
| // hiH,hiL loH,loL |-> hiL,loL (= lower halves) |
| template <class D, HWY_IF_V_SIZE_D(D, 16)> |
| HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) { |
| const Repartition<uint64_t, decltype(d)> d64; |
| return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi))); |
| } |
| |
| // hiH,hiL loH,loL |-> hiH,loH (= upper halves) |
| template <class D, HWY_IF_V_SIZE_D(D, 16)> |
| HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) { |
| const Repartition<uint64_t, decltype(d)> d64; |
| return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi))); |
| } |
| |
| // hiH,hiL loH,loL |-> hiL,loH (= inner halves) |
| template <class D, HWY_IF_V_SIZE_D(D, 16)> |
| HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) { |
| return CombineShiftRightBytes<8>(d, hi, lo); |
| } |
| |
| // hiH,hiL loH,loL |-> hiH,loL (= outer halves) |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)> |
| HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) { |
| const Repartition<double, decltype(d)> dd; |
| #if HWY_TARGET >= HWY_SSSE3 |
| return BitCast( |
| d, Vec128<double>{_mm_shuffle_pd(BitCast(dd, lo).raw, BitCast(dd, hi).raw, |
| _MM_SHUFFLE2(1, 0))}); |
| #else |
| // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _pd can do 3/cycle. |
| return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw, |
| BitCast(dd, lo).raw, 1)}); |
| #endif |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API Vec128<float> ConcatUpperLower(D d, Vec128<float> hi, |
| Vec128<float> lo) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| (void)d; |
| return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))}; |
| #else |
| // _mm_shuffle_ps has throughput 1/cycle on SKX, whereas blend can do 3/cycle. |
| const RepartitionToWide<decltype(d)> dd; |
| return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw, |
| BitCast(dd, lo).raw, 1)}); |
| #endif |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API Vec128<double> ConcatUpperLower(D /* tag */, Vec128<double> hi, |
| Vec128<double> lo) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| return Vec128<double>{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))}; |
| #else |
| // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle. |
| return Vec128<double>{_mm_blend_pd(hi.raw, lo.raw, 1)}; |
| #endif |
| } |
| |
| // ------------------------------ Concat partial (Combine, LowerHalf) |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) { |
| const Half<decltype(d)> d2; |
| return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) { |
| const Half<decltype(d)> d2; |
| return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> ConcatLowerUpper(D d, const VFromD<D> hi, |
| const VFromD<D> lo) { |
| const Half<decltype(d)> d2; |
| return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) { |
| const Half<decltype(d)> d2; |
| return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo)); |
| } |
| |
| // ------------------------------ ConcatOdd |
| |
| // 8-bit full |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { |
| const Repartition<uint16_t, decltype(d)> dw; |
| // Right-shift 8 bits per u16 so we can pack. |
| const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi)); |
| const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo)); |
| return VFromD<D>{_mm_packus_epi16(uL.raw, uH.raw)}; |
| } |
| |
| // 8-bit x8 |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { |
| #if HWY_TARGET == HWY_SSE2 |
| const Repartition<uint16_t, decltype(d)> dw; |
| // Right-shift 8 bits per u16 so we can pack. |
| const Vec64<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi)); |
| const Vec64<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo)); |
| return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw), |
| _MM_SHUFFLE(2, 0, 2, 0))}; |
| #else |
| const Repartition<uint32_t, decltype(d)> du32; |
| // Don't care about upper half, no need to zero. |
| alignas(16) const uint8_t kCompactOddU8[8] = {1, 3, 5, 7}; |
| const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU8)); |
| const VFromD<D> L = TableLookupBytes(lo, shuf); |
| const VFromD<D> H = TableLookupBytes(hi, shuf); |
| return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); |
| #endif |
| } |
| |
| // 8-bit x4 |
| template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { |
| #if HWY_TARGET == HWY_SSE2 |
| const Repartition<uint16_t, decltype(d)> dw; |
| const Twice<decltype(dw)> dw_2; |
| // Right-shift 8 bits per u16 so we can pack. |
| const Vec32<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi)); |
| const Vec32<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo)); |
| const Vec64<uint16_t> uHL = Combine(dw_2, uH, uL); |
| return VFromD<D>{_mm_packus_epi16(uHL.raw, uHL.raw)}; |
| #else |
| const Repartition<uint16_t, decltype(d)> du16; |
| // Don't care about upper half, no need to zero. |
| alignas(16) const uint8_t kCompactOddU8[4] = {1, 3}; |
| const VFromD<D> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactOddU8)); |
| const VFromD<D> L = TableLookupBytes(lo, shuf); |
| const VFromD<D> H = TableLookupBytes(hi, shuf); |
| return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H))); |
| #endif |
| } |
| |
// 16-bit full
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
| HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { |
| // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns |
| // 0xFFFF8000, which correctly saturates to 0x8000. |
| const RebindToUnsigned<decltype(d)> du; |
| const Repartition<int32_t, decltype(d)> dw; |
| const Vec128<int32_t> uH = ShiftRight<16>(BitCast(dw, hi)); |
| const Vec128<int32_t> uL = ShiftRight<16>(BitCast(dw, lo)); |
| return BitCast(d, VFromD<decltype(du)>{_mm_packs_epi32(uL.raw, uH.raw)}); |
| } |
| |
| // 16-bit x4 |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { |
| #if HWY_TARGET == HWY_SSE2 |
| // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns |
| // 0xFFFF8000, which correctly saturates to 0x8000. |
| const Repartition<int32_t, decltype(d)> dw; |
| const Vec64<int32_t> uH = ShiftRight<16>(BitCast(dw, hi)); |
| const Vec64<int32_t> uL = ShiftRight<16>(BitCast(dw, lo)); |
| return VFromD<D>{_mm_shuffle_epi32(_mm_packs_epi32(uL.raw, uH.raw), |
| _MM_SHUFFLE(2, 0, 2, 0))}; |
| #else |
| const Repartition<uint32_t, decltype(d)> du32; |
| // Don't care about upper half, no need to zero. |
| alignas(16) const uint8_t kCompactOddU16[8] = {2, 3, 6, 7}; |
| const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU16)); |
| const VFromD<D> L = TableLookupBytes(lo, shuf); |
| const VFromD<D> H = TableLookupBytes(hi, shuf); |
| return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); |
| #endif |
| } |
| |
| // 32-bit full |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)> |
| HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { |
| const RebindToFloat<decltype(d)> df; |
| return BitCast( |
| d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw, |
| _MM_SHUFFLE(3, 1, 3, 1))}); |
| } |
| |
| // Any type x2 |
| template <class D, HWY_IF_LANES_D(D, 2)> |
| HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { |
| return InterleaveUpper(d, lo, hi); |
| } |
| |
| // ------------------------------ ConcatEven (InterleaveLower) |
| |
| // 8-bit full |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { |
| const Repartition<uint16_t, decltype(d)> dw; |
| // Isolate lower 8 bits per u16 so we can pack. |
| const Vec128<uint16_t> mask = Set(dw, 0x00FF); |
| const Vec128<uint16_t> uH = And(BitCast(dw, hi), mask); |
| const Vec128<uint16_t> uL = And(BitCast(dw, lo), mask); |
| return VFromD<D>{_mm_packus_epi16(uL.raw, uH.raw)}; |
| } |
| |
| // 8-bit x8 |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { |
| #if HWY_TARGET == HWY_SSE2 |
| const Repartition<uint16_t, decltype(d)> dw; |
| // Isolate lower 8 bits per u16 so we can pack. |
| const Vec64<uint16_t> mask = Set(dw, 0x00FF); |
| const Vec64<uint16_t> uH = And(BitCast(dw, hi), mask); |
| const Vec64<uint16_t> uL = And(BitCast(dw, lo), mask); |
| return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw), |
| _MM_SHUFFLE(2, 0, 2, 0))}; |
| #else |
| const Repartition<uint32_t, decltype(d)> du32; |
| // Don't care about upper half, no need to zero. |
| alignas(16) const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6}; |
| const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU8)); |
| const VFromD<D> L = TableLookupBytes(lo, shuf); |
| const VFromD<D> H = TableLookupBytes(hi, shuf); |
| return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); |
| #endif |
| } |
| |
| // 8-bit x4 |
| template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { |
| #if HWY_TARGET == HWY_SSE2 |
| const Repartition<uint16_t, decltype(d)> dw; |
| const Twice<decltype(dw)> dw_2; |
| // Isolate lower 8 bits per u16 so we can pack. |
| const Vec32<uint16_t> mask = Set(dw, 0x00FF); |
| const Vec32<uint16_t> uH = And(BitCast(dw, hi), mask); |
| const Vec32<uint16_t> uL = And(BitCast(dw, lo), mask); |
| const Vec64<uint16_t> uHL = Combine(dw_2, uH, uL); |
| return VFromD<D>{_mm_packus_epi16(uHL.raw, uHL.raw)}; |
| #else |
| const Repartition<uint16_t, decltype(d)> du16; |
| // Don't care about upper half, no need to zero. |
| alignas(16) const uint8_t kCompactEvenU8[4] = {0, 2}; |
| const VFromD<D> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactEvenU8)); |
| const VFromD<D> L = TableLookupBytes(lo, shuf); |
| const VFromD<D> H = TableLookupBytes(hi, shuf); |
| return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H))); |
| #endif |
| } |
| |
| // 16-bit full |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { |
| #if HWY_TARGET <= HWY_SSE4 |
| // Isolate lower 16 bits per u32 so we can pack. |
| const RebindToUnsigned<decltype(d)> du; // for float16_t |
| const Repartition<uint32_t, decltype(d)> dw; |
| const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF); |
| const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask); |
| const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask); |
| return BitCast(d, VFromD<decltype(du)>{_mm_packus_epi32(uL.raw, uH.raw)}); |
| #elif HWY_TARGET == HWY_SSE2 |
| const Repartition<uint32_t, decltype(d)> dw; |
| return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))), |
| BitCast(d, ShiftLeft<16>(BitCast(dw, lo)))); |
| #else |
| const RebindToUnsigned<decltype(d)> du; |
  // packs_epi32 saturates 0x8000 to 0x7FFF. Instead, gather the even lanes
  // within each input, then concatenate the lower halves.
| alignas(16) |
| const uint16_t kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C}; |
| const VFromD<D> shuf = BitCast(d, Load(du, kCompactEvenU16)); |
| const VFromD<D> L = TableLookupBytes(lo, shuf); |
| const VFromD<D> H = TableLookupBytes(hi, shuf); |
| return ConcatLowerLower(d, H, L); |
| #endif |
| } |
| |
| // 16-bit x4 |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { |
| #if HWY_TARGET == HWY_SSE2 |
| const Repartition<uint32_t, decltype(d)> dw; |
| return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))), |
| BitCast(d, ShiftLeft<16>(BitCast(dw, lo)))); |
| #else |
| const Repartition<uint32_t, decltype(d)> du32; |
| // Don't care about upper half, no need to zero. |
| alignas(16) const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5}; |
| const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU16)); |
| const VFromD<D> L = TableLookupBytes(lo, shuf); |
| const VFromD<D> H = TableLookupBytes(hi, shuf); |
| return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); |
| #endif |
| } |
| |
| // 32-bit full |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)> |
| HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { |
| const RebindToFloat<decltype(d)> df; |
| return BitCast( |
| d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw, |
| _MM_SHUFFLE(2, 0, 2, 0))}); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> ConcatEven(D /* d */, VFromD<D> hi, VFromD<D> lo) { |
| return VFromD<D>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))}; |
| } |
| |
| // Any T x2 |
| template <class D, HWY_IF_LANES_D(D, 2)> |
| HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { |
| return InterleaveLower(d, lo, hi); |
| } |
| |
| // ------------------------------ DupEven (InterleaveLower) |
| |
| template <typename T> |
| HWY_API Vec128<T, 1> DupEven(const Vec128<T, 1> v) { |
| return v; |
| } |
| |
| template <typename T> |
| HWY_API Vec128<T, 2> DupEven(const Vec128<T, 2> v) { |
| return InterleaveLower(DFromV<decltype(v)>(), v, v); |
| } |
| |
| template <typename V, HWY_IF_T_SIZE_V(V, 1), HWY_IF_V_SIZE_GT_V(V, 2)> |
| HWY_API V DupEven(V v) { |
| const DFromV<decltype(v)> d; |
| |
| #if HWY_TARGET <= HWY_SSSE3 |
| const RebindToUnsigned<decltype(d)> du; |
| const VFromD<decltype(du)> shuffle = Dup128VecFromValues( |
| du, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14); |
| return TableLookupBytes(v, BitCast(d, shuffle)); |
| #else |
| const Repartition<uint16_t, decltype(d)> du16; |
| return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0xFF00})), |
| BitCast(d, ShiftLeft<8>(BitCast(du16, v))), v); |
| #endif |
| } |
| |
| template <typename T, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec64<T> DupEven(const Vec64<T> v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; // for float16_t |
| return BitCast(d, VFromD<decltype(du)>{_mm_shufflelo_epi16( |
| BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0))}); |
| } |
| |
| // Generic for all vector lengths. |
| template <class V, HWY_IF_T_SIZE_V(V, 2)> |
| HWY_API V DupEven(const V v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; // for float16_t |
| #if HWY_TARGET <= HWY_SSSE3 |
| const VFromD<decltype(du)> shuffle = Dup128VecFromValues( |
| du, 0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0d0c, 0x0d0c); |
| return TableLookupBytes(v, BitCast(d, shuffle)); |
| #else |
| return BitCast( |
| d, VFromD<decltype(du)>{_mm_shufflehi_epi16( |
| _mm_shufflelo_epi16(BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0)), |
| _MM_SHUFFLE(2, 2, 0, 0))}); |
| #endif |
| } |
| |
| template <typename T, HWY_IF_UI32(T)> |
| HWY_API Vec128<T> DupEven(Vec128<T> v) { |
| return Vec128<T>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; |
| } |
| |
| HWY_API Vec128<float> DupEven(Vec128<float> v) { |
| return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; |
| } |
| |
| // ------------------------------ DupOdd (InterleaveUpper) |
| |
| template <typename T, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, 1> DupOdd(Vec128<T, 1> v) { |
| return v; |
| } |
| |
| template <typename V, HWY_IF_T_SIZE_V(V, 1), HWY_IF_V_SIZE_GT_V(V, 1)> |
| HWY_API V DupOdd(V v) { |
| const DFromV<decltype(v)> d; |
| |
| #if HWY_TARGET <= HWY_SSSE3 |
| const RebindToUnsigned<decltype(d)> du; |
| const VFromD<decltype(du)> shuffle = Dup128VecFromValues( |
| du, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15); |
| return TableLookupBytes(v, BitCast(d, shuffle)); |
| #else |
| const Repartition<uint16_t, decltype(d)> du16; |
| return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0x00FF})), |
| BitCast(d, ShiftRight<8>(BitCast(du16, v))), v); |
| #endif |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_LANES_LE(N, 4)> |
| HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; // for float16_t |
| return BitCast(d, VFromD<decltype(du)>{_mm_shufflelo_epi16( |
| BitCast(du, v).raw, _MM_SHUFFLE(3, 3, 1, 1))}); |
| } |
| |
| // Generic for all vector lengths. |
| template <typename V, HWY_IF_T_SIZE_V(V, 2), HWY_IF_V_SIZE_GT_V(V, 8)> |
| HWY_API V DupOdd(V v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; // for float16_t |
| #if HWY_TARGET <= HWY_SSSE3 |
| const VFromD<decltype(du)> shuffle = Dup128VecFromValues( |
| du, 0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e); |
| return TableLookupBytes(v, BitCast(d, shuffle)); |
| #else |
| return BitCast( |
| d, VFromD<decltype(du)>{_mm_shufflehi_epi16( |
| _mm_shufflelo_epi16(BitCast(du, v).raw, _MM_SHUFFLE(3, 3, 1, 1)), |
| _MM_SHUFFLE(3, 3, 1, 1))}); |
| #endif |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI32(T)> |
| HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { |
| return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; |
| } |
| template <size_t N> |
| HWY_API Vec128<float, N> DupOdd(Vec128<float, N> v) { |
| return Vec128<float, N>{ |
| _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> |
| HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) { |
| return InterleaveUpper(DFromV<decltype(v)>(), v, v); |
| } |
| |
| // ------------------------------ TwoTablesLookupLanes (DupEven) |
| |
| template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> |
| HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b, |
| Indices128<T, N> idx) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| // TableLookupLanes currently requires table and index vectors to be the same |
| // size, though a half-length index vector would be sufficient here. |
| #if HWY_IS_MSAN |
| const Vec128<T, N> idx_vec{idx.raw}; |
| const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw}; |
| #else |
  // We only keep the lower half of the result, for which idx is valid.
| const Indices128<T, N * 2> idx2{idx.raw}; |
| #endif |
| return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2)); |
| } |
| |
| template <typename T, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, |
| Indices128<T> idx) { |
| #if HWY_TARGET <= HWY_AVX3_DL |
| return Vec128<T>{_mm_permutex2var_epi8(a.raw, idx.raw, b.raw)}; |
| #else // AVX3 or below |
| const DFromV<decltype(a)> d; |
| const Vec128<T> idx_vec{idx.raw}; |
| |
| #if HWY_TARGET <= HWY_SSE4 |
| const Repartition<uint16_t, decltype(d)> du16; |
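  // Valid indices are [0, 32); bit 4 of each index byte selects table b.
  // Shifting the u16 lanes left by 3 moves that bit into each byte's MSB,
  // from which MaskFromVec derives the selection mask used below.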
| const auto sel_hi_mask = |
| MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, idx_vec)))); |
| #else |
| const RebindToSigned<decltype(d)> di; |
| const auto sel_hi_mask = |
| RebindMask(d, BitCast(di, idx_vec) > Set(di, int8_t{15})); |
| #endif |
| |
| const auto lo_lookup_result = TableLookupBytes(a, idx_vec); |
| #if HWY_TARGET <= HWY_AVX3 |
| const Vec128<T> lookup_result{_mm_mask_shuffle_epi8( |
| lo_lookup_result.raw, sel_hi_mask.raw, b.raw, idx_vec.raw)}; |
| return lookup_result; |
| #else |
| const auto hi_lookup_result = TableLookupBytes(b, idx_vec); |
| return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); |
| #endif // HWY_TARGET <= HWY_AVX3 |
| #endif // HWY_TARGET <= HWY_AVX3_DL |
| } |
| |
| template <typename T, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, |
| Indices128<T> idx) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<T>{_mm_permutex2var_epi16(a.raw, idx.raw, b.raw)}; |
| #elif HWY_TARGET == HWY_SSE2 |
| const DFromV<decltype(a)> d; |
| const RebindToSigned<decltype(d)> di; |
| const Vec128<T> idx_vec{idx.raw}; |
| const auto sel_hi_mask = |
| RebindMask(d, BitCast(di, idx_vec) > Set(di, int16_t{7})); |
| const auto lo_lookup_result = TableLookupLanes(a, idx); |
| const auto hi_lookup_result = TableLookupLanes(b, idx); |
| return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); |
| #else |
| const DFromV<decltype(a)> d; |
| const Repartition<uint8_t, decltype(d)> du8; |
| return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), |
| Indices128<uint8_t>{idx.raw})); |
| #endif |
| } |
| |
| template <typename T, HWY_IF_UI32(T)> |
| HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, |
| Indices128<T> idx) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<T>{_mm_permutex2var_epi32(a.raw, idx.raw, b.raw)}; |
| #else // AVX2 or below |
| const DFromV<decltype(a)> d; |
| |
| #if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 |
| const Vec128<T> idx_vec{idx.raw}; |
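  // Valid indices are [0, 8); bit 2 selects table b. On AVX2 it is moved into
  // the f32 sign bit via ShiftLeft<29>; on SSE2 a signed compare against 3
  // detects it instead.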
| |
| #if HWY_TARGET <= HWY_AVX2 |
| const RebindToFloat<decltype(d)> d_sel; |
| const auto sel_hi_mask = MaskFromVec(BitCast(d_sel, ShiftLeft<29>(idx_vec))); |
| #else |
| const RebindToSigned<decltype(d)> d_sel; |
| const auto sel_hi_mask = BitCast(d_sel, idx_vec) > Set(d_sel, int32_t{3}); |
| #endif |
| |
| const auto lo_lookup_result = BitCast(d_sel, TableLookupLanes(a, idx)); |
| const auto hi_lookup_result = BitCast(d_sel, TableLookupLanes(b, idx)); |
| return BitCast(d, |
| IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result)); |
| #else // SSSE3 or SSE4 |
| const Repartition<uint8_t, decltype(d)> du8; |
| return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), |
| Indices128<uint8_t>{idx.raw})); |
| #endif // HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 |
| #endif // HWY_TARGET <= HWY_AVX3 |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| HWY_API Vec128<float16_t> TwoTablesLookupLanes(Vec128<float16_t> a, |
| Vec128<float16_t> b, |
| Indices128<float16_t> idx) { |
| return Vec128<float16_t>{_mm_permutex2var_ph(a.raw, idx.raw, b.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| HWY_API Vec128<float> TwoTablesLookupLanes(Vec128<float> a, Vec128<float> b, |
| Indices128<float> idx) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<float>{_mm_permutex2var_ps(a.raw, idx.raw, b.raw)}; |
| #elif HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 |
| const DFromV<decltype(a)> d; |
| |
| #if HWY_TARGET <= HWY_AVX2 |
| const auto sel_hi_mask = |
| MaskFromVec(BitCast(d, ShiftLeft<29>(Vec128<int32_t>{idx.raw}))); |
| #else |
| const RebindToSigned<decltype(d)> di; |
| const auto sel_hi_mask = |
| RebindMask(d, Vec128<int32_t>{idx.raw} > Set(di, int32_t{3})); |
| #endif |
| |
| const auto lo_lookup_result = TableLookupLanes(a, idx); |
| const auto hi_lookup_result = TableLookupLanes(b, idx); |
| return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); |
| #else // SSSE3 or SSE4 |
| const DFromV<decltype(a)> d; |
| const Repartition<uint8_t, decltype(d)> du8; |
| return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), |
| Indices128<uint8_t>{idx.raw})); |
| #endif |
| } |
| |
| template <typename T, HWY_IF_UI64(T)> |
| HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, |
| Indices128<T> idx) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<T>{_mm_permutex2var_epi64(a.raw, idx.raw, b.raw)}; |
| #else |
| const DFromV<decltype(a)> d; |
| const Vec128<T> idx_vec{idx.raw}; |
| const Indices128<T> idx_mod{And(idx_vec, Set(d, T{1})).raw}; |
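  // Valid indices are [0, 4): bit 0 is the within-table lane (kept in
  // idx_mod), and bit 1 selects table b. Below, bit 1 is moved into the sign
  // bit via ShiftLeft<62> (SSE4+) or detected with a 32-bit compare
  // (SSE2/SSSE3) to drive the final blend.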
| |
| #if HWY_TARGET <= HWY_SSE4 |
| const RebindToFloat<decltype(d)> d_sel; |
| const auto sel_hi_mask = MaskFromVec(BitCast(d_sel, ShiftLeft<62>(idx_vec))); |
| #else // SSE2 or SSSE3 |
| const Repartition<int32_t, decltype(d)> di32; |
| const RebindToSigned<decltype(d)> d_sel; |
| const auto sel_hi_mask = MaskFromVec( |
| BitCast(d_sel, VecFromMask(di32, DupEven(BitCast(di32, idx_vec)) > |
| Set(di32, int32_t{1})))); |
| #endif // HWY_TARGET <= HWY_SSE4 |
| |
| const auto lo_lookup_result = BitCast(d_sel, TableLookupLanes(a, idx_mod)); |
| const auto hi_lookup_result = BitCast(d_sel, TableLookupLanes(b, idx_mod)); |
| return BitCast(d, |
| IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result)); |
| #endif // HWY_TARGET <= HWY_AVX3 |
| } |
| |
| HWY_API Vec128<double> TwoTablesLookupLanes(Vec128<double> a, Vec128<double> b, |
| Indices128<double> idx) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<double>{_mm_permutex2var_pd(a.raw, idx.raw, b.raw)}; |
| #else |
| const DFromV<decltype(a)> d; |
| const RebindToSigned<decltype(d)> di; |
| const Vec128<int64_t> idx_vec{idx.raw}; |
| const Indices128<double> idx_mod{And(idx_vec, Set(di, int64_t{1})).raw}; |
| |
| #if HWY_TARGET <= HWY_SSE4 |
| const auto sel_hi_mask = MaskFromVec(BitCast(d, ShiftLeft<62>(idx_vec))); |
| #else // SSE2 or SSSE3 |
| const Repartition<int32_t, decltype(d)> di32; |
| const auto sel_hi_mask = |
| MaskFromVec(BitCast(d, VecFromMask(di32, DupEven(BitCast(di32, idx_vec)) > |
| Set(di32, int32_t{1})))); |
| #endif // HWY_TARGET <= HWY_SSE4 |
| |
| const auto lo_lookup_result = TableLookupLanes(a, idx_mod); |
| const auto hi_lookup_result = TableLookupLanes(b, idx_mod); |
| return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); |
| #endif // HWY_TARGET <= HWY_AVX3 |
| } |
| |
| // ------------------------------ OddEven (IfThenElse) |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { |
| const DFromV<decltype(a)> d; |
| const Repartition<uint8_t, decltype(d)> d8; |
| alignas(16) static constexpr uint8_t mask[16] = { |
| 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; |
| return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> |
| HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { |
| const DFromV<decltype(a)> d; |
| #if HWY_TARGET >= HWY_SSSE3 |
| const Repartition<uint8_t, decltype(d)> d8; |
| alignas(16) static constexpr uint8_t mask[16] = { |
| 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0}; |
| return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); |
| #else |
| const RebindToUnsigned<decltype(d)> du; // for float16_t |
| return BitCast(d, VFromD<decltype(du)>{_mm_blend_epi16( |
| BitCast(du, a).raw, BitCast(du, b).raw, 0x55)}); |
| #endif |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI32(T)> |
| HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1)); |
| const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0)); |
| return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)}; |
| #else |
| // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _ps can do 3/cycle. |
| const DFromV<decltype(a)> d; |
| const RebindToFloat<decltype(d)> df; |
| return BitCast(d, Vec128<float, N>{_mm_blend_ps(BitCast(df, a).raw, |
| BitCast(df, b).raw, 5)}); |
| #endif |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> |
| HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { |
| // Same as ConcatUpperLower for full vectors; do not call that because this |
| // is more efficient for 64x1 vectors. |
| const DFromV<decltype(a)> d; |
| const RebindToFloat<decltype(d)> dd; |
| #if HWY_TARGET >= HWY_SSSE3 |
| return BitCast( |
| d, Vec128<double, N>{_mm_shuffle_pd( |
| BitCast(dd, b).raw, BitCast(dd, a).raw, _MM_SHUFFLE2(1, 0))}); |
| #else |
| // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle. |
| return BitCast(d, Vec128<double, N>{_mm_blend_pd(BitCast(dd, a).raw, |
| BitCast(dd, b).raw, 1)}); |
| #endif |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> OddEven(Vec128<float, N> a, Vec128<float, N> b) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| // SHUFPS must fill the lower half of the output from one input, so we |
| // need another shuffle. Unpack avoids another immediate byte. |
| const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1)); |
| const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0)); |
| return Vec128<float, N>{_mm_unpacklo_ps(even, odd)}; |
| #else |
| return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)}; |
| #endif |
| } |
| |
| // -------------------------- InterleaveEven |
| |
| template <class D, HWY_IF_LANES_LE_D(D, 2)> |
| HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) { |
| return ConcatEven(d, b, a); |
| } |
| |
| // I8/U8 InterleaveEven is generic for all vector lengths that are >= 4 bytes |
| template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) { |
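  // Shifting b left by 8 within each u16 moves b's even bytes into the odd
  // byte positions; OddEven then takes those odd bytes from the shifted b
  // and the even bytes from a, yielding a0, b0, a2, b2, ...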
| const Repartition<uint16_t, decltype(d)> du16; |
| return OddEven(BitCast(d, ShiftLeft<8>(BitCast(du16, b))), a); |
| } |
| |
| // I16/U16 InterleaveEven is generic for all vector lengths that are >= 8 bytes |
| template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) { |
| const Repartition<uint32_t, decltype(d)> du32; |
| return OddEven(BitCast(d, ShiftLeft<16>(BitCast(du32, b))), a); |
| } |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_UI32_D(D)> |
| HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { |
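  // _mm_mask_shuffle_epi32 keeps a in lanes 0 and 2 (mask 0x0A only writes
  // lanes 1 and 3) and writes the shuffle of b, which places b0 and b2 there,
  // yielding a0, b0, a2, b2.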
| return VFromD<D>{_mm_mask_shuffle_epi32( |
| a.raw, static_cast<__mmask8>(0x0A), b.raw, |
| static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))}; |
| } |
| template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{_mm_mask_shuffle_ps(a.raw, static_cast<__mmask8>(0x0A), |
| b.raw, b.raw, _MM_SHUFFLE(2, 2, 0, 0))}; |
| } |
| #else |
| template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_T_SIZE_D(D, 4)> |
| HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) { |
| const RebindToFloat<decltype(d)> df; |
| const auto b2_b0_a2_a0 = ConcatEven(df, BitCast(df, b), BitCast(df, a)); |
| return BitCast( |
| d, VFromD<decltype(df)>{_mm_shuffle_ps(b2_b0_a2_a0.raw, b2_b0_a2_a0.raw, |
| _MM_SHUFFLE(3, 1, 2, 0))}); |
| } |
| #endif |
| |
| // -------------------------- InterleaveOdd |
| |
| template <class D, HWY_IF_LANES_LE_D(D, 2)> |
| HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) { |
| return ConcatOdd(d, b, a); |
| } |
| |
| // I8/U8 InterleaveOdd is generic for all vector lengths that are >= 4 bytes |
| template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) { |
| const Repartition<uint16_t, decltype(d)> du16; |
| return OddEven(b, BitCast(d, ShiftRight<8>(BitCast(du16, a)))); |
| } |
| |
| // I16/U16 InterleaveOdd is generic for all vector lengths that are >= 8 bytes |
| template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) { |
| const Repartition<uint32_t, decltype(d)> du32; |
| return OddEven(b, BitCast(d, ShiftRight<16>(BitCast(du32, a)))); |
| } |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_UI32_D(D)> |
| HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{_mm_mask_shuffle_epi32( |
| b.raw, static_cast<__mmask8>(0x05), a.raw, |
| static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))}; |
| } |
| template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{_mm_mask_shuffle_ps(b.raw, static_cast<__mmask8>(0x05), |
| a.raw, a.raw, _MM_SHUFFLE(3, 3, 1, 1))}; |
| } |
| #else |
| template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_T_SIZE_D(D, 4)> |
| HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) { |
| const RebindToFloat<decltype(d)> df; |
| const auto b3_b1_a3_a1 = ConcatOdd(df, BitCast(df, b), BitCast(df, a)); |
| return BitCast( |
| d, VFromD<decltype(df)>{_mm_shuffle_ps(b3_b1_a3_a1.raw, b3_b1_a3_a1.raw, |
| _MM_SHUFFLE(3, 1, 2, 0))}); |
| } |
| #endif |
| |
| // ------------------------------ OddEvenBlocks |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) { |
| return even; |
| } |
| |
| // ------------------------------ SwapAdjacentBlocks |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) { |
| return v; |
| } |
| |
| // ------------------------------ InterleaveEvenBlocks |
| template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) { |
| return a; |
| } |
| // ------------------------------ InterleaveOddBlocks |
| template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) { |
| return a; |
| } |
| |
| // ------------------------------ Shl (ZipLower, Mul) |
| |
// Use AVX2/3 variable shifts where available, otherwise multiply by powers of
// two obtained by loading float exponents, which is considerably faster
// (according to LLVM-MCA) than scalar shifts or bit tests:
// https://gcc.godbolt.org/z/9G7Y9v.
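// For example, in Pow2 below, a u16 lane with value 3 becomes 0x0180 after
// ShiftLeft<23 - 16>; adding 0x3F80 (the upper half of 1.0f) gives 0x4100,
// which with a zero lower half is the binary32 0x41000000 = 8.0f. Converting
// back to integer yields 8 = 2^3, the desired per-lane multiplier.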
| |
| namespace detail { |
| |
| #if HWY_TARGET == HWY_AVX2 // Unused for AVX3 - we use sllv directly |
| template <class V> |
| HWY_API V AVX2ShlU16Vec128(V v, V bits) { |
| const DFromV<decltype(v)> d; |
| const Rebind<uint32_t, decltype(d)> du32; |
| return TruncateTo(d, PromoteTo(du32, v) << PromoteTo(du32, bits)); |
| } |
| #elif HWY_TARGET > HWY_AVX2 |
| |
| template <class D32> |
| static HWY_INLINE VFromD<D32> Pow2ConvF32ToI32( |
| D32 d32, VFromD<RebindToFloat<D32>> vf32) { |
| const RebindToSigned<decltype(d32)> di32; |
| #if HWY_COMPILER_GCC_ACTUAL |
  // ConvertInRangeTo is safe with GCC due to the inline assembly workaround
  // used for F32->I32 ConvertInRangeTo.
| return BitCast(d32, ConvertInRangeTo(di32, vf32)); |
| #else |
| // Otherwise, use NearestIntInRange because we rely on the native 0x80..00 |
| // overflow behavior |
| return BitCast(d32, NearestIntInRange(di32, vf32)); |
| #endif |
| } |
| |
| // Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts. |
| template <typename T, HWY_IF_T_SIZE(T, 2)> |
| HWY_INLINE Vec128<MakeUnsigned<T>> Pow2(const Vec128<T> v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const RepartitionToWide<decltype(d)> dw; |
| const Rebind<float, decltype(dw)> df; |
| const auto zero = Zero(d); |
| // Move into exponent (this u16 will become the upper half of an f32) |
| const auto exp = ShiftLeft<23 - 16>(v); |
| const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f |
| // Insert 0 into lower halves for reinterpreting as binary32. |
| const auto f0 = ZipLower(dw, zero, upper); |
| const auto f1 = ZipUpper(dw, zero, upper); |
| // See cvtps comment below. |
| const VFromD<decltype(dw)> bits0 = Pow2ConvF32ToI32(dw, BitCast(df, f0)); |
| const VFromD<decltype(dw)> bits1 = Pow2ConvF32ToI32(dw, BitCast(df, f1)); |
| #if HWY_TARGET <= HWY_SSE4 |
| return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits1.raw)}; |
| #else |
| return ConcatEven(du, BitCast(du, bits1), BitCast(du, bits0)); |
| #endif |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_LANES_LE(N, 4)> |
| HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const Twice<decltype(du)> dt_u; |
| const RepartitionToWide<decltype(dt_u)> dt_w; |
| const RebindToFloat<decltype(dt_w)> dt_f; |
| // Move into exponent (this u16 will become the upper half of an f32) |
| const auto exp = ShiftLeft<23 - 16>(v); |
| const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f |
| // Insert 0 into lower halves for reinterpreting as binary32. |
| const auto f0 = ZipLower(dt_w, Zero(dt_u), ResizeBitCast(dt_u, upper)); |
| // See cvtps comment below. |
| const VFromD<decltype(dt_w)> bits0 = |
| Pow2ConvF32ToI32(dt_w, BitCast(dt_f, f0)); |
| #if HWY_TARGET <= HWY_SSE4 |
| return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits0.raw)}; |
| #elif HWY_TARGET == HWY_SSSE3 |
| alignas(16) |
| const uint16_t kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C}; |
| return TableLookupBytes(bits0, Load(du, kCompactEvenU16)); |
| #else |
| const RebindToSigned<decltype(dt_w)> dt_i32; |
| const auto bits0_i32 = ShiftRight<16>(BitCast(dt_i32, ShiftLeft<16>(bits0))); |
| return VFromD<decltype(du)>{_mm_packs_epi32(bits0_i32.raw, bits0_i32.raw)}; |
| #endif |
| } |
| |
| // Same, for 32-bit shifts. |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> |
| HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) { |
| const DFromV<decltype(v)> d; |
| const RebindToFloat<decltype(d)> df; |
| const auto exp = ShiftLeft<23>(v); |
| const auto f = exp + Set(d, 0x3F800000); // 1.0f |
| // Do not use ConvertTo because we rely on the native 0x80..00 overflow |
| // behavior. |
| return Pow2ConvF32ToI32(d, BitCast(df, f)); |
| } |
| |
| #endif // HWY_TARGET > HWY_AVX2 |
| |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint16_t, N> v, |
| Vec128<uint16_t, N> bits) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<uint16_t, N>{_mm_sllv_epi16(v.raw, bits.raw)}; |
| #elif HWY_TARGET == HWY_AVX2 |
| return AVX2ShlU16Vec128(v, bits); |
| #else |
| return v * Pow2(bits); |
| #endif |
| } |
| |
| #if HWY_TARGET > HWY_AVX3 |
| HWY_API Vec16<uint16_t> Shl(hwy::UnsignedTag /*tag*/, Vec16<uint16_t> v, |
| Vec16<uint16_t> bits) { |
| #if HWY_TARGET <= HWY_SSE4 |
| const Vec16<uint16_t> bits16{_mm_cvtepu16_epi64(bits.raw)}; |
| #else |
| const auto bits16 = And(bits, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFFFF)}); |
| #endif |
| return Vec16<uint16_t>{_mm_sll_epi16(v.raw, bits16.raw)}; |
| } |
| #endif |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| template <class V> |
| HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) { |
| const DFromV<decltype(v)> d; |
| const Rebind<uint16_t, decltype(d)> du16; |
| return TruncateTo(d, PromoteTo(du16, v) << PromoteTo(du16, bits)); |
| } |
| #elif HWY_TARGET <= HWY_AVX2 |
| template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> |
| HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) { |
| const DFromV<decltype(v)> d; |
| const Rebind<uint32_t, decltype(d)> du32; |
| return TruncateTo(d, PromoteTo(du32, v) << PromoteTo(du32, bits)); |
| } |
| template <class V, HWY_IF_V_SIZE_V(V, 16)> |
| HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) { |
| const DFromV<decltype(v)> d; |
| const Half<decltype(d)> dh; |
| const Rebind<uint16_t, decltype(d)> du16; |
| const Rebind<uint32_t, decltype(dh)> dh_u32; |
| |
| const VFromD<decltype(dh_u32)> lo_shl_result = |
| PromoteTo(dh_u32, LowerHalf(dh, v)) |
| << PromoteTo(dh_u32, LowerHalf(dh, bits)); |
| const VFromD<decltype(dh_u32)> hi_shl_result = |
| PromoteTo(dh_u32, UpperHalf(dh, v)) |
| << PromoteTo(dh_u32, UpperHalf(dh, bits)); |
| const VFromD<decltype(du16)> u16_shl_result = ConcatEven( |
| du16, BitCast(du16, hi_shl_result), BitCast(du16, lo_shl_result)); |
| return TruncateTo(d, u16_shl_result); |
| } |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| // 8-bit: may use the Shl overload for uint16_t. |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> Shl(hwy::UnsignedTag tag, Vec128<uint8_t, N> v, |
| Vec128<uint8_t, N> bits) { |
| const DFromV<decltype(v)> d; |
| #if HWY_TARGET <= HWY_AVX3_DL |
| (void)tag; |
| // kMask[i] = 0xFF >> i |
| alignas(16) static constexpr uint8_t kMasks[16] = { |
| 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00}; |
| // kShl[i] = 1 << i |
| alignas(16) static constexpr uint8_t kShl[16] = {1, 2, 4, 8, 0x10, |
| 0x20, 0x40, 0x80, 0x00}; |
| v = And(v, TableLookupBytes(Load(Full64<uint8_t>(), kMasks), bits)); |
| const VFromD<decltype(d)> mul = |
| TableLookupBytes(Load(Full64<uint8_t>(), kShl), bits); |
| return VFromD<decltype(d)>{_mm_gf2p8mul_epi8(v.raw, mul.raw)}; |
| #elif HWY_TARGET <= HWY_AVX2 |
| (void)tag; |
| (void)d; |
| return AVX2ShlU8Vec128(v, bits); |
| #else |
| const Repartition<uint16_t, decltype(d)> dw; |
| using VW = VFromD<decltype(dw)>; |
| const VW even_mask = Set(dw, 0x00FF); |
| const VW odd_mask = Set(dw, 0xFF00); |
| const VW vw = BitCast(dw, v); |
| const VW bits16 = BitCast(dw, bits); |
| // Shift even lanes in-place |
| const VW evens = Shl(tag, vw, And(bits16, even_mask)); |
| const VW odds = Shl(tag, And(vw, odd_mask), ShiftRight<8>(bits16)); |
| return OddEven(BitCast(d, odds), BitCast(d, evens)); |
| #endif |
| } |
| HWY_API Vec128<uint8_t, 1> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint8_t, 1> v, |
| Vec128<uint8_t, 1> bits) { |
| #if HWY_TARGET <= HWY_SSE4 |
| const Vec16<uint16_t> bits8{_mm_cvtepu8_epi64(bits.raw)}; |
| #else |
| const Vec16<uint16_t> bits8 = |
| And(Vec16<uint16_t>{bits.raw}, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFF)}); |
| #endif |
| return Vec128<uint8_t, 1>{_mm_sll_epi16(v.raw, bits8.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint32_t, N> v, |
| Vec128<uint32_t, N> bits) { |
| #if HWY_TARGET >= HWY_SSE4 |
| return v * Pow2(bits); |
| #else |
| return Vec128<uint32_t, N>{_mm_sllv_epi32(v.raw, bits.raw)}; |
| #endif |
| } |
| |
| #if HWY_TARGET >= HWY_SSE4 |
| HWY_API Vec32<uint32_t> Shl(hwy::UnsignedTag /*tag*/, Vec32<uint32_t> v, |
| const Vec32<uint32_t> bits) { |
| #if HWY_TARGET == HWY_SSE4 |
| const Vec32<uint32_t> bits32{_mm_cvtepu32_epi64(bits.raw)}; |
| #else |
| const auto bits32 = |
| Combine(Full64<uint32_t>(), Zero(Full32<uint32_t>()), bits); |
| #endif |
| return Vec32<uint32_t>{_mm_sll_epi32(v.raw, bits32.raw)}; |
| } |
| #endif |
| |
| HWY_API Vec128<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint64_t> v, |
| Vec128<uint64_t> bits) { |
| #if HWY_TARGET >= HWY_SSE4 |
| const DFromV<decltype(v)> d; |
| // Individual shifts and combine |
| const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)}; |
| const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw); |
| const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)}; |
| return ConcatUpperLower(d, out1, out0); |
| #else |
| return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)}; |
| #endif |
| } |
| HWY_API Vec64<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec64<uint64_t> v, |
| Vec64<uint64_t> bits) { |
| return Vec64<uint64_t>{_mm_sll_epi64(v.raw, bits.raw)}; |
| } |
| |
| // Signed left shift is the same as unsigned. |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v, |
| Vec128<T, N> bits) { |
| const DFromV<decltype(v)> di; |
| const RebindToUnsigned<decltype(di)> du; |
| return BitCast(di, |
| Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits))); |
| } |
| |
| } // namespace detail |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) { |
| return detail::Shl(hwy::TypeTag<T>(), v, bits); |
| } |
| |
| // ------------------------------ Shr (mul, mask, BroadcastSignBit) |
| |
// Use AVX2+ variable shifts where available. On SSE2/SSSE3/SSE4, instead use
// widening multiplication by powers of two obtained by loading float
// exponents, followed by a constant right-shift. This is still faster than a
// scalar or bit-test approach: https://gcc.godbolt.org/z/9G7Y9v.
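// For example (sketch, using ops defined in this file): for s in [1, 15],
// MulHigh(x, Pow2(16 - s)) == (x * 2^(16-s)) >> 16 == x >> s; e.g. x = 0x8000,
// s = 3: MulHigh(0x8000, 0x2000) == 0x10000000 >> 16 == 0x1000 == 0x8000 >> 3.
// s == 0 would require multiplying by 2^16, so the u16 overload below fixes up
// bits == 0 lanes via IfThenElse.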
| |
| #if HWY_TARGET <= HWY_AVX2 |
| namespace detail { |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| template <class V> |
| HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) { |
| const DFromV<decltype(v)> d; |
| const Rebind<uint16_t, decltype(d)> du16; |
| const RebindToSigned<decltype(du16)> di16; |
| return DemoteTo(d, |
| BitCast(di16, PromoteTo(du16, v) >> PromoteTo(du16, bits))); |
| } |
| #else // AVX2 |
| template <class V> |
| HWY_INLINE V AVX2ShrU16Vec128(V v, V bits) { |
| const DFromV<decltype(v)> d; |
| const Rebind<uint32_t, decltype(d)> du32; |
| const RebindToSigned<decltype(du32)> di32; |
| return DemoteTo(d, |
| BitCast(di32, PromoteTo(du32, v) >> PromoteTo(du32, bits))); |
| } |
| template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> |
| HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) { |
| const DFromV<decltype(v)> d; |
| const Rebind<uint32_t, decltype(d)> du32; |
| const RebindToSigned<decltype(du32)> di32; |
| return DemoteTo(d, |
| BitCast(di32, PromoteTo(du32, v) >> PromoteTo(du32, bits))); |
| } |
| template <class V, HWY_IF_V_SIZE_V(V, 16)> |
| HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) { |
| const DFromV<decltype(v)> d; |
| const Half<decltype(d)> dh; |
| const Rebind<int16_t, decltype(d)> di16; |
| const Rebind<uint16_t, decltype(d)> du16; |
| const Rebind<int32_t, decltype(dh)> dh_i32; |
| const Rebind<uint32_t, decltype(dh)> dh_u32; |
| |
| const auto lo_shr_result = |
| BitCast(dh_i32, PromoteTo(dh_u32, LowerHalf(dh, v)) >> |
| PromoteTo(dh_u32, LowerHalf(dh, bits))); |
| const auto hi_shr_result = |
| BitCast(dh_i32, PromoteTo(dh_u32, UpperHalf(dh, v)) >> |
| PromoteTo(dh_u32, UpperHalf(dh, bits))); |
| const auto i16_shr_result = |
| BitCast(di16, OrderedDemote2To(du16, lo_shr_result, hi_shr_result)); |
| return DemoteTo(d, i16_shr_result); |
| } |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| } // namespace detail |
| #endif // HWY_TARGET <= HWY_AVX2 |
| |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> operator>>(Vec128<uint16_t, N> in, |
| const Vec128<uint16_t, N> bits) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<uint16_t, N>{_mm_srlv_epi16(in.raw, bits.raw)}; |
| #elif HWY_TARGET <= HWY_AVX2 |
| return detail::AVX2ShrU16Vec128(in, bits); |
| #else |
| const DFromV<decltype(in)> d; |
| // For bits=0, we cannot mul by 2^16, so fix the result later. |
| const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits)); |
| // Replace output with input where bits == 0. |
| return IfThenElse(bits == Zero(d), in, out); |
| #endif |
| } |
| |
| #if HWY_TARGET > HWY_AVX3 |
| HWY_API Vec16<uint16_t> operator>>(const Vec16<uint16_t> in, |
| const Vec16<uint16_t> bits) { |
| #if HWY_TARGET <= HWY_SSE4 |
| const Vec16<uint16_t> bits16{_mm_cvtepu16_epi64(bits.raw)}; |
| #else |
| const auto bits16 = And(bits, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFFFF)}); |
| #endif |
| return Vec16<uint16_t>{_mm_srl_epi16(in.raw, bits16.raw)}; |
| } |
| #endif |
| |
| // 8-bit uses 16-bit shifts. |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> operator>>(Vec128<uint8_t, N> in, |
| const Vec128<uint8_t, N> bits) { |
| #if HWY_TARGET <= HWY_AVX2 |
| return detail::AVX2ShrU8Vec128(in, bits); |
| #else |
| const DFromV<decltype(in)> d; |
| const Repartition<uint16_t, decltype(d)> dw; |
| using VW = VFromD<decltype(dw)>; |
| const VW mask = Set(dw, 0x00FF); |
| const VW vw = BitCast(dw, in); |
| const VW bits16 = BitCast(dw, bits); |
| const VW evens = And(vw, mask) >> And(bits16, mask); |
| // Shift odd lanes in-place |
| const VW odds = vw >> ShiftRight<8>(bits16); |
| return OddEven(BitCast(d, odds), BitCast(d, evens)); |
| #endif |
| } |
| HWY_API Vec128<uint8_t, 1> operator>>(const Vec128<uint8_t, 1> in, |
| const Vec128<uint8_t, 1> bits) { |
| #if HWY_TARGET <= HWY_SSE4 |
| const Vec16<uint16_t> in8{_mm_cvtepu8_epi16(in.raw)}; |
| const Vec16<uint16_t> bits8{_mm_cvtepu8_epi64(bits.raw)}; |
| #else |
| const Vec16<uint16_t> mask{_mm_set_epi64x(0, 0xFF)}; |
| const Vec16<uint16_t> in8 = And(Vec16<uint16_t>{in.raw}, mask); |
| const Vec16<uint16_t> bits8 = And(Vec16<uint16_t>{bits.raw}, mask); |
| #endif |
| return Vec128<uint8_t, 1>{_mm_srl_epi16(in8.raw, bits8.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> in, |
| const Vec128<uint32_t, N> bits) { |
| #if HWY_TARGET >= HWY_SSE4 |
| // 32x32 -> 64 bit mul, then shift right by 32. |
| const DFromV<decltype(in)> d32; |
| // Move odd lanes into position for the second mul. Shuffle more gracefully |
| // handles N=1 than repartitioning to u64 and shifting 32 bits right. |
| const Vec128<uint32_t, N> in31{_mm_shuffle_epi32(in.raw, 0x31)}; |
| // For bits=0, we cannot mul by 2^32, so fix the result later. |
| const auto mul = detail::Pow2(Set(d32, 32) - bits); |
| const auto out20 = ShiftRight<32>(MulEven(in, mul)); // z 2 z 0 |
| const Vec128<uint32_t, N> mul31{_mm_shuffle_epi32(mul.raw, 0x31)}; |
| // No need to shift right, already in the correct position. |
| const auto out31 = BitCast(d32, MulEven(in31, mul31)); // 3 ? 1 ? |
| const Vec128<uint32_t, N> out = OddEven(out31, BitCast(d32, out20)); |
| // Replace output with input where bits == 0. |
| return IfThenElse(bits == Zero(d32), in, out); |
| #else |
| return Vec128<uint32_t, N>{_mm_srlv_epi32(in.raw, bits.raw)}; |
| #endif |
| } |
| |
| #if HWY_TARGET >= HWY_SSE4 |
| HWY_API Vec128<uint32_t, 1> operator>>(const Vec128<uint32_t, 1> in, |
| const Vec128<uint32_t, 1> bits) { |
| #if HWY_TARGET == HWY_SSE4 |
| const Vec32<uint32_t> bits32{_mm_cvtepu32_epi64(bits.raw)}; |
| #else |
| const auto bits32 = |
| Combine(Full64<uint32_t>(), Zero(Full32<uint32_t>()), bits); |
| #endif |
| return Vec128<uint32_t, 1>{_mm_srl_epi32(in.raw, bits32.raw)}; |
| } |
| #endif |
| |
| HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v, |
| const Vec128<uint64_t> bits) { |
| #if HWY_TARGET >= HWY_SSE4 |
| const DFromV<decltype(v)> d; |
| // Individual shifts and combine |
| const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)}; |
| const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw); |
| const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)}; |
| return ConcatUpperLower(d, out1, out0); |
| #else |
| return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)}; |
| #endif |
| } |
| HWY_API Vec64<uint64_t> operator>>(const Vec64<uint64_t> v, |
| const Vec64<uint64_t> bits) { |
| return Vec64<uint64_t>{_mm_srl_epi64(v.raw, bits.raw)}; |
| } |
| |
| namespace detail { |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| template <class V> |
| HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) { |
| const DFromV<decltype(v)> d; |
| const Rebind<int16_t, decltype(d)> di16; |
| return DemoteTo(d, PromoteTo(di16, v) >> PromoteTo(di16, bits)); |
| } |
| #elif HWY_TARGET <= HWY_AVX2 // AVX2 |
| template <class V> |
| HWY_INLINE V AVX2ShrI16Vec128(V v, V bits) { |
| const DFromV<decltype(v)> d; |
| const Rebind<int32_t, decltype(d)> di32; |
| return DemoteTo(d, PromoteTo(di32, v) >> PromoteTo(di32, bits)); |
| } |
| template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> |
| HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) { |
| const DFromV<decltype(v)> d; |
| const Rebind<int32_t, decltype(d)> di32; |
| return DemoteTo(d, PromoteTo(di32, v) >> PromoteTo(di32, bits)); |
| } |
| template <class V, HWY_IF_V_SIZE_V(V, 16)> |
| HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) { |
| const DFromV<decltype(v)> d; |
| const Half<decltype(d)> dh; |
| const Rebind<int16_t, decltype(d)> di16; |
| const Rebind<int32_t, decltype(dh)> dh_i32; |
| |
| const auto lo_shr_result = PromoteTo(dh_i32, LowerHalf(dh, v)) >> |
| PromoteTo(dh_i32, LowerHalf(dh, bits)); |
| const auto hi_shr_result = PromoteTo(dh_i32, UpperHalf(dh, v)) >> |
| PromoteTo(dh_i32, UpperHalf(dh, bits)); |
| const auto i16_shr_result = |
| OrderedDemote2To(di16, lo_shr_result, hi_shr_result); |
| return DemoteTo(d, i16_shr_result); |
| } |
| #endif |
| |
| #if HWY_TARGET > HWY_AVX3 |
| // Also used in x86_256-inl.h. |
| template <class DI, class V> |
| HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) { |
| const RebindToUnsigned<DI> du; |
| const auto count = BitCast(du, count_i); // same type as value to shift |
| // Clear sign and restore afterwards. This is preferable to shifting the MSB |
| // downwards because Shr is somewhat more expensive than Shl. |
| const auto sign = BroadcastSignBit(v); |
| const auto abs = BitCast(du, v ^ sign); // off by one, but fixed below |
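  // For non-negative v, sign == 0 and this is a plain logical shift. For
  // negative v, sign is all-ones, so abs == ~v (i.e. |v| - 1, hence "off by
  // one"), and Xor-ing the shifted result with sign again computes
  // ~((~v) >> count), which equals the arithmetic shift of v. Example with
  // 8-bit lanes: v = 0xFA (-6), count = 1: ~v = 0x05, 0x05 >> 1 = 0x02,
  // ~0x02 = 0xFD (-3), as expected.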
| return BitCast(di, abs >> count) ^ sign; |
| } |
| #endif |
| |
| } // namespace detail |
| |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> operator>>(Vec128<int16_t, N> v, |
| Vec128<int16_t, N> bits) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<int16_t, N>{_mm_srav_epi16(v.raw, bits.raw)}; |
| #elif HWY_TARGET <= HWY_AVX2 |
| return detail::AVX2ShrI16Vec128(v, bits); |
| #else |
| const DFromV<decltype(v)> d; |
| return detail::SignedShr(d, v, bits); |
| #endif |
| } |
| |
| #if HWY_TARGET > HWY_AVX3 |
| HWY_API Vec16<int16_t> operator>>(Vec16<int16_t> v, Vec16<int16_t> bits) { |
| #if HWY_TARGET <= HWY_SSE4 |
| const Vec16<int16_t> bits16{_mm_cvtepu16_epi64(bits.raw)}; |
| #else |
| const auto bits16 = And(bits, Vec16<int16_t>{_mm_set_epi64x(0, 0xFFFF)}); |
| #endif |
| return Vec16<int16_t>{_mm_sra_epi16(v.raw, bits16.raw)}; |
| } |
| #endif |
| |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> operator>>(Vec128<int8_t, N> v, |
| Vec128<int8_t, N> bits) { |
| #if HWY_TARGET <= HWY_AVX2 |
| return detail::AVX2ShrI8Vec128(v, bits); |
| #else |
| const DFromV<decltype(v)> d; |
| return detail::SignedShr(d, v, bits); |
| #endif |
| } |
| HWY_API Vec128<int8_t, 1> operator>>(Vec128<int8_t, 1> v, |
| Vec128<int8_t, 1> bits) { |
| #if HWY_TARGET <= HWY_SSE4 |
| const Vec16<int16_t> vi16{_mm_cvtepi8_epi16(v.raw)}; |
| const Vec16<uint16_t> bits8{_mm_cvtepu8_epi64(bits.raw)}; |
| #else |
| const DFromV<decltype(v)> d; |
| const Rebind<int16_t, decltype(d)> di16; |
| const Twice<decltype(d)> dt; |
| |
| const auto vi16 = ShiftRight<8>(BitCast(di16, Combine(dt, v, v))); |
| const Vec16<uint16_t> bits8 = |
| And(Vec16<uint16_t>{bits.raw}, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFF)}); |
| #endif |
| return Vec128<int8_t, 1>{_mm_sra_epi16(vi16.raw, bits8.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> operator>>(Vec128<int32_t, N> v, |
| Vec128<int32_t, N> bits) { |
| #if HWY_TARGET <= HWY_AVX2 |
| return Vec128<int32_t, N>{_mm_srav_epi32(v.raw, bits.raw)}; |
| #else |
| const DFromV<decltype(v)> d; |
| return detail::SignedShr(d, v, bits); |
| #endif |
| } |
| |
| #if HWY_TARGET > HWY_AVX2 |
| HWY_API Vec32<int32_t> operator>>(Vec32<int32_t> v, Vec32<int32_t> bits) { |
| #if HWY_TARGET == HWY_SSE4 |
| const Vec32<uint32_t> bits32{_mm_cvtepu32_epi64(bits.raw)}; |
| #else |
| const auto bits32 = Combine(Full64<int32_t>(), Zero(Full32<int32_t>()), bits); |
| #endif |
| return Vec32<int32_t>{_mm_sra_epi32(v.raw, bits32.raw)}; |
| } |
| #endif |
| |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> operator>>(Vec128<int64_t, N> v, |
| Vec128<int64_t, N> bits) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Vec128<int64_t, N>{_mm_srav_epi64(v.raw, bits.raw)}; |
| #else |
| const DFromV<decltype(v)> d; |
| return detail::SignedShr(d, v, bits); |
| #endif |
| } |
| |
| // ------------------------------ MulEven/Odd 64x64 (UpperHalf) |
| |
| namespace detail { |
| |
| template <class V, HWY_IF_U64(TFromV<V>)> |
| static HWY_INLINE V SSE2Mul128(V a, V b, V& mulH) { |
| const DFromV<decltype(a)> du64; |
| const RepartitionToNarrow<decltype(du64)> du32; |
| const auto maskL = Set(du64, 0xFFFFFFFFULL); |
| const auto a32 = BitCast(du32, a); |
| const auto b32 = BitCast(du32, b); |
| // Inputs for MulEven: we only need the lower 32 bits |
| const auto aH = Shuffle2301(a32); |
| const auto bH = Shuffle2301(b32); |
| |
  // Knuth double-word multiplication. We use 32x32 -> 64 bit MulEven and only
  // need the even results (the lower 64 bits of every 128-bit block). See
| // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt |
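  // With a = aH*2^32 + aL and b = bH*2^32 + bL, the full product is
  // aH*bH*2^64 + (aH*bL + aL*bH)*2^32 + aL*bL. Below: aLbL = aL*bL,
  // t2 = aH*bL + (aLbL >> 32), t = aL*bH + (t2 & 0xFFFFFFFF); the high half is
  // aH*bH + (t2 >> 32) + (t >> 32) and the low half is
  // (t << 32) + (aLbL & 0xFFFFFFFF).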
| const auto aLbL = MulEven(a32, b32); |
| const auto w3 = aLbL & maskL; |
| |
| const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL); |
| const auto w2 = t2 & maskL; |
| const auto w1 = ShiftRight<32>(t2); |
| |
| const auto t = MulEven(a32, bH) + w2; |
| const auto k = ShiftRight<32>(t); |
| |
| mulH = MulEven(aH, bH) + w1 + k; |
| return ShiftLeft<32>(t) + w3; |
| } |
| |
| template <class V, HWY_IF_I64(TFromV<V>)> |
| static HWY_INLINE V SSE2Mul128(V a, V b, V& mulH) { |
| const DFromV<decltype(a)> di64; |
| const RebindToUnsigned<decltype(di64)> du64; |
| using VU64 = VFromD<decltype(du64)>; |
| |
| VU64 unsigned_mulH; |
| const auto mulL = BitCast( |
| di64, SSE2Mul128(BitCast(du64, a), BitCast(du64, b), unsigned_mulH)); |
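  // The low 64 bits are the same for signed and unsigned multiplication. For
  // the high half, interpreting an operand as signed subtracts the other
  // operand whenever it is negative: mulH_s = mulH_u - (a < 0 ? b : 0) -
  // (b < 0 ? a : 0), implemented via BroadcastSignBit masks.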
| mulH = BitCast(di64, unsigned_mulH) - And(BroadcastSignBit(a), b) - |
| And(a, BroadcastSignBit(b)); |
| return mulL; |
| } |
| |
| } // namespace detail |
| |
| #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 |
| |
| template <class V, HWY_IF_UI64(TFromV<V>), |
| HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 16 : 8))> |
| HWY_API V MulEven(V a, V b) { |
| V mulH; |
| const V mulL = detail::SSE2Mul128(a, b, mulH); |
| return InterleaveLower(mulL, mulH); |
| } |
| |
| template <class V, HWY_IF_UI64(TFromV<V>), |
| HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 16 : 8))> |
| HWY_API V MulOdd(V a, V b) { |
| const DFromV<decltype(a)> du64; |
| V mulH; |
| const V mulL = detail::SSE2Mul128(a, b, mulH); |
| return InterleaveUpper(du64, mulL, mulH); |
| } |
| |
| #endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 |
| |
| template <class V, HWY_IF_UI64(TFromV<V>), |
| HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 8 : 0))> |
| HWY_API V MulHigh(V a, V b) { |
| V mulH; |
| detail::SSE2Mul128(a, b, mulH); |
| return mulH; |
| } |
| |
| #if HWY_ARCH_X86_64 |
| |
| template <class T, HWY_IF_UI64(T)> |
| HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) { |
| const DFromV<decltype(a)> d; |
| alignas(16) T mul[2]; |
| mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]); |
| return Load(d, mul); |
| } |
| |
| template <class T, HWY_IF_UI64(T)> |
| HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) { |
| const DFromV<decltype(a)> d; |
| const Half<decltype(d)> d2; |
| alignas(16) T mul[2]; |
| const T a1 = GetLane(UpperHalf(d2, a)); |
| const T b1 = GetLane(UpperHalf(d2, b)); |
| mul[0] = Mul128(a1, b1, &mul[1]); |
| return Load(d, mul); |
| } |
| |
| template <class T, HWY_IF_UI64(T)> |
| HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) { |
| T hi; |
| Mul128(GetLane(a), GetLane(b), &hi); |
| return Vec64<T>{_mm_cvtsi64_si128(static_cast<int64_t>(hi))}; |
| } |
| |
| #endif // HWY_ARCH_X86_64 |
| |
| // ================================================== CONVERT (2) |
| |
| // ------------------------------ PromoteEvenTo/PromoteOddTo |
| |
| #if HWY_TARGET > HWY_AVX3 |
| namespace detail { |
| |
| // I32->I64 PromoteEvenTo/PromoteOddTo |
| |
| template <class D, HWY_IF_LANES_D(D, 1)> |
| HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/, |
| hwy::SizeTag<8> /*to_lane_size_tag*/, |
| hwy::SignedTag /*from_type_tag*/, D d_to, |
| Vec64<int32_t> v) { |
| return PromoteLowerTo(d_to, v); |
| } |
| |
| template <class D, HWY_IF_LANES_D(D, 2)> |
| HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/, |
| hwy::SizeTag<8> /*to_lane_size_tag*/, |
| hwy::SignedTag /*from_type_tag*/, D d_to, |
| Vec128<int32_t> v) { |
| const Repartition<int32_t, D> d_from; |
| return PromoteLowerTo(d_to, ConcatEven(d_from, v, v)); |
| } |
| |
| template <class D, class V, HWY_IF_LANES_LE_D(D, 2)> |
| HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/, |
| hwy::SizeTag<8> /*to_lane_size_tag*/, |
| hwy::SignedTag /*from_type_tag*/, D d_to, |
| V v) { |
| const Repartition<int32_t, D> d_from; |
| return PromoteLowerTo(d_to, ConcatOdd(d_from, v, v)); |
| } |
| |
| } // namespace detail |
| #endif |
| |
| // ------------------------------ PromoteEvenTo/PromoteOddTo |
| #include "third_party/highway/hwy/ops/inside-inl.h" |
| |
| // ------------------------------ WidenMulPairwiseAdd (PromoteEvenTo) |
| |
| #if HWY_NATIVE_DOT_BF16 |
| |
| template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_LE_D(DF, 16), |
| class VBF = VFromD<Repartition<bfloat16_t, DF>>> |
| HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) { |
| return VFromD<DF>{_mm_dpbf16_ps(Zero(df).raw, |
| reinterpret_cast<__m128bh>(a.raw), |
| reinterpret_cast<__m128bh>(b.raw))}; |
| } |
| |
| #else |
| |
| // Generic for all vector lengths. |
| template <class DF, HWY_IF_F32_D(DF), |
| class VBF = VFromD<Repartition<bfloat16_t, DF>>> |
| HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) { |
| return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), |
| Mul(PromoteOddTo(df, a), PromoteOddTo(df, b))); |
| } |
| |
| #endif // HWY_NATIVE_DOT_BF16 |
| |
| // Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe. |
| template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16), |
| class V16 = VFromD<RepartitionToNarrow<D32>>> |
| HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) { |
| return VFromD<D32>{_mm_madd_epi16(a.raw, b.raw)}; |
| } |
| |
| // Generic for all vector lengths. |
| template <class DU32, HWY_IF_U32_D(DU32), |
| class VU16 = VFromD<RepartitionToNarrow<DU32>>> |
| HWY_API VFromD<DU32> WidenMulPairwiseAdd(DU32 du32, VU16 a, VU16 b) { |
| const auto p_lo = a * b; |
| const auto p_hi = MulHigh(a, b); |
| |
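  // Each widened product is (p_hi << 16) | p_lo. The two combinations below
  // pair the high half of one product with the low half of its neighbor;
  // adding them reproduces prod_even + prod_odd (mod 2^32) per u32 lane, i.e.
  // the pairwise widened sum.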
| const auto p_hi1_lo0 = BitCast(du32, OddEven(p_hi, p_lo)); |
| const auto p_hi0_lo1 = Or(ShiftLeft<16>(BitCast(du32, p_hi)), |
| ShiftRight<16>(BitCast(du32, p_lo))); |
| return Add(BitCast(du32, p_hi1_lo0), BitCast(du32, p_hi0_lo1)); |
| } |
| |
| // ------------------------------ SatWidenMulPairwiseAdd |
| |
| #if HWY_TARGET <= HWY_SSSE3 |
| |
| #ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD |
| #undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD |
| #else |
| #define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD |
| #endif |
| |
| // Even if N=1, the input is always at least 2 lanes, hence _mm_maddubs_epi16 |
| // is safe. |
| template <class DI16, HWY_IF_I16_D(DI16), HWY_IF_V_SIZE_LE_D(DI16, 16)> |
| HWY_API VFromD<DI16> SatWidenMulPairwiseAdd( |
| DI16 /* tag */, VFromD<Repartition<uint8_t, DI16>> a, |
| VFromD<Repartition<int8_t, DI16>> b) { |
| return VFromD<DI16>{_mm_maddubs_epi16(a.raw, b.raw)}; |
| } |
| |
| #endif |
| |
| // ------------------------------ SatWidenMulPairwiseAccumulate |
| |
| #if HWY_TARGET <= HWY_AVX3_DL |
| |
| #ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM |
| #undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM |
| #else |
| #define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM |
| #endif |
| |
| // Even if N=1, the I16 vectors have at least 2 lanes, hence _mm_dpwssds_epi32 |
| // is safe. |
| template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)> |
| HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate( |
| DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a, |
| VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) { |
| return VFromD<DI32>{_mm_dpwssds_epi32(sum.raw, a.raw, b.raw)}; |
| } |
| |
| #endif // HWY_TARGET <= HWY_AVX3_DL |
| |
| // ------------------------------ ReorderWidenMulAccumulate (PromoteEvenTo) |
| |
| #if HWY_NATIVE_DOT_BF16 |
| |
| #ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16 |
| #undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16 |
| #else |
| #define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16 |
| #endif |
| |
| template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_LE_D(DF, 16), |
| class VBF = VFromD<Repartition<bfloat16_t, DF>>> |
| HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF /*df*/, VBF a, VBF b, |
| const VFromD<DF> sum0, |
| VFromD<DF>& /*sum1*/) { |
| return VFromD<DF>{_mm_dpbf16_ps(sum0.raw, reinterpret_cast<__m128bh>(a.raw), |
| reinterpret_cast<__m128bh>(b.raw))}; |
| } |
| |
| #endif // HWY_NATIVE_DOT_BF16 |
| |
| // Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe. |
| template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16), |
| class V16 = VFromD<RepartitionToNarrow<D32>>> |
| HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d, V16 a, V16 b, |
| const VFromD<D32> sum0, |
| VFromD<D32>& /*sum1*/) { |
| (void)d; |
| #if HWY_TARGET <= HWY_AVX3_DL |
| return VFromD<D32>{_mm_dpwssd_epi32(sum0.raw, a.raw, b.raw)}; |
| #else |
| return sum0 + WidenMulPairwiseAdd(d, a, b); |
| #endif |
| } |
| |
| template <class DU32, HWY_IF_U32_D(DU32), |
| class VU16 = VFromD<RepartitionToNarrow<DU32>>> |
| HWY_API VFromD<DU32> ReorderWidenMulAccumulate(DU32 d, VU16 a, VU16 b, |
| const VFromD<DU32> sum0, |
| VFromD<DU32>& /*sum1*/) { |
| (void)d; |
| return sum0 + WidenMulPairwiseAdd(d, a, b); |
| } |
| |
| // ------------------------------ RearrangeToOddPlusEven |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(const Vec128<int32_t, N> sum0, |
| Vec128<int32_t, N> /*sum1*/) { |
| return sum0; // invariant already holds |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> RearrangeToOddPlusEven( |
| const Vec128<uint32_t, N> sum0, Vec128<uint32_t, N> /*sum1*/) { |
| return sum0; // invariant already holds |
| } |
| |
| template <class VW> |
| HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { |
| return Add(sum0, sum1); |
| } |
| |
| // ------------------------------ SumOfMulQuadAccumulate |
| #if HWY_TARGET <= HWY_AVX3_DL |
| |
| #ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE |
| #undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE |
| #else |
| #define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE |
| #endif |
| |
| template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)> |
| HWY_API VFromD<DI32> SumOfMulQuadAccumulate( |
| DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u, |
| VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) { |
| return VFromD<DI32>{_mm_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)}; |
| } |
| |
| #ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE |
| #undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE |
| #else |
| #define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE |
| #endif |
| template <class DI32, HWY_IF_I32_D(DI32)> |
| HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32, |
| VFromD<Repartition<int8_t, DI32>> a, |
| VFromD<Repartition<int8_t, DI32>> b, |
| VFromD<DI32> sum) { |
| // TODO(janwas): AVX-VNNI-INT8 has dpbssd. |
| const Repartition<uint8_t, decltype(di32)> du8; |
| |
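  // Reinterpreting a as unsigned: a_i == a_u - 256 * (a_u >> 7). Hence
  // sum(a_i * b) == sum(a_u * b) - 256 * sum((a_u >> 7) * b); the second dot
  // product is computed with a zero accumulator and shifted left by 8.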
| const auto a_u = BitCast(du8, a); |
| const auto result_sum_0 = SumOfMulQuadAccumulate(di32, a_u, b, sum); |
| const auto result_sum_1 = ShiftLeft<8>( |
| SumOfMulQuadAccumulate(di32, ShiftRight<7>(a_u), b, Zero(di32))); |
| return result_sum_0 - result_sum_1; |
| } |
| |
| #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE |
| #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE |
| #else |
| #define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE |
| #endif |
| template <class DU32, HWY_IF_U32_D(DU32)> |
| HWY_API VFromD<DU32> SumOfMulQuadAccumulate( |
| DU32 du32, VFromD<Repartition<uint8_t, DU32>> a, |
| VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) { |
| // TODO(janwas): AVX-VNNI-INT8 has dpbuud. |
| const Repartition<uint8_t, decltype(du32)> du8; |
| const RebindToSigned<decltype(du8)> di8; |
| const RebindToSigned<decltype(du32)> di32; |
| |
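  // Reinterpreting b as signed: b_u == b_i + 256 * (b_i < 0 ? 1 : 0).
  // BroadcastSignBit(b_i) is -1 for negative lanes, so the second dot product
  // equals -sum(a over lanes where b_i < 0); subtracting it (shifted left by
  // 8) adds back the missing 256 * a for those lanes.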
| const auto b_i = BitCast(di8, b); |
| const auto result_sum_0 = |
| SumOfMulQuadAccumulate(di32, a, b_i, BitCast(di32, sum)); |
| const auto result_sum_1 = ShiftLeft<8>( |
| SumOfMulQuadAccumulate(di32, a, BroadcastSignBit(b_i), Zero(di32))); |
| |
| return BitCast(du32, result_sum_0 - result_sum_1); |
| } |
| |
| #endif // HWY_TARGET <= HWY_AVX3_DL |
| |
| // ------------------------------ Demotions (full -> part w/ narrow lanes) |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { |
| return VFromD<D>{_mm_packs_epi32(v.raw, v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| const Rebind<int32_t, D> di32; |
| const auto zero_if_neg = AndNot(ShiftRight<31>(v), v); |
| const auto too_big = VecFromMask(di32, Gt(v, Set(di32, 0xFFFF))); |
| const auto clamped = Or(zero_if_neg, too_big); |
| #if HWY_TARGET == HWY_SSE2 |
| const Rebind<uint16_t, decltype(di32)> du16; |
| const RebindToSigned<decltype(du16)> di16; |
| return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped)))); |
| #else |
| const Repartition<uint16_t, decltype(di32)> du16; |
| // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts. |
| alignas(16) static constexpr uint16_t kLower2Bytes[16] = { |
| 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080}; |
| const auto lo2 = Load(du16, kLower2Bytes); |
| return VFromD<D>{TableLookupBytes(BitCast(du16, clamped), lo2).raw}; |
| #endif |
| #else |
| return VFromD<D>{_mm_packus_epi32(v.raw, v.raw)}; |
| #endif |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> DemoteTo(D du16, VFromD<Rebind<uint32_t, D>> v) { |
| const DFromV<decltype(v)> du32; |
| const RebindToSigned<decltype(du32)> di32; |
| #if HWY_TARGET >= HWY_SSSE3 |
| const auto too_big = |
| VecFromMask(di32, Gt(BitCast(di32, ShiftRight<16>(v)), Zero(di32))); |
| const auto clamped = Or(BitCast(di32, v), too_big); |
| #if HWY_TARGET == HWY_SSE2 |
| const RebindToSigned<decltype(du16)> di16; |
| return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped)))); |
| #else |
| (void)du16; |
| const Repartition<uint16_t, decltype(di32)> du16_full; |
| // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts. |
| alignas(16) static constexpr uint16_t kLower2Bytes[16] = { |
| 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080}; |
| const auto lo2 = Load(du16_full, kLower2Bytes); |
| return VFromD<D>{TableLookupBytes(BitCast(du16_full, clamped), lo2).raw}; |
| #endif |
| #else |
| return DemoteTo(du16, BitCast(di32, Min(v, Set(du32, 0x7FFFFFFF)))); |
| #endif |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { |
| const __m128i i16 = _mm_packs_epi32(v.raw, v.raw); |
| return VFromD<D>{_mm_packus_epi16(i16, i16)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { |
| return VFromD<D>{_mm_packus_epi16(v.raw, v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { |
| const __m128i i16 = _mm_packs_epi32(v.raw, v.raw); |
| return VFromD<D>{_mm_packs_epi16(i16, i16)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { |
| return VFromD<D>{_mm_packs_epi16(v.raw, v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint32_t, D>> v) { |
| #if HWY_TARGET <= HWY_AVX3 |
| // NOTE: _mm_cvtusepi32_epi8 is a saturated conversion of 32-bit unsigned |
| // integers to 8-bit unsigned integers |
| (void)du8; |
| return VFromD<D>{_mm_cvtusepi32_epi8(v.raw)}; |
| #else |
| const DFromV<decltype(v)> du32; |
| const RebindToSigned<decltype(du32)> di32; |
| const auto max_i32 = Set(du32, 0x7FFFFFFFu); |
| |
| #if HWY_TARGET >= HWY_SSSE3 |
| // On SSE2/SSSE3, clamp u32 values to an i32 using the u8 Min operation |
| // as SSE2/SSSE3 can do an u8 Min operation in a single instruction. |
| |
| // The u8 Min operation below leaves the lower 24 bits of each 32-bit |
| // lane unchanged. |
| |
| // The u8 Min operation below will leave any values that are less than or |
| // equal to 0x7FFFFFFF unchanged. |
| |
| // For values that are greater than or equal to 0x80000000, the u8 Min |
| // operation below will force the upper 8 bits to 0x7F and leave the lower |
| // 24 bits unchanged. |
| |
| // An u8 Min operation is okay here as any clamped value that is greater than |
| // or equal to 0x80000000 will be clamped to a value between 0x7F000000 and |
| // 0x7FFFFFFF through the u8 Min operation below, which will then be converted |
| // to 0xFF through the i32->u8 demotion. |
| const Repartition<uint8_t, decltype(du32)> du32_as_du8; |
| const auto clamped = BitCast( |
| di32, Min(BitCast(du32_as_du8, v), BitCast(du32_as_du8, max_i32))); |
| #else |
| const auto clamped = BitCast(di32, Min(v, max_i32)); |
| #endif |
| |
| return DemoteTo(du8, clamped); |
| #endif |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint16_t, D>> v) { |
| const DFromV<decltype(v)> du16; |
| const RebindToSigned<decltype(du16)> di16; |
| const auto max_i16 = Set(du16, 0x7FFF); |
| |
| #if HWY_TARGET >= HWY_SSSE3 |
| // On SSE2/SSSE3, clamp u16 values to an i16 using the u8 Min operation |
| // as SSE2/SSSE3 can do an u8 Min operation in a single instruction. |
| |
| // The u8 Min operation below leaves the lower 8 bits of each 16-bit |
| // lane unchanged. |
| |
| // The u8 Min operation below will leave any values that are less than or |
| // equal to 0x7FFF unchanged. |
| |
| // For values that are greater than or equal to 0x8000, the u8 Min |
| // operation below will force the upper 8 bits to 0x7F and leave the lower |
| // 8 bits unchanged. |
| |
| // An u8 Min operation is okay here as any clamped value that is greater than |
| // or equal to 0x8000 will be clamped to a value between 0x7F00 and |
| // 0x7FFF through the u8 Min operation below, which will then be converted |
| // to 0xFF through the i16->u8 demotion. |
| const Repartition<uint8_t, decltype(du16)> du16_as_du8; |
| const auto clamped = BitCast( |
| di16, Min(BitCast(du16_as_du8, v), BitCast(du16_as_du8, max_i16))); |
| #else |
| const auto clamped = BitCast(di16, Min(v, max_i16)); |
| #endif |
| |
| return DemoteTo(du8, clamped); |
| } |
| |
| #if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C) |
| |
| // HWY_NATIVE_F16C was already toggled above. |
| |
| // Work around MSVC warning for _mm_cvtps_ph (8 is actually a valid immediate). |
| // clang-cl requires a non-empty string, so we 'ignore' the irrelevant -Wmain. |
| HWY_DIAGNOSTICS(push) |
| HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wmain") |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)> |
| HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) { |
| const RebindToUnsigned<decltype(df16)> du16; |
| return BitCast( |
| df16, VFromD<decltype(du16)>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}); |
| } |
| |
| HWY_DIAGNOSTICS(pop) |
| |
| #endif // F16C |
| |
| #if HWY_HAVE_FLOAT16 |
| |
| #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16 |
| #undef HWY_NATIVE_DEMOTE_F64_TO_F16 |
| #else |
| #define HWY_NATIVE_DEMOTE_F64_TO_F16 |
| #endif |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /*df16*/, VFromD<Rebind<double, D>> v) { |
| return VFromD<D>{_mm_cvtpd_ph(v.raw)}; |
| } |
| |
| #endif // HWY_HAVE_FLOAT16 |
| |
// The _mm*_cvtneps_pbh and _mm*_cvtne2ps_pbh intrinsics require GCC 9 or
// later, or Clang 10 or later.

// GCC or Clang is also needed to bit cast the __m128bh, __m256bh, or __m512bh
// vector returned by these intrinsics to a __m128i, __m256i, or __m512i,
// because there are currently no intrinsics (as of GCC 13 and Clang 17) for
// that bit cast.
| |
| #if HWY_AVX3_HAVE_F32_TO_BF16C |
| #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16 |
| #undef HWY_NATIVE_DEMOTE_F32_TO_BF16 |
| #else |
| #define HWY_NATIVE_DEMOTE_F32_TO_BF16 |
| #endif |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) { |
| #if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000 |
| // Inline assembly workaround for LLVM codegen bug |
| __m128i raw_result; |
| __asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw)); |
| return VFromD<D>{raw_result}; |
| #else |
| // The _mm_cvtneps_pbh intrinsic returns a __m128bh vector that needs to be |
| // bit casted to a __m128i vector |
| return VFromD<D>{detail::BitCastToInteger(_mm_cvtneps_pbh(v.raw))}; |
| #endif |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D /*dbf16*/, Vec128<float> a, |
| Vec128<float> b) { |
| #if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000 |
| // Inline assembly workaround for LLVM codegen bug |
| __m128i raw_result; |
| __asm__("vcvtne2ps2bf16 %2, %1, %0" |
| : "=v"(raw_result) |
| : "v"(b.raw), "v"(a.raw)); |
| return VFromD<D>{raw_result}; |
| #else |
| // The _mm_cvtne2ps_pbh intrinsic returns a __m128bh vector that needs to be |
| // bit casted to a __m128i vector |
| return VFromD<D>{detail::BitCastToInteger(_mm_cvtne2ps_pbh(b.raw, a.raw))}; |
| #endif |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<float> a, |
| Vec64<float> b) { |
| return VFromD<D>{_mm_shuffle_epi32( |
| detail::BitCastToInteger(_mm_cvtne2ps_pbh(b.raw, a.raw)), |
| _MM_SHUFFLE(2, 0, 2, 0))}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dbf16, Vec32<float> a, Vec32<float> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dbf16, Combine(dt, b, a)); |
| } |
| #endif // HWY_AVX3_HAVE_F32_TO_BF16C |
| |
| // Specializations for partial vectors because packs_epi32 sets lanes above 2*N. |
| template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, Vec32<int32_t> a, Vec32<int32_t> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<int32_t> a, |
| Vec64<int32_t> b) { |
| return VFromD<D>{_mm_shuffle_epi32(_mm_packs_epi32(a.raw, b.raw), |
| _MM_SHUFFLE(2, 0, 2, 0))}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int32_t> a, |
| Vec128<int32_t> b) { |
| return VFromD<D>{_mm_packs_epi32(a.raw, b.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, Vec32<int32_t> a, Vec32<int32_t> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, Vec64<int32_t> a, Vec64<int32_t> b) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| #else |
| (void)dn; |
| return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi32(a.raw, b.raw), |
| _MM_SHUFFLE(2, 0, 2, 0))}; |
| #endif |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<int32_t> a, Vec128<int32_t> b) { |
| #if HWY_TARGET >= HWY_SSSE3 |
| const Half<decltype(dn)> dnh; |
| const auto u16_a = DemoteTo(dnh, a); |
| const auto u16_b = DemoteTo(dnh, b); |
| return Combine(dn, u16_b, u16_a); |
| #else |
| (void)dn; |
| return VFromD<D>{_mm_packus_epi32(a.raw, b.raw)}; |
| #endif |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint32_t> a, |
| Vec128<uint32_t> b) { |
| const DFromV<decltype(a)> du32; |
| const RebindToSigned<decltype(du32)> di32; |
| const auto max_i32 = Set(du32, 0x7FFFFFFFu); |
| |
| #if HWY_TARGET >= HWY_SSSE3 |
| const Repartition<uint8_t, decltype(du32)> du32_as_du8; |
| // On SSE2/SSSE3, clamp a and b using u8 Min operation |
| const auto clamped_a = BitCast( |
| di32, Min(BitCast(du32_as_du8, a), BitCast(du32_as_du8, max_i32))); |
| const auto clamped_b = BitCast( |
| di32, Min(BitCast(du32_as_du8, b), BitCast(du32_as_du8, max_i32))); |
| #else |
| const auto clamped_a = BitCast(di32, Min(a, max_i32)); |
| const auto clamped_b = BitCast(di32, Min(b, max_i32)); |
| #endif |
| |
| return ReorderDemote2To(dn, clamped_a, clamped_b); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint32_t, D>> a, |
| VFromD<Repartition<uint32_t, D>> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| |
// Specializations for partial vectors because packs_epi16 sets lanes above 2*N.
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a, |
| VFromD<Repartition<int16_t, D>> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<int16_t> a, |
| Vec64<int16_t> b) { |
| return VFromD<D>{_mm_shuffle_epi32(_mm_packs_epi16(a.raw, b.raw), |
| _MM_SHUFFLE(2, 0, 2, 0))}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int16_t> a, |
| Vec128<int16_t> b) { |
| return VFromD<D>{_mm_packs_epi16(a.raw, b.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a, |
| VFromD<Repartition<int16_t, D>> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<int16_t> a, |
| Vec64<int16_t> b) { |
| return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(a.raw, b.raw), |
| _MM_SHUFFLE(2, 0, 2, 0))}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int16_t> a, |
| Vec128<int16_t> b) { |
| return VFromD<D>{_mm_packus_epi16(a.raw, b.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint16_t> a, |
| Vec128<uint16_t> b) { |
| const DFromV<decltype(a)> du16; |
| const RebindToSigned<decltype(du16)> di16; |
| const auto max_i16 = Set(du16, 0x7FFFu); |
| |
| #if HWY_TARGET >= HWY_SSSE3 |
| const Repartition<uint8_t, decltype(du16)> du16_as_du8; |
| // On SSE2/SSSE3, clamp a and b using u8 Min operation |
| const auto clamped_a = BitCast( |
| di16, Min(BitCast(du16_as_du8, a), BitCast(du16_as_du8, max_i16))); |
| const auto clamped_b = BitCast( |
| di16, Min(BitCast(du16_as_du8, b), BitCast(du16_as_du8, max_i16))); |
| #else |
| const auto clamped_a = BitCast(di16, Min(a, max_i16)); |
| const auto clamped_b = BitCast(di16, Min(b, max_i16)); |
| #endif |
| |
| return ReorderDemote2To(dn, clamped_a, clamped_b); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint16_t, D>> a, |
| VFromD<Repartition<uint16_t, D>> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| |
| template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), |
| HWY_IF_V_SIZE_LE_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), |
| HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2), |
| HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)> |
| HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) { |
| return ReorderDemote2To(d, a, b); |
| } |
| |
| #if HWY_AVX3_HAVE_F32_TO_BF16C |
| // F32 to BF16 OrderedDemote2To is generic for all vector lengths on targets |
| // that support AVX512BF16 |
| template <class D, HWY_IF_BF16_D(D)> |
| HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a, |
| VFromD<Repartition<float, D>> b) { |
| return ReorderDemote2To(dbf16, a, b); |
| } |
| #endif // HWY_AVX3_HAVE_F32_TO_BF16C |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) { |
| return VFromD<D>{_mm_cvtpd_ps(v.raw)}; |
| } |
| |
| namespace detail { |
| |
| // Generic for all vector lengths. |
| template <class D> |
| HWY_INLINE VFromD<D> ClampF64ToI32Max(D d, VFromD<D> v) { |
  // The max can be exactly represented in binary64, so clamping beforehand
  // prevents the x86 conversion from raising an exception and returning
  // 0x80..00 (the i32 minimum).
  return Min(v, Set(d, 2147483647.0));
| } |
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| template <class TTo, class TF> |
| static constexpr HWY_INLINE TTo |
| X86ConvertScalarFromFloat(hwy::FloatTag /* to_type_tag */, TF from_val) { |
| return ConvertScalarTo<TTo>(from_val); |
| } |
| |
| template <class TTo, class TF> |
| static HWY_BITCASTSCALAR_CONSTEXPR HWY_INLINE TTo |
| X86ConvertScalarFromFloat(hwy::SpecialTag /* to_type_tag */, TF from_val) { |
| return ConvertScalarTo<TTo>(from_val); |
| } |
| |
| template <class TTo, class TF> |
| static HWY_BITCASTSCALAR_CXX14_CONSTEXPR HWY_INLINE TTo |
| X86ConvertScalarFromFloat(hwy::SignedTag /* to_type_tag */, TF from_val) { |
| #if HWY_HAVE_SCALAR_F16_TYPE && HWY_HAVE_SCALAR_F16_OPERATORS |
| using TFArith = If<hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>(), float, |
| RemoveCvRef<TF>>; |
| #else |
| using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>; |
| #endif |
| |
| const TFArith from_val_in_arith_type = ConvertScalarTo<TFArith>(from_val); |
| constexpr TTo kMinResultVal = LimitsMin<TTo>(); |
| HWY_BITCASTSCALAR_CONSTEXPR const TFArith kMinOutOfRangePosVal = |
| ScalarAbs(ConvertScalarTo<TFArith>(kMinResultVal)); |
| |
| return (ScalarAbs(from_val_in_arith_type) < kMinOutOfRangePosVal) |
| ? ConvertScalarTo<TTo>(from_val_in_arith_type) |
| : kMinResultVal; |
| } |
| |
| template <class TTo, class TF> |
| static HWY_CXX14_CONSTEXPR HWY_INLINE TTo |
| X86ConvertScalarFromFloat(hwy::UnsignedTag /* to_type_tag */, TF from_val) { |
| #if HWY_HAVE_SCALAR_F16_TYPE && HWY_HAVE_SCALAR_F16_OPERATORS |
| using TFArith = If<hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>(), float, |
| RemoveCvRef<TF>>; |
| #else |
| using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>; |
| #endif |
| |
| const TFArith from_val_in_arith_type = ConvertScalarTo<TFArith>(from_val); |
| constexpr TTo kTToMsb = static_cast<TTo>(TTo{1} << (sizeof(TTo) * 8 - 1)); |
| constexpr const TFArith kNegOne = ConvertScalarTo<TFArith>(-1.0); |
| constexpr const TFArith kMinOutOfRangePosVal = |
| ConvertScalarTo<TFArith>(static_cast<double>(kTToMsb) * 2.0); |
| |
| return (from_val_in_arith_type > kNegOne && |
| from_val_in_arith_type < kMinOutOfRangePosVal) |
| ? ConvertScalarTo<TTo>(from_val_in_arith_type) |
| : LimitsMax<TTo>(); |
| } |
| |
| template <class TTo, class TF> |
| static constexpr HWY_INLINE HWY_MAYBE_UNUSED TTo |
| X86ConvertScalarFromFloat(TF from_val) { |
| return X86ConvertScalarFromFloat<TTo>(hwy::TypeTag<RemoveCvRef<TTo>>(), |
| from_val); |
| } |
| #endif // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| |
| } // namespace detail |
| |
| #ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO |
| #undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO |
| #else |
| #define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO |
| #endif |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)> |
| HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) { |
| #if HWY_COMPILER_GCC_ACTUAL |
| // Workaround for undefined behavior in _mm_cvttpd_epi32 with GCC if any |
| // values of v[i] are not within the range of an int32_t |
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) { |
| typedef double GccF64RawVectType __attribute__((__vector_size__(16))); |
| const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw); |
| return Dup128VecFromValues( |
| D(), detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]), |
| detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]), int32_t{0}, |
| int32_t{0}); |
| } |
| #endif |
| |
| __m128i raw_result; |
| __asm__("%vcvttpd2dq {%1, %0|%0, %1}" |
| : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) |
| : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) |
| :); |
| return VFromD<D>{raw_result}; |
| #else // !HWY_COMPILER_GCC_ACTUAL |
| return VFromD<D>{_mm_cvttpd_epi32(v.raw)}; |
| #endif |
| } |
| |
| // F64 to I32 DemoteTo is generic for all vector lengths |
| template <class D, HWY_IF_I32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D di32, VFromD<Rebind<double, D>> v) { |
| const Rebind<double, decltype(di32)> df64; |
| const VFromD<decltype(df64)> clamped = detail::ClampF64ToI32Max(df64, v); |
| return DemoteInRangeTo(di32, clamped); |
| } |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> |
| HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) { |
| #if HWY_COMPILER_GCC_ACTUAL |
| // Workaround for undefined behavior in _mm_cvttpd_epu32 with GCC if any |
| // values of v[i] are not within the range of an uint32_t |
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) { |
| typedef double GccF64RawVectType __attribute__((__vector_size__(16))); |
| const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw); |
| return Dup128VecFromValues( |
| D(), detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0]), |
| detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1]), uint32_t{0}, |
| uint32_t{0}); |
| } |
| #endif |
| |
| __m128i raw_result; |
| __asm__("vcvttpd2udq {%1, %0|%0, %1}" |
| : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) |
| : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) |
| :); |
| return VFromD<D>{raw_result}; |
| #else |
| return VFromD<D>{_mm_cvttpd_epu32(v.raw)}; |
| #endif |
| } |
| |
| // F64->U32 DemoteTo is generic for all vector lengths |
| template <class D, HWY_IF_U32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) { |
| return DemoteInRangeTo(D(), ZeroIfNegative(v)); |
| } |
| #else // HWY_TARGET > HWY_AVX3 |
| |
| // F64 to U32 DemoteInRangeTo is generic for all vector lengths on |
| // SSE2/SSSE3/SSE4/AVX2 |
| template <class D, HWY_IF_U32_D(D)> |
| HWY_API VFromD<D> DemoteInRangeTo(D du32, VFromD<Rebind<double, D>> v) { |
| const RebindToSigned<decltype(du32)> di32; |
| const Rebind<double, decltype(du32)> df64; |
| const RebindToUnsigned<decltype(df64)> du64; |
| |
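  // There is no f64->u32 conversion instruction before AVX3. Subtract 2^31
  // from lanes >= 2^31 so the demotion to i32 is in range, then set bit 31 of
  // the result for exactly those lanes.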
| const auto k2_31 = Set(df64, 2147483648.0); |
| const auto v_is_ge_k2_31 = (v >= k2_31); |
| const auto clamped_lo31_f64 = v - IfThenElseZero(v_is_ge_k2_31, k2_31); |
| const auto clamped_lo31_u32 = |
| BitCast(du32, DemoteInRangeTo(di32, clamped_lo31_f64)); |
| const auto clamped_u32_msb = ShiftLeft<31>( |
| TruncateTo(du32, BitCast(du64, VecFromMask(df64, v_is_ge_k2_31)))); |
| return Or(clamped_lo31_u32, clamped_u32_msb); |
| } |
| |
| // F64 to U32 DemoteTo is generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 |
| template <class D, HWY_IF_U32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) { |
| const Rebind<double, decltype(du32)> df64; |
| const auto clamped = Min(ZeroIfNegative(v), Set(df64, 4294967295.0)); |
| return DemoteInRangeTo(du32, clamped); |
| } |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { |
| return VFromD<D>{_mm_cvtepi64_ps(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) { |
| return VFromD<D>{_mm_cvtepu64_ps(v.raw)}; |
| } |
| #else |
| // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 |
| template <class D, HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<int64_t, D>> v) { |
| const Rebind<double, decltype(df32)> df64; |
| const RebindToUnsigned<decltype(df64)> du64; |
| const RebindToSigned<decltype(df32)> di32; |
| const RebindToUnsigned<decltype(df32)> du32; |
| |
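  // Sketch of the approach: the constant 2^64 + 2^63 converts the upper 52
  // bits of v (including the sign) exactly to f64 via exponent manipulation;
  // the low 12 bits are converted separately. The two parts are added with a
  // Fast2Sum-style error term, and inexact sums are nudged to an odd mantissa
  // (round-to-odd) so the final f64->f32 demotion rounds as if converting the
  // i64 directly.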
| const auto k2p64_63 = Set(df64, 27670116110564327424.0); |
| const auto f64_hi52 = |
| Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63; |
| const auto f64_lo12 = |
| PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)), |
| Set(du32, uint32_t{0x00000FFF})))); |
| |
| const auto f64_sum = f64_hi52 + f64_lo12; |
| const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12; |
| |
| const auto f64_sum_is_inexact = |
| ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64)))); |
| const auto f64_bits_decrement = |
| And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))), |
| f64_sum_is_inexact); |
| |
| const auto adj_f64_val = BitCast( |
| df64, |
| Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact)); |
| |
| return DemoteTo(df32, adj_f64_val); |
| } |
| |
| // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 |
| template <class D, HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) { |
| const Rebind<double, decltype(df32)> df64; |
| const RebindToUnsigned<decltype(df64)> du64; |
| const RebindToSigned<decltype(df32)> di32; |
| const RebindToUnsigned<decltype(df32)> du32; |
| |
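  // Same approach as the i64 overload above (split into high 52 and low 12
  // bits, add with round-to-odd), minus the sign handling: Or with 2^64
  // suffices because u64 inputs are non-negative.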
| const auto k2p64 = Set(df64, 18446744073709551616.0); |
| const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64; |
| const auto f64_lo12 = |
| PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)), |
| Set(du32, uint32_t{0x00000FFF})))); |
| |
| const auto f64_sum = f64_hi52 + f64_lo12; |
| const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12; |
| const auto f64_sum_is_inexact = |
| ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64)))); |
| |
| const auto adj_f64_val = BitCast( |
| df64, |
| Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)), |
| f64_sum_is_inexact)); |
| |
| return DemoteTo(df32, adj_f64_val); |
| } |
| #endif |
| |
| // For already range-limited input [0, 255]. |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) { |
| #if HWY_TARGET == HWY_SSE2 |
| const RebindToSigned<DFromV<decltype(v)>> di32; |
| const Rebind<uint8_t, decltype(di32)> du8; |
| return DemoteTo(du8, BitCast(di32, v)); |
| #else |
| const DFromV<decltype(v)> d32; |
| const Repartition<uint8_t, decltype(d32)> d8; |
| alignas(16) static constexpr uint32_t k8From32[4] = { |
| 0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u}; |
| // Also replicate bytes into all 32 bit lanes for safety. |
| const auto quad = TableLookupBytes(v, Load(d32, k8From32)); |
| return LowerHalf(LowerHalf(BitCast(d8, quad))); |
| #endif |
| } |
| |
| // ------------------------------ F32->UI64 PromoteTo |
| #ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO |
| #undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO |
| #else |
| #define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO |
| #endif |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> |
| HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) { |
| #if HWY_COMPILER_GCC_ACTUAL |
| // Workaround for undefined behavior with GCC if any values of v[i] are not |
| // within the range of an int64_t |
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) { |
| typedef float GccF32RawVectType __attribute__((__vector_size__(16))); |
| const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw); |
| return Dup128VecFromValues( |
| D(), detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]), |
| detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1])); |
| } |
| #endif |
| |
| __m128i raw_result; |
| __asm__("vcvttps2qq {%1, %0|%0, %1}" |
| : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) |
| : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) |
| :); |
| return VFromD<D>{raw_result}; |
| #else |
| return VFromD<D>{_mm_cvttps_epi64(v.raw)}; |
| #endif |
| } |
| |
| // Generic for all vector lengths. |
| template <class D, HWY_IF_I64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) { |
| const Rebind<float, decltype(di64)> df32; |
| const RebindToFloat<decltype(di64)> df64; |
  // We now avoid GCC UB in PromoteInRangeTo via assembly, see #2189 and
  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115115. Previously we fixed
  // up the result afterwards using three instructions. Now we instead check
  // whether v >= 2^63 and, if so, replace the output with 2^63-1, which is
  // likely more efficient. Note that the largest representable f32 below 2^63
  // still fits in i64, so the remaining lanes are in range.
| const MFromD<D> overflow = RebindMask( |
| di64, PromoteMaskTo(df64, df32, Ge(v, Set(df32, 9.223372e18f)))); |
| return IfThenElse(overflow, Set(di64, LimitsMax<int64_t>()), |
| PromoteInRangeTo(di64, v)); |
| } |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) { |
| return PromoteInRangeTo(D(), ZeroIfNegative(v)); |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> |
| HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) { |
| #if HWY_COMPILER_GCC_ACTUAL |
| // Workaround for undefined behavior with GCC if any values of v[i] are not |
| // within the range of an uint64_t |
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) { |
| typedef float GccF32RawVectType __attribute__((__vector_size__(16))); |
| const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw); |
| return Dup128VecFromValues( |
| D(), detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0]), |
| detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])); |
| } |
| #endif |
| |
| __m128i raw_result; |
| __asm__("vcvttps2uqq {%1, %0|%0, %1}" |
| : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) |
| : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) |
| :); |
| return VFromD<D>{raw_result}; |
| #else |
| return VFromD<D>{_mm_cvttps_epu64(v.raw)}; |
| #endif |
| } |
| #else // AVX2 or below |
| |
| // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 |
| template <class D, HWY_IF_I64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) { |
| const Rebind<int32_t, decltype(di64)> di32; |
| const RebindToFloat<decltype(di32)> df32; |
| const RebindToUnsigned<decltype(di32)> du32; |
| const Repartition<uint8_t, decltype(du32)> du32_as_du8; |
| |
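  // Sketch: scale v down by 2^adj, where adj = clamp(exponent - 30, 0, 32), so
  // that the value fits in i32; truncate-convert, then shift the promoted i64
  // back up by adj. lo64_or_mask fills the low 32 bits with ones for lanes
  // whose scaled conversion saturated to LimitsMax<int32_t>() (v >= 2^63), so
  // those lanes end up as LimitsMax<int64_t>().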
| const auto exponent_adj = BitCast( |
| du32, |
| Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))), |
| BitCast(du32_as_du8, Set(du32, uint32_t{157}))), |
| BitCast(du32_as_du8, Set(du32, uint32_t{32})))); |
| const auto adj_v = |
| BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj)); |
| |
| const auto f32_to_i32_result = ConvertTo(di32, adj_v); |
| const auto lo64_or_mask = PromoteTo( |
| di64, |
| BitCast(du32, VecFromMask(di32, Eq(f32_to_i32_result, |
| Set(di32, LimitsMax<int32_t>()))))); |
| |
| return Or(PromoteTo(di64, BitCast(di32, f32_to_i32_result)) |
| << PromoteTo(di64, exponent_adj), |
| lo64_or_mask); |
| } |
| |
| // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 |
| template <class D, HWY_IF_UI64_D(D)> |
| HWY_API VFromD<D> PromoteInRangeTo(D d64, VFromD<Rebind<float, D>> v) { |
| const Rebind<MakeNarrow<TFromD<D>>, decltype(d64)> d32; |
| const RebindToSigned<decltype(d32)> di32; |
| const RebindToFloat<decltype(d32)> df32; |
| const RebindToUnsigned<decltype(d32)> du32; |
| const Repartition<uint8_t, decltype(d32)> du32_as_du8; |
| |
| const auto exponent_adj = BitCast( |
| du32, |
| SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))), |
| BitCast(du32_as_du8, Set(du32, uint32_t{0xFFFFFF9Du})))); |
| const auto adj_v = |
| BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj)); |
| |
| const auto f32_to_i32_result = ConvertInRangeTo(di32, adj_v); |
| return PromoteTo(d64, BitCast(d32, f32_to_i32_result)) |
| << PromoteTo(d64, exponent_adj); |
| } |
| |
| namespace detail { |
| |
| template <class DU64, HWY_IF_V_SIZE_LE_D(DU64, 16)> |
| HWY_INLINE VFromD<DU64> PromoteF32ToU64OverflowMaskToU64( |
| DU64 du64, VFromD<Rebind<int32_t, DU64>> i32_overflow_mask) { |
| const Rebind<int32_t, decltype(du64)> di32; |
| const Twice<decltype(di32)> dt_i32; |
| |
| const auto vt_i32_overflow_mask = ResizeBitCast(dt_i32, i32_overflow_mask); |
| return BitCast(du64, |
| InterleaveLower(vt_i32_overflow_mask, vt_i32_overflow_mask)); |
| } |
| |
| template <class DU64, HWY_IF_V_SIZE_GT_D(DU64, 16)> |
| HWY_INLINE VFromD<DU64> PromoteF32ToU64OverflowMaskToU64( |
| DU64 du64, VFromD<Rebind<int32_t, DU64>> i32_overflow_mask) { |
| const RebindToSigned<decltype(du64)> di64; |
| return BitCast(du64, PromoteTo(di64, i32_overflow_mask)); |
| } |
| |
| } // namespace detail |
| |
| // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) { |
| const Rebind<int32_t, decltype(du64)> di32; |
| const RebindToFloat<decltype(di32)> df32; |
| const RebindToUnsigned<decltype(di32)> du32; |
| const Repartition<uint8_t, decltype(du32)> du32_as_du8; |
| |
| const auto non_neg_v = ZeroIfNegative(v); |
| |
| const auto exponent_adj = BitCast( |
| du32, Min(SaturatedSub(BitCast(du32_as_du8, |
| ShiftRight<23>(BitCast(du32, non_neg_v))), |
| BitCast(du32_as_du8, Set(du32, uint32_t{157}))), |
| BitCast(du32_as_du8, Set(du32, uint32_t{33})))); |
| |
| const auto adj_v = |
| BitCast(df32, BitCast(du32, non_neg_v) - ShiftLeft<23>(exponent_adj)); |
| const auto f32_to_i32_result = ConvertInRangeTo(di32, adj_v); |
| |
| const auto i32_overflow_mask = BroadcastSignBit(f32_to_i32_result); |
| const auto overflow_result = |
| detail::PromoteF32ToU64OverflowMaskToU64(du64, i32_overflow_mask); |
| |
| return Or(PromoteTo(du64, BitCast(du32, f32_to_i32_result)) |
| << PromoteTo(du64, exponent_adj), |
| overflow_result); |
| } |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| // ------------------------------ MulFixedPoint15 |
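// MulFixedPoint15 returns the rounded Q1.15 product, roughly
// (a * b + 0x4000) >> 15 on the widened product. Worked example: a = b = 16384
// (0.5 in Q1.15) gives (268435456 + 0x4000) >> 15 = 8192, i.e. 0.25 in Q1.15.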
| |
| #if HWY_TARGET == HWY_SSE2 |
| HWY_API Vec128<int16_t> MulFixedPoint15(const Vec128<int16_t> a, |
| const Vec128<int16_t> b) { |
| const DFromV<decltype(a)> d; |
| const Repartition<int32_t, decltype(d)> di32; |
| |
| auto lo_product = a * b; |
| auto hi_product = MulHigh(a, b); |
| |
| const VFromD<decltype(di32)> i32_product_lo{ |
| _mm_unpacklo_epi16(lo_product.raw, hi_product.raw)}; |
| const VFromD<decltype(di32)> i32_product_hi{ |
| _mm_unpackhi_epi16(lo_product.raw, hi_product.raw)}; |
| |
| const auto round_up_incr = Set(di32, 0x4000); |
| return ReorderDemote2To(d, ShiftRight<15>(i32_product_lo + round_up_incr), |
| ShiftRight<15>(i32_product_hi + round_up_incr)); |
| } |
| |
| template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)> |
| HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| const DFromV<decltype(a)> d; |
| const Rebind<int32_t, decltype(d)> di32; |
| |
| const auto lo_product = a * b; |
| const auto hi_product = MulHigh(a, b); |
| const VFromD<decltype(di32)> i32_product{ |
| _mm_unpacklo_epi16(lo_product.raw, hi_product.raw)}; |
| |
| return DemoteTo(d, ShiftRight<15>(i32_product + Set(di32, 0x4000))); |
| } |
| #else |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{_mm_mulhrs_epi16(a.raw, b.raw)}; |
| } |
| #endif |
| |
| // ------------------------------ Truncations |
| |
| template <typename From, class DTo, HWY_IF_LANES_D(DTo, 1)> |
| HWY_API VFromD<DTo> TruncateTo(DTo /* tag */, Vec128<From, 1> v) { |
| // BitCast requires the same size; DTo might be u8x1 and v u16x1. |
| const Repartition<TFromD<DTo>, DFromV<decltype(v)>> dto; |
| return VFromD<DTo>{BitCast(dto, v).raw}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> TruncateTo(D d, Vec128<uint64_t> v) { |
| #if HWY_TARGET == HWY_SSE2 |
| const Vec128<uint8_t, 1> lo{v.raw}; |
| const Vec128<uint8_t, 1> hi{_mm_unpackhi_epi64(v.raw, v.raw)}; |
| return Combine(d, hi, lo); |
| #else |
| const Repartition<uint8_t, DFromV<decltype(v)>> d8; |
| (void)d; |
| alignas(16) static constexpr uint8_t kIdx[16] = {0, 8, 0, 8, 0, 8, 0, 8, |
| 0, 8, 0, 8, 0, 8, 0, 8}; |
| const Vec128<uint8_t> v8 = TableLookupBytes(v, Load(d8, kIdx)); |
| return LowerHalf(LowerHalf(LowerHalf(v8))); |
| #endif |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> TruncateTo(D d, Vec128<uint64_t> v) { |
| #if HWY_TARGET == HWY_SSE2 |
| const Vec128<uint16_t, 1> lo{v.raw}; |
| const Vec128<uint16_t, 1> hi{_mm_unpackhi_epi64(v.raw, v.raw)}; |
| return Combine(d, hi, lo); |
| #else |
| (void)d; |
| const Repartition<uint16_t, DFromV<decltype(v)>> d16; |
| alignas(16) static constexpr uint16_t kIdx[8] = { |
| 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u}; |
| const Vec128<uint16_t> v16 = TableLookupBytes(v, Load(d16, kIdx)); |
| return LowerHalf(LowerHalf(v16)); |
| #endif |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)> |
| HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t> v) { |
| return VFromD<D>{_mm_shuffle_epi32(v.raw, 0x88)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { |
| const DFromV<decltype(v)> du32; |
| #if HWY_TARGET == HWY_SSE2 |
| const RebindToSigned<decltype(du32)> di32; |
| const Rebind<uint8_t, decltype(di32)> du8; |
| return DemoteTo(du8, BitCast(di32, ShiftRight<24>(ShiftLeft<24>(v)))); |
| #else |
| const Repartition<uint8_t, decltype(du32)> d; |
| alignas(16) static constexpr uint8_t kIdx[16] = { |
| 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu, |
| 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu}; |
| return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d, kIdx)))); |
| #endif |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { |
| const DFromV<decltype(v)> du32; |
| #if HWY_TARGET == HWY_SSE2 |
| const RebindToSigned<decltype(du32)> di32; |
| const Rebind<uint16_t, decltype(di32)> du16; |
| const RebindToSigned<decltype(du16)> di16; |
| return BitCast( |
| du16, DemoteTo(di16, ShiftRight<16>(BitCast(di32, ShiftLeft<16>(v))))); |
| #else |
| const Repartition<uint16_t, decltype(du32)> d; |
| return LowerHalf(ConcatEven(d, BitCast(d, v), BitCast(d, v))); |
| #endif |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { |
| const DFromV<decltype(v)> du16; |
| #if HWY_TARGET == HWY_SSE2 |
| const RebindToSigned<decltype(du16)> di16; |
| const Rebind<uint8_t, decltype(di16)> du8; |
| const RebindToSigned<decltype(du8)> di8; |
| return BitCast(du8, |
| DemoteTo(di8, ShiftRight<8>(BitCast(di16, ShiftLeft<8>(v))))); |
| #else |
| const Repartition<uint8_t, decltype(du16)> d; |
| return LowerHalf(ConcatEven(d, BitCast(d, v), BitCast(d, v))); |
| #endif |
| } |
| |
| // ------------------------------ Demotions to/from i64 |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { |
| return VFromD<D>{_mm_cvtsepi64_epi32(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I16_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { |
| return VFromD<D>{_mm_cvtsepi64_epi16(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_I8_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { |
| return VFromD<D>{_mm_cvtsepi64_epi8(v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { |
| const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw; |
| return VFromD<D>{_mm_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { |
| const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw; |
| return VFromD<D>{_mm_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { |
| const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw; |
| return VFromD<D>{_mm_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) { |
| return VFromD<D>{_mm_cvtusepi64_epi32(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) { |
| return VFromD<D>{_mm_cvtusepi64_epi16(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) { |
| return VFromD<D>{_mm_cvtusepi64_epi8(v.raw)}; |
| } |
| #else // AVX2 or below |
| |
// Disable the default unsigned-to-signed DemoteTo/ReorderDemote2To
// implementations in generic_ops-inl.h for U64->I8/I16/I32 demotions on
// SSE2/SSSE3/SSE4/AVX2, because those demotions are implemented below in
// x86_128-inl.h.

// The default unsigned-to-signed implementations in generic_ops-inl.h are
// still used for U32->I8/I16 and U16->I8 demotions on SSE2/SSSE3/SSE4/AVX2.
| |
| #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V |
| #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) HWY_IF_NOT_T_SIZE_V(V, 8) |
| |
| namespace detail { |
| template <class D, HWY_IF_UNSIGNED_D(D)> |
| HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult( |
| D /*dn*/, VFromD<Rebind<uint64_t, D>> v) { |
| return v; |
| } |
| |
| template <class D, HWY_IF_SIGNED_D(D)> |
| HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult( |
| D /*dn*/, VFromD<Rebind<uint64_t, D>> v) { |
| const DFromV<decltype(v)> du64; |
| return And(v, |
| Set(du64, static_cast<uint64_t>(hwy::HighestValue<TFromD<D>>()))); |
| } |
| |
| template <class D> |
| HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64Saturate( |
| D dn, VFromD<Rebind<uint64_t, D>> v) { |
| const Rebind<uint64_t, D> du64; |
| const RebindToSigned<decltype(du64)> di64; |
| constexpr int kShiftAmt = static_cast<int>(sizeof(TFromD<D>) * 8) - |
| static_cast<int>(hwy::IsSigned<TFromD<D>>()); |
| |
| const auto too_big = BitCast( |
| du64, VecFromMask( |
| di64, Gt(BitCast(di64, ShiftRight<kShiftAmt>(v)), Zero(di64)))); |
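// too_big is all-ones where any bit at or above the target's value bits is set
// (bit 16 and up for u16, bit 15 and up for i16); ORing it into v saturates
// those lanes, e.g. demoting uint64_t{0x10000} to uint16_t yields 0xFFFF after
// the caller's TruncateTo.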
| return DemoteFromU64MaskOutResult(dn, Or(v, too_big)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class V> |
| HWY_INLINE VFromD<D> ReorderDemote2From64To32Combine(D dn, V a, V b) { |
| return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a)); |
| } |
| |
| } // namespace detail |
| |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), |
| HWY_IF_SIGNED_D(D)> |
| HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) { |
| const DFromV<decltype(v)> di64; |
| const RebindToUnsigned<decltype(di64)> du64; |
| const RebindToUnsigned<decltype(dn)> dn_u; |
| |
| // Negative values are saturated by first saturating their bitwise inverse |
| // and then inverting the saturation result |
| const auto invert_mask = BitCast(du64, BroadcastSignBit(v)); |
| const auto saturated_vals = Xor( |
| invert_mask, |
| detail::DemoteFromU64Saturate(dn, Xor(invert_mask, BitCast(du64, v)))); |
| return BitCast(dn, TruncateTo(dn_u, saturated_vals)); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), |
| HWY_IF_UNSIGNED_D(D)> |
| HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) { |
| const DFromV<decltype(v)> di64; |
| const RebindToUnsigned<decltype(di64)> du64; |
| |
| const auto non_neg_vals = BitCast(du64, AndNot(BroadcastSignBit(v), v)); |
| return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals)); |
| } |
| |
| template <class D, |
| HWY_IF_T_SIZE_ONE_OF_D( |
| D, ((HWY_TARGET != HWY_SSE2) ? ((1 << 1) | (1 << 2)) : 0) | |
| (1 << 4)), |
| HWY_IF_SIGNED_D(D)> |
| HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) { |
| const RebindToUnsigned<decltype(dn)> dn_u; |
| return BitCast(dn, TruncateTo(dn_u, detail::DemoteFromU64Saturate(dn, v))); |
| } |
| |
| #if HWY_TARGET == HWY_SSE2 |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), |
| HWY_IF_SIGNED_D(D)> |
| HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) { |
| const Rebind<int32_t, decltype(dn)> di32; |
| return DemoteTo(dn, DemoteTo(di32, v)); |
| } |
| #endif // HWY_TARGET == HWY_SSE2 |
| |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), |
| HWY_IF_UNSIGNED_D(D)> |
| HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) { |
| return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, v)); |
| } |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2), |
| HWY_IF_T_SIZE_D(D, 4), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int64_t, D>> a, |
| VFromD<Repartition<int64_t, D>> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2), HWY_IF_U32_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a, |
| VFromD<Repartition<uint64_t, D>> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| |
| #if HWY_TARGET > HWY_AVX3 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2), HWY_IF_I32_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a, |
| VFromD<Repartition<uint64_t, D>> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| #endif |
| |
| #if HWY_TARGET > HWY_AVX2 |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)> |
| HWY_API Vec128<int32_t> ReorderDemote2To(D dn, Vec128<int64_t> a, |
| Vec128<int64_t> b) { |
| const DFromV<decltype(a)> di64; |
| const RebindToUnsigned<decltype(di64)> du64; |
| const Half<decltype(dn)> dnh; |
| |
| // Negative values are saturated by first saturating their bitwise inverse |
| // and then inverting the saturation result |
| const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a)); |
| const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b)); |
| const auto saturated_a = Xor( |
| invert_mask_a, |
| detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a)))); |
| const auto saturated_b = Xor( |
| invert_mask_b, |
| detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b)))); |
| |
| return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)> |
| HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<int64_t> a, |
| Vec128<int64_t> b) { |
| const DFromV<decltype(a)> di64; |
| const RebindToUnsigned<decltype(di64)> du64; |
| const Half<decltype(dn)> dnh; |
| |
| const auto saturated_a = detail::DemoteFromU64Saturate( |
| dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a))); |
| const auto saturated_b = detail::DemoteFromU64Saturate( |
| dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b))); |
| |
| return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint64_t> a, |
| Vec128<uint64_t> b) { |
| const Half<decltype(dn)> dnh; |
| |
| const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a); |
| const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b); |
| |
| return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); |
| } |
| #endif // HWY_TARGET > HWY_AVX2 |
| |
| // ------------------------------ Integer <=> fp (ShiftRight, OddEven) |
| |
| #if HWY_HAVE_FLOAT16 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)> |
| HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { |
| return VFromD<D>{_mm_cvtepu16_ph(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)> |
| HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { |
| return VFromD<D>{_mm_cvtepi16_ph(v.raw)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { |
| return VFromD<D>{_mm_cvtepi32_ps(v.raw)}; |
| } |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> ConvertTo(D /*df*/, VFromD<Rebind<uint32_t, D>> v) { |
| return VFromD<D>{_mm_cvtepu32_ps(v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> ConvertTo(D /*dd*/, VFromD<Rebind<int64_t, D>> v) { |
| return VFromD<D>{_mm_cvtepi64_pd(v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> ConvertTo(D /*dd*/, VFromD<Rebind<uint64_t, D>> v) { |
| return VFromD<D>{_mm_cvtepu64_pd(v.raw)}; |
| } |
| #else // AVX2 or below |
| // Generic for all vector lengths. |
| template <class D, HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> ConvertTo(D df, VFromD<Rebind<uint32_t, D>> v) { |
| // Based on wim's approach (https://stackoverflow.com/questions/34066228/) |
| const RebindToUnsigned<decltype(df)> du32; |
| const RebindToSigned<decltype(df)> d32; |
| |
| const auto msk_lo = Set(du32, 0xFFFF); |
| const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16 |
| |
| // Extract the 16 lowest/highest significant bits of v and cast to signed int |
| const auto v_lo = BitCast(d32, And(v, msk_lo)); |
| const auto v_hi = BitCast(d32, ShiftRight<16>(v)); |
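// Worked example: v = 0xFFFFFFFFu gives v_lo = 65535 and v_hi = 65535; the
// MulAdd below evaluates 65536.0f * 65535.0f + 65535.0f, which rounds to
// 4294967296.0f, the closest f32 to 4294967295.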
| return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo)); |
| } |
| |
| // Generic for all vector lengths. |
| template <class D, HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<int64_t, D>> v) { |
| // Based on wim's approach (https://stackoverflow.com/questions/41144668/) |
| const Repartition<uint32_t, decltype(dd)> d32; |
| const Repartition<uint64_t, decltype(dd)> d64; |
| |
| // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 |
| const auto k84_63 = Set(d64, 0x4530000080000000ULL); |
| const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); |
| |
| // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) |
| const auto k52 = Set(d32, 0x43300000); |
| const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); |
| |
| const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); |
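// Worked example: for v = 1, v_upper is 2^84 + 2^63 (the high 32 bits of v are
// zero) and v_lower has bit pattern 0x4330000000000001 = 2^52 + 1, so the sum
// below is (2^84 + 2^63) - (2^84 + 2^63 + 2^52) + (2^52 + 1) = 1.0.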
| return (v_upper - k84_63_52) + v_lower; // order matters! |
| } |
| |
| namespace detail { |
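// U64ToF64VecFast converts w (here a 32-bit half of a u64, so w < 2^52) to
// double by ORing w into the bit pattern of 2^52, which yields the double
// 2^52 + w, and then subtracting 2^52.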
| template <class VW> |
| HWY_INLINE VFromD<Rebind<double, DFromV<VW>>> U64ToF64VecFast(VW w) { |
| const DFromV<decltype(w)> d64; |
| const RebindToFloat<decltype(d64)> dd; |
| const auto cnst2_52_dbl = Set(dd, 0x0010000000000000); // 2^52 |
| return BitCast(dd, Or(w, BitCast(d64, cnst2_52_dbl))) - cnst2_52_dbl; |
| } |
| } // namespace detail |
| |
| // Generic for all vector lengths. |
| template <class D, HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<uint64_t, D>> v) { |
| // Based on wim's approach (https://stackoverflow.com/questions/41144668/) |
| const RebindToUnsigned<decltype(dd)> d64; |
| using VU = VFromD<decltype(d64)>; |
| |
| const VU msk_lo = Set(d64, 0xFFFFFFFF); |
| const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32 |
| |
| // Extract the 32 lowest/highest significant bits of v |
| const VU v_lo = And(v, msk_lo); |
| const VU v_hi = ShiftRight<32>(v); |
| |
| const auto v_lo_dbl = detail::U64ToF64VecFast(v_lo); |
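// Worked example: v = 2^63 splits into v_hi = 2^31 and v_lo = 0, so the result
// is 4294967296.0 * 2147483648.0 + 0.0 = 9223372036854775808.0.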
| return MulAdd(cnst2_32_dbl, detail::U64ToF64VecFast(v_hi), v_lo_dbl); |
| } |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| // Truncates (rounds toward zero). |
| |
| #ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO |
| #undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO |
| #else |
| #define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO |
| #endif |
| |
| #if HWY_HAVE_FLOAT16 |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)> |
| HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, VFromD<RebindToFloat<D>> v) { |
| #if HWY_COMPILER_GCC_ACTUAL |
| // Workaround for undefined behavior in _mm_cvttph_epi16 if any values of v[i] |
| // are not within the range of an int16_t |
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \ |
| HWY_HAVE_SCALAR_F16_TYPE |
| if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) { |
| typedef hwy::float16_t::Native GccF16RawVectType |
| __attribute__((__vector_size__(16))); |
| const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw); |
| return Dup128VecFromValues( |
| D(), detail::X86ConvertScalarFromFloat<int16_t>(raw_v[0]), |
| detail::X86ConvertScalarFromFloat<int16_t>(raw_v[1]), |
| detail::X86ConvertScalarFromFloat<int16_t>(raw_v[2]), |
| detail::X86ConvertScalarFromFloat<int16_t>(raw_v[3]), |
| detail::X86ConvertScalarFromFloat<int16_t>(raw_v[4]), |
| detail::X86ConvertScalarFromFloat<int16_t>(raw_v[5]), |
| detail::X86ConvertScalarFromFloat<int16_t>(raw_v[6]), |
| detail::X86ConvertScalarFromFloat<int16_t>(raw_v[7])); |
| } |
| #endif |
| |
| __m128i raw_result; |
| __asm__("vcvttph2w {%1, %0|%0, %1}" |
| : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) |
| : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) |
| :); |
| return VFromD<D>{raw_result}; |
| #else // !HWY_COMPILER_GCC_ACTUAL |
| return VFromD<D>{_mm_cvttph_epi16(v.raw)}; |
| #endif |
| } |
| |
| // F16 to I16 ConvertTo is generic for all vector lengths |
| template <class D, HWY_IF_I16_D(D)> |
| HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) { |
| const RebindToFloat<decltype(di)> df; |
| // See comment at the first occurrence of "IfThenElse(overflow,". |
| const MFromD<D> overflow = |
| RebindMask(di, Ge(v, Set(df, ConvertScalarTo<hwy::float16_t>(32768.0f)))); |
| return IfThenElse(overflow, Set(di, LimitsMax<int16_t>()), |
| ConvertInRangeTo(di, v)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) { |
| #if HWY_COMPILER_GCC_ACTUAL |
| // Workaround for undefined behavior in _mm_cvttph_epu16 if any values of v[i] |
// are not within the range of a uint16_t
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \ |
| HWY_HAVE_SCALAR_F16_TYPE |
| if (detail::IsConstantX86VecForF2IConv<uint16_t>(v)) { |
| typedef hwy::float16_t::Native GccF16RawVectType |
| __attribute__((__vector_size__(16))); |
| const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw); |
| return Dup128VecFromValues( |
| D(), detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[0]), |
| detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[1]), |
| detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[2]), |
| detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[3]), |
| detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[4]), |
| detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[5]), |
| detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[6]), |
| detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[7])); |
| } |
| #endif |
| |
| __m128i raw_result; |
| __asm__("vcvttph2uw {%1, %0|%0, %1}" |
| : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) |
| : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) |
| :); |
| return VFromD<D>{raw_result}; |
| #else // !HWY_COMPILER_GCC_ACTUAL |
| return VFromD<D>{_mm_cvttph_epu16(v.raw)}; |
| #endif |
| } |
| |
| // F16->U16 ConvertTo is generic for all vector lengths |
| template <class D, HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) { |
| return ConvertInRangeTo(D(), ZeroIfNegative(v)); |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> |
| HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, VFromD<RebindToFloat<D>> v) { |
| #if HWY_COMPILER_GCC_ACTUAL |
| // Workaround for undefined behavior in _mm_cvttps_epi32 with GCC if any |
| // values of v[i] are not within the range of an int32_t |
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) { |
| typedef float GccF32RawVectType __attribute__((__vector_size__(16))); |
| const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw); |
| return Dup128VecFromValues( |
| D(), detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]), |
| detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]), |
| detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]), |
| detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3])); |
| } |
| #endif |
| |
| __m128i raw_result; |
| __asm__("%vcvttps2dq {%1, %0|%0, %1}" |
| : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) |
| : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) |
| :); |
| return VFromD<D>{raw_result}; |
| #else // !HWY_COMPILER_GCC_ACTUAL |
| return VFromD<D>{_mm_cvttps_epi32(v.raw)}; |
| #endif |
| } |
| |
| // F32 to I32 ConvertTo is generic for all vector lengths |
| template <class D, HWY_IF_I32_D(D)> |
| HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) { |
| const RebindToFloat<decltype(di)> df; |
| // See comment at the first occurrence of "IfThenElse(overflow,". |
| const MFromD<D> overflow = RebindMask(di, Ge(v, Set(df, 2147483648.0f))); |
| return IfThenElse(overflow, Set(di, LimitsMax<int32_t>()), |
| ConvertInRangeTo(di, v)); |
| } |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)> |
| HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) { |
| #if HWY_COMPILER_GCC_ACTUAL |
| // Workaround for undefined behavior in _mm_cvttpd_epi64 with GCC if any |
| // values of v[i] are not within the range of an int64_t |
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) { |
| typedef double GccF64RawVectType __attribute__((__vector_size__(16))); |
| const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw); |
| return Dup128VecFromValues( |
| DI(), detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]), |
| detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1])); |
| } |
| #endif |
| |
| __m128i raw_result; |
| __asm__("vcvttpd2qq {%1, %0|%0, %1}" |
| : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) |
| : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) |
| :); |
| return VFromD<DI>{raw_result}; |
| #else // !HWY_COMPILER_GCC_ACTUAL |
| return VFromD<DI>{_mm_cvttpd_epi64(v.raw)}; |
| #endif |
| } |
| |
| // F64 to I64 ConvertTo is generic for all vector lengths on AVX3 |
| template <class DI, HWY_IF_I64_D(DI)> |
| HWY_API VFromD<DI> ConvertTo(DI di, VFromD<RebindToFloat<DI>> v) { |
| const RebindToFloat<decltype(di)> df; |
| // See comment at the first occurrence of "IfThenElse(overflow,". |
| const MFromD<DI> overflow = |
| RebindMask(di, Ge(v, Set(df, 9.223372036854776e18))); |
| return IfThenElse(overflow, Set(di, LimitsMax<int64_t>()), |
| ConvertInRangeTo(di, v)); |
| } |
| |
| template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U32_D(DU)> |
| HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) { |
| #if HWY_COMPILER_GCC_ACTUAL |
| // Workaround for undefined behavior in _mm_cvttps_epu32 with GCC if any |
// values of v[i] are not within the range of a uint32_t
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) { |
| typedef float GccF32RawVectType __attribute__((__vector_size__(16))); |
| const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw); |
| return Dup128VecFromValues( |
| DU(), detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0]), |
| detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1]), |
| detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2]), |
| detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3])); |
| } |
| #endif |
| |
| __m128i raw_result; |
| __asm__("vcvttps2udq {%1, %0|%0, %1}" |
| : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) |
| : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) |
| :); |
| return VFromD<DU>{raw_result}; |
| #else // !HWY_COMPILER_GCC_ACTUAL |
| return VFromD<DU>{_mm_cvttps_epu32(v.raw)}; |
| #endif |
| } |
| |
| // F32->U32 ConvertTo is generic for all vector lengths |
| template <class DU, HWY_IF_U32_D(DU)> |
| HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) { |
| return ConvertInRangeTo(DU(), ZeroIfNegative(v)); |
| } |
| |
| template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U64_D(DU)> |
| HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) { |
| #if HWY_COMPILER_GCC_ACTUAL |
| // Workaround for undefined behavior in _mm_cvttpd_epu64 with GCC if any |
// values of v[i] are not within the range of a uint64_t
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) { |
| typedef double GccF64RawVectType __attribute__((__vector_size__(16))); |
| const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw); |
| return Dup128VecFromValues( |
| DU(), detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0]), |
| detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])); |
| } |
| #endif |
| |
| __m128i raw_result; |
| __asm__("vcvttpd2uqq {%1, %0|%0, %1}" |
| : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) |
| : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) |
| :); |
| return VFromD<DU>{raw_result}; |
| #else // !HWY_COMPILER_GCC_ACTUAL |
| return VFromD<DU>{_mm_cvttpd_epu64(v.raw)}; |
| #endif |
| } |
| |
| // F64->U64 ConvertTo is generic for all vector lengths |
| template <class DU, HWY_IF_U64_D(DU)> |
| HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) { |
| return ConvertInRangeTo(DU(), ZeroIfNegative(v)); |
| } |
| |
| #else // AVX2 or below |
| |
| namespace detail { |
| |
| template <class DU32, HWY_IF_U32_D(DU32)> |
| static HWY_INLINE VFromD<DU32> ConvInRangeF32ToU32( |
| DU32 du32, VFromD<RebindToFloat<DU32>> v, VFromD<DU32>& exp_diff) { |
| const RebindToSigned<decltype(du32)> di32; |
| const RebindToFloat<decltype(du32)> df32; |
| |
| exp_diff = Set(du32, uint32_t{158}) - ShiftRight<23>(BitCast(du32, v)); |
| const auto scale_down_f32_val_mask = |
| VecFromMask(du32, Eq(exp_diff, Zero(du32))); |
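// Lanes with biased exponent 158 (values in [2^31, 2^32)) are halved by
// decrementing the exponent, converted via the signed path below, and then
// doubled again; exp_diff is negative (as signed) for inputs >= 2^32, which
// the ConvertTo caller uses to saturate.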
| |
| const auto v_scaled = |
| BitCast(df32, BitCast(du32, v) + ShiftLeft<23>(scale_down_f32_val_mask)); |
| const auto f32_to_u32_result = |
| BitCast(du32, ConvertInRangeTo(di32, v_scaled)); |
| |
| return f32_to_u32_result + And(f32_to_u32_result, scale_down_f32_val_mask); |
| } |
| |
| } // namespace detail |
| |
| // F32 to U32 ConvertInRangeTo is generic for all vector lengths on |
| // SSE2/SSSE3/SSE4/AVX2 |
| template <class DU32, HWY_IF_U32_D(DU32)> |
| HWY_API VFromD<DU32> ConvertInRangeTo(DU32 du32, |
| VFromD<RebindToFloat<DU32>> v) { |
| VFromD<DU32> exp_diff; |
| const auto f32_to_u32_result = detail::ConvInRangeF32ToU32(du32, v, exp_diff); |
| return f32_to_u32_result; |
| } |
| |
| // F32 to U32 ConvertTo is generic for all vector lengths on |
| // SSE2/SSSE3/SSE4/AVX2 |
| template <class DU32, HWY_IF_U32_D(DU32)> |
| HWY_API VFromD<DU32> ConvertTo(DU32 du32, VFromD<RebindToFloat<DU32>> v) { |
| const RebindToSigned<decltype(du32)> di32; |
| |
| const auto non_neg_v = ZeroIfNegative(v); |
| VFromD<DU32> exp_diff; |
| const auto f32_to_u32_result = |
| detail::ConvInRangeF32ToU32(du32, non_neg_v, exp_diff); |
| |
| return Or(f32_to_u32_result, |
| BitCast(du32, BroadcastSignBit(BitCast(di32, exp_diff)))); |
| } |
| |
| namespace detail { |
| |
| template <class D64, HWY_IF_UI64_D(D64)> |
| HWY_API VFromD<D64> ConvAbsInRangeF64ToUI64(D64 d64, |
| VFromD<Rebind<double, D64>> v, |
| VFromD<D64>& biased_exp) { |
| const RebindToSigned<decltype(d64)> di64; |
| const RebindToUnsigned<decltype(d64)> du64; |
| using VU64 = VFromD<decltype(du64)>; |
| const Repartition<uint16_t, decltype(di64)> du16; |
const VU64 k1075 = Set(du64, 1075);  // biased exponent of 2^52
| |
// Exponent indicates whether the number can be represented as a 64-bit integer.
| biased_exp = BitCast(d64, ShiftRight<52>(BitCast(du64, v))); |
| HWY_IF_CONSTEXPR(IsSigned<TFromD<D64>>()) { |
| biased_exp = And(biased_exp, Set(d64, TFromD<D64>{0x7FF})); |
| } |
| |
| // If we were to cap the exponent at 51 and add 2^52, the number would be in |
| // [2^52, 2^53) and mantissa bits could be read out directly. We need to |
| // round-to-0 (truncate), but changing rounding mode in MXCSR hits a |
| // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead |
| // manually shift the mantissa into place (we already have many of the |
| // inputs anyway). |
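//
// Worked example: v = 3.0 has biased_exp = 1024, so shift_mnt = 1075 - 1024 =
// 51 and shift_int = 0; int53 = (2^51 | 2^52) >> 51 = 3 and the final result
// is 3 << 0 = 3.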
| |
| // Use 16-bit saturated unsigned subtraction to compute shift_mnt and |
| // shift_int since biased_exp[i] is a non-negative integer that is less than |
| // or equal to 2047. |
| |
| // 16-bit saturated unsigned subtraction is also more efficient than a |
| // 64-bit subtraction followed by a 64-bit signed Max operation on |
| // SSE2/SSSE3/SSE4/AVX2. |
| |
| // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be |
| // zero as the upper 48 bits of both k1075 and biased_exp are zero. |
| |
| const VU64 shift_mnt = BitCast( |
| du64, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp))); |
| const VU64 shift_int = BitCast( |
| du64, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075))); |
| const VU64 mantissa = BitCast(du64, v) & Set(du64, (1ULL << 52) - 1); |
| // Include implicit 1-bit. NOTE: the shift count may exceed 63; we rely on x86 |
| // returning zero in that case. |
| const VU64 int53 = (mantissa | Set(du64, 1ULL << 52)) >> shift_mnt; |
| |
| // For inputs larger than 2^53 - 1, insert zeros at the bottom. |
| |
// For inputs less than 2^64, the implicit 1-bit cannot be shifted out of the
// left shift result below, because shift_int[i] <= 11 for any such input.
| |
| return BitCast(d64, int53 << shift_int); |
| } |
| |
| } // namespace detail |
| |
| #if HWY_ARCH_X86_64 |
| |
| namespace detail { |
| |
| template <size_t N> |
| static HWY_INLINE int64_t SSE2ConvFirstF64LaneToI64(Vec128<double, N> v) { |
| #if HWY_COMPILER_GCC_ACTUAL |
| // Workaround for undefined behavior in _mm_cvttsd_si64 with GCC if v[0] is |
| // not within the range of an int64_t |
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| if (IsConstantX86Vec(hwy::SizeTag<1>(), v)) { |
| typedef double GccF64RawVectType __attribute__((__vector_size__(16))); |
| const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw); |
| return X86ConvertScalarFromFloat<int64_t>(raw_v[0]); |
| } |
| #endif |
| |
| int64_t result; |
| __asm__("%vcvttsd2si {%1, %0|%0, %1}" |
| : "=r"(result) |
| : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) |
| :); |
| return result; |
| #else |
| return _mm_cvttsd_si64(v.raw); |
| #endif |
| } |
| |
| } // namespace detail |
| |
| template <class DI, HWY_IF_V_SIZE_D(DI, 8), HWY_IF_I64_D(DI)> |
| HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, Vec64<double> v) { |
| return VFromD<DI>{_mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToI64(v))}; |
| } |
| template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I64_D(DI)> |
| HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, Vec128<double> v) { |
| const __m128i i0 = _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToI64(v)); |
| const Full64<double> dd2; |
| const __m128i i1 = |
| _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToI64(UpperHalf(dd2, v))); |
| return VFromD<DI>{_mm_unpacklo_epi64(i0, i1)}; |
| } |
| |
| template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)> |
| HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) { |
| const RebindToFloat<decltype(di)> df; |
| // See comment at the first occurrence of "IfThenElse(overflow,". |
| const MFromD<DI> overflow = |
| RebindMask(di, Ge(v, Set(df, 9.223372036854776e18))); |
| return IfThenElse(overflow, Set(di, LimitsMax<int64_t>()), |
| ConvertInRangeTo(di, v)); |
| } |
| #endif // HWY_ARCH_X86_64 |
| |
| #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 |
| template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)), |
| HWY_IF_I64_D(DI)> |
| HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<Rebind<double, DI>> v) { |
| using VI = VFromD<DI>; |
| |
| VI biased_exp; |
| const VI shifted = detail::ConvAbsInRangeF64ToUI64(di, v, biased_exp); |
| const VI sign_mask = BroadcastSignBit(BitCast(di, v)); |
| |
| // If the input was negative, negate the integer (two's complement). |
| return (shifted ^ sign_mask) - sign_mask; |
| } |
| |
| template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)), |
| HWY_IF_I64_D(DI)> |
| HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) { |
| using VI = VFromD<DI>; |
| |
| VI biased_exp; |
| const VI shifted = detail::ConvAbsInRangeF64ToUI64(di, v, biased_exp); |
| |
| #if HWY_TARGET <= HWY_SSE4 |
| const auto in_range = biased_exp < Set(di, 1086); |
| #else |
| const Repartition<int32_t, decltype(di)> di32; |
| const auto in_range = MaskFromVec(BitCast( |
| di, |
| VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) < Set(di32, 1086)))); |
| #endif |
| |
| // Saturate to LimitsMin (unchanged when negating below) or LimitsMax. |
| const VI sign_mask = BroadcastSignBit(BitCast(di, v)); |
| const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask; |
| const VI magnitude = IfThenElse(in_range, shifted, limit); |
| |
| // If the input was negative, negate the integer (two's complement). |
| return (magnitude ^ sign_mask) - sign_mask; |
| } |
| #endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 |
| |
| // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 |
| template <class DU, HWY_IF_U64_D(DU)> |
| HWY_API VFromD<DU> ConvertInRangeTo(DU du, VFromD<Rebind<double, DU>> v) { |
| VFromD<DU> biased_exp; |
| const auto shifted = detail::ConvAbsInRangeF64ToUI64(du, v, biased_exp); |
| return shifted; |
| } |
| |
| // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 |
| template <class DU, HWY_IF_U64_D(DU)> |
| HWY_API VFromD<DU> ConvertTo(DU du, VFromD<Rebind<double, DU>> v) { |
| const RebindToSigned<DU> di; |
| using VU = VFromD<DU>; |
| |
| VU biased_exp; |
| const VU shifted = |
| detail::ConvAbsInRangeF64ToUI64(du, ZeroIfNegative(v), biased_exp); |
| |
| // Exponent indicates whether the number can be represented as uint64_t. |
| #if HWY_TARGET <= HWY_SSE4 |
| const VU out_of_range = |
| BitCast(du, VecFromMask(di, BitCast(di, biased_exp) > Set(di, 1086))); |
| #else |
| const Repartition<int32_t, decltype(di)> di32; |
| const VU out_of_range = BitCast( |
| du, |
| VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) > Set(di32, 1086))); |
| #endif |
| |
| return (shifted | out_of_range); |
| } |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| namespace detail { |
| |
| template <class TTo, class TF, HWY_IF_SIGNED(TTo)> |
| static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CXX14_CONSTEXPR TTo |
| X86ScalarNearestInt(TF flt_val) { |
| #if HWY_HAVE_SCALAR_F16_TYPE && HWY_HAVE_SCALAR_F16_OPERATORS |
using TFArith = If<hwy::IsSame<RemoveCvRef<TF>, hwy::bfloat16_t>(), float,
RemoveCvRef<TF>>;
| #else |
| using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>; |
| #endif |
| |
| const TTo trunc_int_val = X86ConvertScalarFromFloat<TTo>(flt_val); |
| const TFArith abs_val_diff = ScalarAbs( |
| ConvertScalarTo<TFArith>(ConvertScalarTo<TFArith>(flt_val) - |
| ConvertScalarTo<TFArith>(trunc_int_val))); |
| constexpr TFArith kHalf = ConvertScalarTo<TFArith>(0.5); |
| |
| const bool round_result_up = |
| ((trunc_int_val ^ ScalarShr(trunc_int_val, sizeof(TTo) * 8 - 1)) != |
| LimitsMax<TTo>()) && |
| (abs_val_diff > kHalf || |
| (abs_val_diff == kHalf && (trunc_int_val & 1) != 0)); |
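// round_result_up implements round-to-nearest, ties-to-even: adjust by one
// (away from zero) when the discarded fraction exceeds 0.5, or equals 0.5 and
// the truncated value is odd; already-saturated values are left unchanged.
// E.g. 2.5 -> 2 and 3.5 -> 4.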
| return static_cast<TTo>( |
| trunc_int_val + |
| (round_result_up ? (ScalarSignBit(flt_val) ? (-1) : 1) : 0)); |
| } |
| |
| } // namespace detail |
| #endif // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| |
| // If these are in namespace detail, the x86_256/512 templates are not found. |
| template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I32_D(DI)> |
| static HWY_INLINE VFromD<DI> NearestIntInRange(DI, |
| VFromD<RebindToFloat<DI>> v) { |
| #if HWY_COMPILER_GCC_ACTUAL |
| // Workaround for undefined behavior in _mm_cvtps_epi32 with GCC if any values |
| // of v[i] are not within the range of an int32_t |
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) { |
| typedef float GccF32RawVectType __attribute__((__vector_size__(16))); |
| const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw); |
| return Dup128VecFromValues(DI(), |
| detail::X86ScalarNearestInt<int32_t>(raw_v[0]), |
| detail::X86ScalarNearestInt<int32_t>(raw_v[1]), |
| detail::X86ScalarNearestInt<int32_t>(raw_v[2]), |
| detail::X86ScalarNearestInt<int32_t>(raw_v[3])); |
| } |
| #endif |
| |
| __m128i raw_result; |
| __asm__("%vcvtps2dq {%1, %0|%0, %1}" |
| : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) |
| : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) |
| :); |
| return VFromD<DI>{raw_result}; |
| #else // !HWY_COMPILER_GCC_ACTUAL |
| return VFromD<DI>{_mm_cvtps_epi32(v.raw)}; |
| #endif |
| } |
| |
| #if HWY_HAVE_FLOAT16 |
| template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I16_D(DI)> |
| static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/, |
| VFromD<RebindToFloat<DI>> v) { |
| #if HWY_COMPILER_GCC_ACTUAL |
| // Workaround for undefined behavior in _mm_cvtph_epi16 if any values of v[i] |
| // are not within the range of an int16_t |
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \ |
| HWY_HAVE_SCALAR_F16_TYPE |
| if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) { |
| typedef hwy::float16_t::Native GccF16RawVectType |
| __attribute__((__vector_size__(16))); |
| const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw); |
| return Dup128VecFromValues(DI(), |
| detail::X86ScalarNearestInt<int16_t>(raw_v[0]), |
| detail::X86ScalarNearestInt<int16_t>(raw_v[1]), |
| detail::X86ScalarNearestInt<int16_t>(raw_v[2]), |
| detail::X86ScalarNearestInt<int16_t>(raw_v[3]), |
| detail::X86ScalarNearestInt<int16_t>(raw_v[4]), |
| detail::X86ScalarNearestInt<int16_t>(raw_v[5]), |
| detail::X86ScalarNearestInt<int16_t>(raw_v[6]), |
| detail::X86ScalarNearestInt<int16_t>(raw_v[7])); |
| } |
| #endif |
| |
| __m128i raw_result; |
| __asm__("vcvtph2w {%1, %0|%0, %1}" |
| : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) |
| : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) |
| :); |
| return VFromD<DI>{raw_result}; |
| #else // !HWY_COMPILER_GCC_ACTUAL |
| return VFromD<DI>{_mm_cvtph_epi16(v.raw)}; |
| #endif |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| |
| template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)> |
| static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/, |
| VFromD<RebindToFloat<DI>> v) { |
| #if HWY_COMPILER_GCC_ACTUAL |
| // Workaround for undefined behavior in _mm_cvtpd_epi64 with GCC if any |
| // values of v[i] are not within the range of an int64_t |
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) { |
| typedef double GccF64RawVectType __attribute__((__vector_size__(16))); |
| const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw); |
| return Dup128VecFromValues(DI(), |
| detail::X86ScalarNearestInt<int64_t>(raw_v[0]), |
| detail::X86ScalarNearestInt<int64_t>(raw_v[1])); |
| } |
| #endif |
| |
| __m128i raw_result; |
| __asm__("vcvtpd2qq {%1, %0|%0, %1}" |
| : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) |
| : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) |
| :); |
| return VFromD<DI>{raw_result}; |
| #else // !HWY_COMPILER_GCC_ACTUAL |
| return VFromD<DI>{_mm_cvtpd_epi64(v.raw)}; |
| #endif |
| } |
| |
| #else // HWY_TARGET > HWY_AVX3 |
| |
| namespace detail { |
| |
| #if HWY_ARCH_X86_64 |
| template <size_t N> |
| static HWY_INLINE int64_t |
| SSE2ConvFirstF64LaneToNearestI64(Vec128<double, N> v) { |
| #if HWY_COMPILER_GCC_ACTUAL |
| // Workaround for undefined behavior in _mm_cvtsd_si64 with GCC if v[0] is |
| // not within the range of an int64_t |
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| if (IsConstantX86Vec(hwy::SizeTag<1>(), v)) { |
| typedef double GccF64RawVectType __attribute__((__vector_size__(16))); |
| const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw); |
| return X86ScalarNearestInt<int64_t>(raw_v[0]); |
| } |
| #endif |
| |
| int64_t result; |
| __asm__("%vcvtsd2si {%1, %0|%0, %1}" |
| : "=r"(result) |
| : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) |
| :); |
| return result; |
| #else |
| return _mm_cvtsd_si64(v.raw); |
| #endif |
| } |
| #endif // HWY_ARCH_X86_64 |
| |
| #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 |
| template <class DI64, HWY_IF_I64_D(DI64)> |
| static HWY_INLINE VFromD<DI64> SSE2NearestI64InRange( |
| DI64 di64, VFromD<RebindToFloat<DI64>> v) { |
| const RebindToFloat<DI64> df64; |
| const RebindToUnsigned<DI64> du64; |
| using VI64 = VFromD<decltype(di64)>; |
| |
| const auto mant_end = Set(df64, MantissaEnd<double>()); |
| const auto is_small = Lt(Abs(v), mant_end); |
| |
| const auto adj_v = Max(v, Set(df64, -9223372036854775808.0)) + |
| IfThenElseZero(is_small, CopySignToAbs(mant_end, v)); |
| const auto adj_v_biased_exp = |
| And(BitCast(di64, ShiftRight<52>(BitCast(du64, adj_v))), |
| Set(di64, int64_t{0x7FF})); |
| |
| // We can simply subtract 1075 from adj_v_biased_exp[i] to get shift_int since |
| // adj_v_biased_exp[i] is at least 1075 |
| const VI64 shift_int = adj_v_biased_exp + Set(di64, int64_t{-1075}); |
| |
| const VI64 mantissa = BitCast(di64, adj_v) & Set(di64, (1LL << 52) - 1); |
| // Include implicit 1-bit if is_small[i] is 0. NOTE: the shift count may |
| // exceed 63; we rely on x86 returning zero in that case. |
| const VI64 int53 = mantissa | IfThenZeroElse(RebindMask(di64, is_small), |
| Set(di64, 1LL << 52)); |
| |
| const VI64 sign_mask = BroadcastSignBit(BitCast(di64, v)); |
| // If the input was negative, negate the integer (two's complement). |
| return ((int53 << shift_int) ^ sign_mask) - sign_mask; |
| } |
| #endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 |
| |
| } // namespace detail |
| |
| #if HWY_ARCH_X86_64 |
| template <class DI, HWY_IF_V_SIZE_D(DI, 8), HWY_IF_I64_D(DI)> |
| static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/, Vec64<double> v) { |
| return VFromD<DI>{ |
| _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToNearestI64(v))}; |
| } |
| template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I64_D(DI)> |
| static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/, Vec128<double> v) { |
| const __m128i i0 = |
| _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToNearestI64(v)); |
| const Full64<double> dd2; |
| const __m128i i1 = _mm_cvtsi64_si128( |
| detail::SSE2ConvFirstF64LaneToNearestI64(UpperHalf(dd2, v))); |
| return VFromD<DI>{_mm_unpacklo_epi64(i0, i1)}; |
| } |
| #endif // HWY_ARCH_X86_64 |
| |
| #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 |
| template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)), |
| HWY_IF_I64_D(DI)> |
| static HWY_INLINE VFromD<DI> NearestIntInRange(DI di, |
| VFromD<RebindToFloat<DI>> v) { |
| return detail::SSE2NearestI64InRange(di, v); |
| } |
| #endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 |
| |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| template <class DI, HWY_IF_V_SIZE_LE_D(DI, 8), HWY_IF_I32_D(DI)> |
| static HWY_INLINE VFromD<DI> DemoteToNearestIntInRange( |
| DI, VFromD<Rebind<double, DI>> v) { |
| #if HWY_COMPILER_GCC_ACTUAL |
| // Workaround for undefined behavior in _mm_cvtpd_epi32 with GCC if any values |
| // of v[i] are not within the range of an int32_t |
| |
| #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD |
| if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) { |
typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
| return Dup128VecFromValues( |
| DI(), detail::X86ScalarNearestInt<int32_t>(raw_v[0]), |
| detail::X86ScalarNearestInt<int32_t>(raw_v[1]), int32_t{0}, int32_t{0}); |
| } |
| #endif |
| |
| __m128i raw_result; |
| __asm__("%vcvtpd2dq {%1, %0|%0, %1}" |
| : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) |
| : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) |
| :); |
| return VFromD<DI>{raw_result}; |
| #else // !HWY_COMPILER_GCC_ACTUAL |
| return VFromD<DI>{_mm_cvtpd_epi32(v.raw)}; |
| #endif |
| } |
| |
| // F16/F32/F64 NearestInt is generic for all vector lengths |
| template <class VF, class DF = DFromV<VF>, class DI = RebindToSigned<DF>, |
| HWY_IF_FLOAT_D(DF), |
| HWY_IF_T_SIZE_ONE_OF_D(DF, (1 << 4) | (1 << 8) | |
| (HWY_HAVE_FLOAT16 ? (1 << 2) : 0))> |
| HWY_API VFromD<DI> NearestInt(const VF v) { |
| const DI di; |
| using TI = TFromD<DI>; |
| using TF = TFromD<DF>; |
| using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>; |
| |
| constexpr TFArith kMinOutOfRangePosVal = |
| static_cast<TFArith>(-static_cast<TFArith>(LimitsMin<TI>())); |
| static_assert(kMinOutOfRangePosVal > static_cast<TFArith>(0.0), |
| "kMinOutOfRangePosVal > 0.0 must be true"); |
| |
| // See comment at the first occurrence of "IfThenElse(overflow,". |
// Here we round, whereas the previous occurrences truncate, but the outcome is
// the same because the largest float below the threshold is well within the
// range of TI.
| const auto overflow = RebindMask( |
| di, Ge(v, Set(DF(), ConvertScalarTo<TF>(kMinOutOfRangePosVal)))); |
| auto result = |
| IfThenElse(overflow, Set(di, LimitsMax<TI>()), NearestIntInRange(di, v)); |
| |
| return result; |
| } |
| |
| template <class DI, HWY_IF_I32_D(DI)> |
| HWY_API VFromD<DI> DemoteToNearestInt(DI, VFromD<Rebind<double, DI>> v) { |
| const DI di; |
| const Rebind<double, DI> df64; |
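// 2147483647.0 is exactly representable in f64, so the Min below saturates
// positive out-of-range (and +inf) inputs to LimitsMax<int32_t>.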
| return DemoteToNearestIntInRange(di, Min(v, Set(df64, 2147483647.0))); |
| } |
| |
| // ------------------------------ Floating-point rounding (ConvertTo) |
| |
| #if HWY_TARGET >= HWY_SSSE3 |
| |
| // Toward nearest integer, ties to even |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Round(const Vec128<T, N> v) { |
| static_assert(IsFloat<T>(), "Only for float"); |
| // Rely on rounding after addition with a large value such that no mantissa |
| // bits remain (assuming the current mode is nearest-even). We may need a |
| // compiler flag for precise floating-point to prevent "optimizing" this out. |
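// Worked example: Round(2.5f) computes 8388608.0f + 2.5f, which rounds to
// 8388610.0f under nearest-even, and then subtracts 8388608.0f, giving 2.0f.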
| const DFromV<decltype(v)> df; |
| const auto max = Set(df, MantissaEnd<T>()); |
| const auto large = CopySignToAbs(max, v); |
| const auto added = large + v; |
| const auto rounded = added - large; |
| // Keep original if NaN or the magnitude is large (already an int). |
| return IfThenElse(Abs(v) < max, rounded, v); |
| } |
| |
| namespace detail { |
| |
| // Truncating to integer and converting back to float is correct except when the |
| // input magnitude is large, in which case the input was already an integer |
| // (because mantissa >> exponent is zero). |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> UseInt(const Vec128<T, N> v) { |
| static_assert(IsFloat<T>(), "Only for float"); |
| const DFromV<decltype(v)> d; |
| return Abs(v) < Set(d, MantissaEnd<T>()); |
| } |
| |
| } // namespace detail |
| |
| // Toward zero, aka truncate |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) { |
| static_assert(IsFloat<T>(), "Only for float"); |
| const DFromV<decltype(v)> df; |
| const RebindToSigned<decltype(df)> di; |
| |
| const auto integer = ConvertInRangeTo(di, v); // round toward 0 |
| const auto int_f = ConvertTo(df, integer); |
| |
| return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v); |
| } |
| |
| // Toward +infinity, aka ceiling |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) { |
| static_assert(IsFloat<T>(), "Only for float"); |
| const DFromV<decltype(v)> df; |
| const RebindToSigned<decltype(df)> di; |
| |
| const auto integer = ConvertInRangeTo(di, v); // round toward 0 |
| const auto int_f = ConvertTo(df, integer); |
| |
| // Truncating a positive non-integer ends up smaller; if so, add 1. |
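// E.g. Ceil(1.2f): int_f = 1.0f < 1.2f, so neg1 = -1.0f and int_f - neg1 = 2.0f.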
| const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); |
| |
| return IfThenElse(detail::UseInt(v), int_f - neg1, v); |
| } |
| |
| #ifdef HWY_NATIVE_CEIL_FLOOR_INT |
| #undef HWY_NATIVE_CEIL_FLOOR_INT |
| #else |
| #define HWY_NATIVE_CEIL_FLOOR_INT |
| #endif |
| |
| template <class V, HWY_IF_FLOAT_V(V)> |
| HWY_API VFromD<RebindToSigned<DFromV<V>>> CeilInt(V v) { |
| const DFromV<decltype(v)> df; |
| const RebindToSigned<decltype(df)> di; |
| |
| const auto integer = ConvertTo(di, v); // round toward 0 |
| const auto int_f = ConvertTo(df, integer); |
| |
| // Truncating a positive non-integer ends up smaller; if so, add 1. |
| return integer - |
| VecFromMask(di, RebindMask(di, And(detail::UseInt(v), int_f < v))); |
| } |
| |
| // Toward -infinity, aka floor |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) { |
| static_assert(IsFloat<T>(), "Only for float"); |
| const DFromV<decltype(v)> df; |
| const RebindToSigned<decltype(df)> di; |
| |
| const auto integer = ConvertInRangeTo(di, v); // round toward 0 |
| const auto int_f = ConvertTo(df, integer); |
| |
| // Truncating a negative non-integer ends up larger; if so, subtract 1. |
| const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v))); |
| |
| return IfThenElse(detail::UseInt(v), int_f + neg1, v); |
| } |
| |
| template <class V, HWY_IF_FLOAT_V(V)> |
| HWY_API VFromD<RebindToSigned<DFromV<V>>> FloorInt(V v) { |
| const DFromV<decltype(v)> df; |
| const RebindToSigned<decltype(df)> di; |
| |
| const auto integer = ConvertTo(di, v); // round toward 0 |
| const auto int_f = ConvertTo(df, integer); |
| |
| // Truncating a negative non-integer ends up larger; if so, subtract 1. |
| return integer + |
| VecFromMask(di, RebindMask(di, And(detail::UseInt(v), int_f > v))); |
| } |
| |
| #else |
| |
| // Toward nearest integer, ties to even |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> Round(const Vec128<float16_t, N> v) { |
| return Vec128<float16_t, N>{ |
| _mm_roundscale_ph(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float, N> Round(const Vec128<float, N> v) { |
| return Vec128<float, N>{ |
| _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Round(const Vec128<double, N> v) { |
| return Vec128<double, N>{ |
| _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; |
| } |
| |
| // Toward zero, aka truncate |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> Trunc(const Vec128<float16_t, N> v) { |
| return Vec128<float16_t, N>{ |
| _mm_roundscale_ph(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) { |
| return Vec128<float, N>{ |
| _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) { |
| return Vec128<double, N>{ |
| _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; |
| } |
| |
| // Toward +infinity, aka ceiling |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> Ceil(const Vec128<float16_t, N> v) { |
| return Vec128<float16_t, N>{ |
| _mm_roundscale_ph(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) { |
| return Vec128<float, N>{ |
| _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) { |
| return Vec128<double, N>{ |
| _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; |
| } |
| |
| // Toward -infinity, aka floor |
| #if HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> Floor(const Vec128<float16_t, N> v) { |
| return Vec128<float16_t, N>{ |
| _mm_roundscale_ph(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; |
| } |
| #endif // HWY_HAVE_FLOAT16 |
| template <size_t N> |
| HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) { |
| return Vec128<float, N>{ |
| _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) { |
| return Vec128<double, N>{ |
| _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; |
| } |
| |
| #endif // !HWY_SSSE3 |
| |
| // ------------------------------ Floating-point classification |
| |
| #define HWY_X86_FPCLASS_QNAN 0x01 |
| #define HWY_X86_FPCLASS_POS0 0x02 |
| #define HWY_X86_FPCLASS_NEG0 0x04 |
| #define HWY_X86_FPCLASS_POS_INF 0x08 |
| #define HWY_X86_FPCLASS_NEG_INF 0x10 |
| #define HWY_X86_FPCLASS_SUBNORMAL 0x20 |
| #define HWY_X86_FPCLASS_NEG 0x40 |
| #define HWY_X86_FPCLASS_SNAN 0x80 |
| |
| #if HWY_HAVE_FLOAT16 || HWY_IDE |
| |
| template <size_t N> |
| HWY_API Mask128<float16_t, N> IsNaN(const Vec128<float16_t, N> v) { |
| return Mask128<float16_t, N>{ |
| _mm_fpclass_ph_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; |
| } |
| |
| template <size_t N> |
| HWY_API Mask128<float16_t, N> IsEitherNaN(Vec128<float16_t, N> a, |
| Vec128<float16_t, N> b) { |
| // Work around warnings in the intrinsic definitions (passing -1 as a mask). |
| HWY_DIAGNOSTICS(push) |
| HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") |
| return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)}; |
| HWY_DIAGNOSTICS(pop) |
| } |
| |
| template <size_t N> |
| HWY_API Mask128<float16_t, N> IsInf(const Vec128<float16_t, N> v) { |
| return Mask128<float16_t, N>{_mm_fpclass_ph_mask( |
| v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; |
| } |
| |
| template <size_t N> |
| HWY_API Mask128<float16_t, N> IsFinite(const Vec128<float16_t, N> v) { |
  // fpclass does not have a "finite" flag, so instead check for inf/NaN and
  // negate the mask.
| return Not(Mask128<float16_t, N>{_mm_fpclass_ph_mask( |
| v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | |
| HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); |
| } |
| |
| #endif // HWY_HAVE_FLOAT16 |
| |
| template <size_t N> |
| HWY_API Mask128<float, N> IsNaN(const Vec128<float, N> v) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Mask128<float, N>{ |
| _mm_fpclass_ps_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; |
| #else |
| return Mask128<float, N>{_mm_cmpunord_ps(v.raw, v.raw)}; |
| #endif |
| } |
| template <size_t N> |
| HWY_API Mask128<double, N> IsNaN(const Vec128<double, N> v) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Mask128<double, N>{ |
| _mm_fpclass_pd_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; |
| #else |
| return Mask128<double, N>{_mm_cmpunord_pd(v.raw, v.raw)}; |
| #endif |
| } |
| |
| #ifdef HWY_NATIVE_IS_EITHER_NAN |
| #undef HWY_NATIVE_IS_EITHER_NAN |
| #else |
| #define HWY_NATIVE_IS_EITHER_NAN |
| #endif |
| |
| template <size_t N> |
| HWY_API Mask128<float, N> IsEitherNaN(Vec128<float, N> a, Vec128<float, N> b) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_UNORD_Q)}; |
| #else |
| return Mask128<float, N>{_mm_cmpunord_ps(a.raw, b.raw)}; |
| #endif |
| } |
| |
| template <size_t N> |
| HWY_API Mask128<double, N> IsEitherNaN(Vec128<double, N> a, |
| Vec128<double, N> b) { |
| #if HWY_TARGET <= HWY_AVX3 |
| return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_UNORD_Q)}; |
| #else |
| return Mask128<double, N>{_mm_cmpunord_pd(a.raw, b.raw)}; |
| #endif |
| } |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| |
| // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite. |
| #ifdef HWY_NATIVE_ISINF |
| #undef HWY_NATIVE_ISINF |
| #else |
| #define HWY_NATIVE_ISINF |
| #endif |
| |
| template <size_t N> |
| HWY_API Mask128<float, N> IsInf(const Vec128<float, N> v) { |
| return Mask128<float, N>{_mm_fpclass_ps_mask( |
| v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<double, N> IsInf(const Vec128<double, N> v) { |
| return Mask128<double, N>{_mm_fpclass_pd_mask( |
| v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; |
| } |
| |
| // Returns whether normal/subnormal/zero. |
| template <size_t N> |
| HWY_API Mask128<float, N> IsFinite(const Vec128<float, N> v) { |
  // fpclass does not have a "finite" flag, so instead check for inf/NaN and
  // negate the mask.
| return Not(Mask128<float, N>{_mm_fpclass_ps_mask( |
| v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | |
| HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); |
| } |
| template <size_t N> |
| HWY_API Mask128<double, N> IsFinite(const Vec128<double, N> v) { |
| return Not(Mask128<double, N>{_mm_fpclass_pd_mask( |
| v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | |
| HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); |
| } |
| |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| // ================================================== CRYPTO |
| |
| #if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4 |
| |
| // Per-target flag to prevent generic_ops-inl.h from defining AESRound. |
| #ifdef HWY_NATIVE_AES |
| #undef HWY_NATIVE_AES |
| #else |
| #define HWY_NATIVE_AES |
| #endif |
| |
| HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state, |
| Vec128<uint8_t> round_key) { |
| return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)}; |
| } |
| |
| HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state, |
| Vec128<uint8_t> round_key) { |
| return Vec128<uint8_t>{_mm_aesenclast_si128(state.raw, round_key.raw)}; |
| } |
| |
| HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) { |
| return Vec128<uint8_t>{_mm_aesimc_si128(state.raw)}; |
| } |
| |
| HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state, |
| Vec128<uint8_t> round_key) { |
| return Vec128<uint8_t>{_mm_aesdec_si128(state.raw, round_key.raw)}; |
| } |
| |
| HWY_API Vec128<uint8_t> AESLastRoundInv(Vec128<uint8_t> state, |
| Vec128<uint8_t> round_key) { |
| return Vec128<uint8_t>{_mm_aesdeclast_si128(state.raw, round_key.raw)}; |
| } |
| |
| template <uint8_t kRcon> |
| HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) { |
| return Vec128<uint8_t>{_mm_aeskeygenassist_si128(v.raw, kRcon)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a, |
| Vec128<uint64_t, N> b) { |
| return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a, |
| Vec128<uint64_t, N> b) { |
| return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)}; |
| } |
| |
| #endif // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4 |
| |
| // ================================================== MISC |
| |
| // ------------------------------ LoadMaskBits (TestBit) |
| |
| #if HWY_TARGET > HWY_AVX3 |
| namespace detail { |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 1)> |
| HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) { |
| const RebindToUnsigned<decltype(d)> du; |
  // Easier than Set(), which would require a lane type wider than 8 bits;
  // that would not compile for T=uint8_t, kN=1.
| const VFromD<D> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))}; |
| |
| #if HWY_TARGET == HWY_SSE2 |
| // {b0, b1, ...} ===> {b0, b0, b1, b1, ...} |
| __m128i unpacked_vbits = _mm_unpacklo_epi8(vbits.raw, vbits.raw); |
| // {b0, b0, b1, b1, ...} ==> {b0, b0, b0, b0, b1, b1, b1, b1, ...} |
| unpacked_vbits = _mm_unpacklo_epi16(unpacked_vbits, unpacked_vbits); |
| // {b0, b0, b0, b0, b1, b1, b1, b1, ...} ==> |
| // {b0, b0, b0, b0, b0, b0, b0, b0, b1, b1, b1, b1, b1, b1, b1, b1} |
| const VFromD<decltype(du)> rep8{ |
| _mm_unpacklo_epi32(unpacked_vbits, unpacked_vbits)}; |
| #else |
| // Replicate bytes 8x such that each byte contains the bit that governs it. |
| alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, |
| 1, 1, 1, 1, 1, 1, 1, 1}; |
| const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8)); |
| #endif |
| const VFromD<decltype(du)> bit = Dup128VecFromValues( |
| du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128); |
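  // Lanes 0-7 of rep8 hold the low mask byte and lanes 8-15 the high byte;
  // TestBit with the per-lane bit (1, 2, 4, .., 128) extracts each lane's bit.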
| return RebindMask(d, TestBit(rep8, bit)); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 2)> |
| HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) { |
| const RebindToUnsigned<decltype(d)> du; |
| alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; |
| const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits)); |
| return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 4)> |
| HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) { |
| const RebindToUnsigned<decltype(d)> du; |
| alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8}; |
| const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits)); |
| return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 8)> |
| HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) { |
| const RebindToUnsigned<decltype(d)> du; |
| alignas(16) static constexpr uint64_t kBit[8] = {1, 2}; |
| return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit))); |
| } |
| |
| } // namespace detail |
| #endif // HWY_TARGET > HWY_AVX3 |
| |
| // `p` points to at least 8 readable bytes, not all of which need be valid. |
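// For example, bits[0] = 0x05 selects lanes 0 and 2 (bit i of `bits` governs
// lane i, assuming the vector has at least three lanes).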
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { |
| constexpr size_t kN = MaxLanes(d); |
| #if HWY_TARGET <= HWY_AVX3 |
| (void)d; |
| uint64_t mask_bits = 0; |
| constexpr size_t kNumBytes = (kN + 7) / 8; |
| CopyBytes<kNumBytes>(bits, &mask_bits); |
| if (kN < 8) { |
| mask_bits &= (1ull << kN) - 1; |
| } |
| |
| return MFromD<D>::FromBits(mask_bits); |
| #else |
| uint64_t mask_bits = 0; |
| constexpr size_t kNumBytes = (kN + 7) / 8; |
| CopyBytes<kNumBytes>(bits, &mask_bits); |
| if (kN < 8) { |
| mask_bits &= (1ull << kN) - 1; |
| } |
| |
| return detail::LoadMaskBits128(d, mask_bits); |
| #endif |
| } |
| |
| // ------------------------------ Dup128MaskFromMaskBits |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) { |
| constexpr size_t kN = MaxLanes(d); |
| if (kN < 8) mask_bits &= (1u << kN) - 1; |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| return MFromD<D>::FromBits(mask_bits); |
| #else |
| return detail::LoadMaskBits128(d, mask_bits); |
| #endif |
| } |
| |
| template <typename T> |
| struct CompressIsPartition { |
| #if HWY_TARGET <= HWY_AVX3 |
| // AVX3 supports native compress, but a table-based approach allows |
| // 'partitioning' (also moving mask=false lanes to the top), which helps |
  // vqsort. This is only feasible for eight or fewer lanes, i.e.
  // sizeof(T) == 8 on AVX3. For simplicity, we only use tables for 64-bit
  // lanes (not AVX3 u32x8 etc.).
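  // For example, Compress({a, b}, mask={false, true}) returns {b, a}: the
  // rejected lane is moved to the end rather than left unspecified.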
| enum { value = (sizeof(T) == 8) }; |
| #else |
| // generic_ops-inl does not guarantee IsPartition for 8-bit. |
| enum { value = (sizeof(T) != 1) }; |
| #endif |
| }; |
| |
| namespace detail { |
| |
// Returns `mask_bits` (from movemask); for partial vectors, clears the bits
// above the highest valid lane.
| template <class D> |
| constexpr uint64_t OnlyActive(D d, uint64_t mask_bits) { |
| return (d.MaxBytes() >= 16) ? mask_bits |
| : mask_bits & ((1ull << d.MaxLanes()) - 1); |
| } |
| |
| } // namespace detail |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| |
| // ------------------------------ BitsFromMask (MFromD, OnlyActive) |
| // Generic for all vector lengths. |
| template <class D> |
| HWY_INLINE uint64_t BitsFromMask(D d, MFromD<D> mask) { |
| return detail::OnlyActive(d, mask.raw); |
| } |
| |
| // ------------------------------ StoreMaskBits |
| |
| // `p` points to at least 8 writable bytes. |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) { |
| constexpr size_t kN = MaxLanes(d); |
| constexpr size_t kNumBytes = (kN + 7) / 8; |
| CopyBytes<kNumBytes>(&mask.raw, bits); |
| |
| // Non-full byte, need to clear the undefined upper bits. |
| if (kN < 8) { |
| const int mask_bits = (1 << kN) - 1; |
| bits[0] = static_cast<uint8_t>(bits[0] & mask_bits); |
| } |
| |
| return kNumBytes; |
| } |
| |
| // ------------------------------ Mask testing |
| |
| // Beware: the suffix indicates the number of mask bits, not lane size! |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API size_t CountTrue(D d, MFromD<D> mask) { |
| constexpr size_t kN = MaxLanes(d); |
| const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1); |
| return PopCount(mask_bits); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) { |
| constexpr size_t kN = MaxLanes(d); |
| const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); |
| return Num0BitsBelowLS1Bit_Nonzero32(mask_bits); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) { |
| constexpr size_t kN = MaxLanes(d); |
| const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); |
| return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) { |
| constexpr size_t kN = MaxLanes(d); |
| const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); |
| return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) { |
| constexpr size_t kN = MaxLanes(d); |
| const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); |
| return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) |
| : -1; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API bool AllFalse(D d, MFromD<D> mask) { |
| constexpr size_t kN = MaxLanes(d); |
| const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1); |
| return mask_bits == 0; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API bool AllTrue(D d, MFromD<D> mask) { |
| constexpr size_t kN = MaxLanes(d); |
| const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1); |
  // Cannot use _kortestc because we may have fewer than 8 mask bits.
| return mask_bits == (1ull << kN) - 1; |
| } |
| |
| // ------------------------------ Compress |
| |
// 8/16-bit Compress and CompressStore are defined in x86_512 because they use
// Vec512.
| |
| // Single lane: no-op |
| template <typename T> |
| HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { |
| return v; |
| } |
| |
| template <size_t N, HWY_IF_V_SIZE_GT(float, N, 4)> |
| HWY_API Vec128<float, N> Compress(Vec128<float, N> v, Mask128<float, N> mask) { |
| return Vec128<float, N>{_mm_maskz_compress_ps(mask.raw, v.raw)}; |
| } |
| |
| template <typename T, HWY_IF_T_SIZE(T, 8)> |
| HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) { |
| HWY_DASSERT(mask.raw < 4); |
| |
| // There are only 2 lanes, so we can afford to load the index vector directly. |
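  // Rows correspond to mask.raw = 0..3: identity (nothing kept), keep lane 0,
  // move lane 1 to the front, keep both lanes in place.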
| alignas(16) static constexpr uint8_t u8_indices[64] = { |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
| 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; |
| |
| const DFromV<decltype(v)> d; |
| const Repartition<uint8_t, decltype(d)> d8; |
| const auto index = Load(d8, u8_indices + 16 * mask.raw); |
| return BitCast(d, TableLookupBytes(BitCast(d8, v), index)); |
| } |
| |
| // ------------------------------ CompressNot (Compress) |
| |
| // Single lane: no-op |
| template <typename T> |
| HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { |
| return v; |
| } |
| |
| template <typename T, HWY_IF_T_SIZE(T, 8)> |
| HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) { |
| // See CompressIsPartition, PrintCompressNot64x2NibbleTables |
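  // Each nibble is the source lane for one output lane: 0x10 is the identity
  // permutation; 0x01 (for mask.raw == 1) moves lane 1 to the front.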
| alignas(16) static constexpr uint64_t packed_array[16] = { |
| 0x00000010, 0x00000001, 0x00000010, 0x00000010}; |
| |
| // For lane i, shift the i-th 4-bit index down to bits [0, 2) - |
| // _mm_permutexvar_epi64 will ignore the upper bits. |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du64; |
| const auto packed = Set(du64, packed_array[mask.raw]); |
| alignas(16) static constexpr uint64_t shifts[2] = {0, 4}; |
| const auto indices = Indices128<T>{(packed >> Load(du64, shifts)).raw}; |
| return TableLookupLanes(v, indices); |
| } |
| |
| // ------------------------------ CompressBlocksNot |
| HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v, |
| Mask128<uint64_t> /* m */) { |
| return v; |
| } |
| |
| // ------------------------------ CompressStore (defined in x86_512) |
| |
| // ------------------------------ CompressBlendedStore (defined in x86_avx3) |
| |
| // ------------------------------ CompressBitsStore (defined in x86_512) |
| |
| #else // AVX2 or below |
| |
| // ------------------------------ BitsFromMask |
| |
| namespace detail { |
| |
| constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) { |
| return static_cast<uint64_t>(static_cast<unsigned>(mask_bits)); |
| } |
| |
| } // namespace detail |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) { |
| const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw; |
| return detail::OnlyActive(d, |
| detail::U64FromInt(_mm_movemask_epi8(sign_bits))); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) { |
  // Pack the 16-bit mask lanes (0 or -1) to bytes via signed saturation; this
  // preserves the sign bit, so movemask_epi8 yields one bit per original lane.
| const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128()); |
| return detail::OnlyActive(d, |
| detail::U64FromInt(_mm_movemask_epi8(sign_bits))); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) { |
| const RebindToFloat<decltype(d)> df; |
| const auto sign_bits = BitCast(df, VecFromMask(d, mask)); |
| return detail::OnlyActive(d, |
| detail::U64FromInt(_mm_movemask_ps(sign_bits.raw))); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) { |
| const RebindToFloat<D> df; |
| const auto sign_bits = BitCast(df, VecFromMask(d, mask)); |
| return detail::OnlyActive(d, |
| detail::U64FromInt(_mm_movemask_pd(sign_bits.raw))); |
| } |
| |
| // ------------------------------ StoreMaskBits |
| // `p` points to at least 8 writable bytes. |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) { |
| constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8; |
| const uint64_t mask_bits = BitsFromMask(d, mask); |
| CopyBytes<kNumBytes>(&mask_bits, bits); |
| return kNumBytes; |
| } |
| |
| // ------------------------------ Mask testing |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API bool AllFalse(D d, MFromD<D> mask) { |
| // Cheaper than PTEST, which is 2 uop / 3L. |
| return BitsFromMask(d, mask) == 0; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API bool AllTrue(D d, MFromD<D> mask) { |
| constexpr uint64_t kAllBits = (1ull << MaxLanes(d)) - 1; |
| return BitsFromMask(d, mask) == kAllBits; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API size_t CountTrue(D d, MFromD<D> mask) { |
| return PopCount(BitsFromMask(d, mask)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) { |
| return Num0BitsBelowLS1Bit_Nonzero32( |
| static_cast<uint32_t>(BitsFromMask(d, mask))); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) { |
| const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask)); |
| return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) { |
| return 31 - Num0BitsAboveMS1Bit_Nonzero32( |
| static_cast<uint32_t>(BitsFromMask(d, mask))); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) { |
| const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask)); |
| return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) |
| : -1; |
| } |
| |
| // ------------------------------ Compress, CompressBits |
| |
| namespace detail { |
| |
// Also works for N < 8 because, within the first 16 table rows, the first four
// lane indices only reference bytes 0-6.
| template <class D, HWY_IF_T_SIZE_D(D, 2)> |
| HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) { |
| HWY_DASSERT(mask_bits < 256); |
| const Rebind<uint8_t, decltype(d)> d8; |
| const Twice<decltype(d8)> d8t; |
| const RebindToUnsigned<decltype(d)> du; |
| |
| // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need |
| // byte indices for PSHUFB (one vector's worth for each of 256 combinations of |
| // 8 mask bits). Loading them directly would require 4 KiB. We can instead |
| // store lane indices and convert to byte indices (2*lane + 0..1), with the |
| // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane |
| // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. |
| // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles |
| // is likely more costly than the higher cache footprint from storing bytes. |
| alignas(16) static constexpr uint8_t table[2048] = { |
| // PrintCompress16x8Tables |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // |
| 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // |
| 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // |
| 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // |
| 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // |
| 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // |
| 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // |
| 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // |
| 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // |
| 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // |
| 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // |
| 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // |
| 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // |
| 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // |
| 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // |
| 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // |
| 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // |
| 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // |
| 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // |
| 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // |
| 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // |
| 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // |
| 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // |
| 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // |
| 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // |
| 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // |
| 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // |
| 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // |
| 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // |
| 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // |
| 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // |
| 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // |
| 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // |
| 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // |
| 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // |
| 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // |
| 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // |
| 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // |
| 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // |
| 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // |
| 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // |
| 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // |
| 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // |
| 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // |
| 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // |
| 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // |
| 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // |
| 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // |
| 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // |
| 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // |
| 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // |
| 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // |
| 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // |
| 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // |
| 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // |
| 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // |
| 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // |
| 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // |
| 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // |
| 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // |
| 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // |
| 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // |
| 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // |
| 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // |
| 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // |
| 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // |
| 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // |
| 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // |
| 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // |
| 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // |
| 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // |
| 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // |
| 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // |
| 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // |
| 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // |
| 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // |
| 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // |
| 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // |
| 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // |
| 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // |
| 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // |
| 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // |
| 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // |
| 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // |
| 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // |
| 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // |
| 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // |
| 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // |
| 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // |
| 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // |
| 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // |
| 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // |
| 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // |
| 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // |
| 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // |
| 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // |
| 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // |
| 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // |
| 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // |
| 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // |
| 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // |
| 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // |
| 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // |
| 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // |
| 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // |
| 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // |
| 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // |
| 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // |
| 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // |
| 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // |
| 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // |
| 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // |
| 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // |
| 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // |
| 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // |
| 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // |
| 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // |
| 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // |
| 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // |
| 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // |
| 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // |
| 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // |
| 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // |
| 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // |
| 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // |
| 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // |
| 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // |
| 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; |
| |
| const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw}; |
| const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx); |
| return BitCast(d, pairs + Set(du, 0x0100)); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 2)> |
| HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) { |
| HWY_DASSERT(mask_bits < 256); |
| const Rebind<uint8_t, decltype(d)> d8; |
| const Twice<decltype(d8)> d8t; |
| const RebindToUnsigned<decltype(d)> du; |
| |
| // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need |
| // byte indices for PSHUFB (one vector's worth for each of 256 combinations of |
| // 8 mask bits). Loading them directly would require 4 KiB. We can instead |
| // store lane indices and convert to byte indices (2*lane + 0..1), with the |
| // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane |
| // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. |
| // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles |
| // is likely more costly than the higher cache footprint from storing bytes. |
| alignas(16) static constexpr uint8_t table[2048] = { |
| // PrintCompressNot16x8Tables |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // |
| 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // |
| 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // |
| 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // |
| 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // |
| 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // |
| 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // |
| 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // |
| 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // |
| 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // |
| 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // |
| 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // |
| 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // |
| 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // |
| 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // |
| 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // |
| 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // |
| 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // |
| 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // |
| 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // |
| 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // |
| 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // |
| 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // |
| 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // |
| 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // |
| 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // |
| 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // |
| 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // |
| 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // |
| 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // |
| 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // |
| 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // |
| 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // |
| 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // |
| 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // |
| 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // |
| 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // |
| 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // |
| 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // |
| 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // |
| 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // |
| 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // |
| 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // |
| 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // |
| 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // |
| 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // |
| 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // |
| 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // |
| 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // |
| 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // |
| 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // |
| 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // |
| 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // |
| 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // |
| 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // |
| 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // |
| 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // |
| 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // |
| 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // |
| 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // |
| 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // |
| 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // |
| 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // |
| 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // |
| 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // |
| 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // |
| 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // |
| 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // |
| 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // |
| 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // |
| 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // |
| 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // |
| 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // |
| 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // |
| 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // |
| 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // |
| 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // |
| 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // |
| 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // |
| 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // |
| 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // |
| 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // |
| 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // |
| 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // |
| 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // |
| 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // |
| 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // |
| 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // |
| 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // |
| 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // |
| 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // |
| 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // |
| 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // |
| 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // |
| 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // |
| 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // |
| 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // |
| 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // |
| 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // |
| 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // |
| 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // |
| 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // |
| 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // |
| 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // |
| 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // |
| 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // |
| 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // |
| 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // |
| 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // |
| 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // |
| 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // |
| 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // |
| 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // |
| 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // |
| 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // |
| 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // |
| 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // |
| 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // |
| 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // |
| 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // |
| 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; |
| |
| const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw}; |
| const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx); |
| return BitCast(d, pairs + Set(du, 0x0100)); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 4)> |
| HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) { |
| HWY_DASSERT(mask_bits < 16); |
| |
| // There are only 4 lanes, so we can afford to load the index vector directly. |
| alignas(16) static constexpr uint8_t u8_indices[256] = { |
| // PrintCompress32x4Tables |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // |
| 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // |
| 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // |
| 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // |
| 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // |
| 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // |
| 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // |
| 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // |
| 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // |
| 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // |
| 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // |
| 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; |
| |
| const Repartition<uint8_t, decltype(d)> d8; |
| return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 4)> |
| HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) { |
| HWY_DASSERT(mask_bits < 16); |
| |
| // There are only 4 lanes, so we can afford to load the index vector directly. |
| alignas(16) static constexpr uint8_t u8_indices[256] = { |
| // PrintCompressNot32x4Tables |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, |
| 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, |
| 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, |
| 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, |
| 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, |
| 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, |
| 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, |
| 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
| 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, |
| 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, |
| 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, |
| 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, |
| 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, |
| 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, |
| 12, 13, 14, 15}; |
| |
| const Repartition<uint8_t, decltype(d)> d8; |
| return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 8)> |
| HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) { |
| HWY_DASSERT(mask_bits < 4); |
| |
| // There are only 2 lanes, so we can afford to load the index vector directly. |
| alignas(16) static constexpr uint8_t u8_indices[64] = { |
| // PrintCompress64x2Tables |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
| 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; |
| |
| const Repartition<uint8_t, decltype(d)> d8; |
| return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 8)> |
| HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) { |
| HWY_DASSERT(mask_bits < 4); |
| |
| // There are only 2 lanes, so we can afford to load the index vector directly. |
| alignas(16) static constexpr uint8_t u8_indices[64] = { |
| // PrintCompressNot64x2Tables |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
| 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; |
| |
| const Repartition<uint8_t, decltype(d)> d8; |
| return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); |
| } |
| |
| template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| |
| HWY_DASSERT(mask_bits < (1ull << N)); |
| const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); |
| return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); |
| } |
| |
| template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| |
| HWY_DASSERT(mask_bits < (1ull << N)); |
| const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits)); |
| return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); |
| } |
| |
| } // namespace detail |
| |
| // Single lane: no-op |
| template <typename T> |
| HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { |
| return v; |
| } |
| |
| // Two lanes: conditional swap |
| template <typename T, HWY_IF_T_SIZE(T, 8)> |
| HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) { |
| // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. |
| const DFromV<decltype(v)> d; |
| const Vec128<T> m = VecFromMask(d, mask); |
| const Vec128<T> maskL = DupEven(m); |
| const Vec128<T> maskH = DupOdd(m); |
| const Vec128<T> swap = AndNot(maskL, maskH); |
| return IfVecThenElse(swap, Shuffle01(v), v); |
| } |
| |
| // General case, 2 or 4 bytes |
| template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))> |
| HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) { |
| const DFromV<decltype(v)> d; |
| return detail::CompressBits(v, BitsFromMask(d, mask)); |
| } |
| |
| // ------------------------------ CompressNot |
| |
| // Single lane: no-op |
| template <typename T> |
| HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { |
| return v; |
| } |
| |
| // Two lanes: conditional swap |
| template <typename T, HWY_IF_T_SIZE(T, 8)> |
| HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) { |
| // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. |
| const DFromV<decltype(v)> d; |
| const Vec128<T> m = VecFromMask(d, mask); |
| const Vec128<T> maskL = DupEven(m); |
| const Vec128<T> maskH = DupOdd(m); |
| const Vec128<T> swap = AndNot(maskH, maskL); |
| return IfVecThenElse(swap, Shuffle01(v), v); |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))> |
| HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) { |
| const DFromV<decltype(v)> d; |
| // For partial vectors, we cannot pull the Not() into the table because |
| // BitsFromMask clears the upper bits. |
| if (N < 16 / sizeof(T)) { |
| return detail::CompressBits(v, BitsFromMask(d, Not(mask))); |
| } |
| return detail::CompressNotBits(v, BitsFromMask(d, mask)); |
| } |
| |
| // ------------------------------ CompressBlocksNot |
| HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v, |
| Mask128<uint64_t> /* m */) { |
| return v; |
| } |
| |
| template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, |
| const uint8_t* HWY_RESTRICT bits) { |
| uint64_t mask_bits = 0; |
| constexpr size_t kNumBytes = (N + 7) / 8; |
| CopyBytes<kNumBytes>(bits, &mask_bits); |
| if (N < 8) { |
| mask_bits &= (1ull << N) - 1; |
| } |
| |
| return detail::CompressBits(v, mask_bits); |
| } |
| |
| // ------------------------------ CompressStore, CompressBitsStore |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)> |
| HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d, |
| TFromD<D>* HWY_RESTRICT unaligned) { |
| const RebindToUnsigned<decltype(d)> du; |
| |
| const uint64_t mask_bits = BitsFromMask(d, m); |
| HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); |
| const size_t count = PopCount(mask_bits); |
| |
| // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). |
| const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); |
| const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); |
| StoreU(compressed, d, unaligned); |
| detail::MaybeUnpoison(unaligned, count); |
| return count; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)> |
| HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d, |
| TFromD<D>* HWY_RESTRICT unaligned) { |
| const RebindToUnsigned<decltype(d)> du; |
| |
| const uint64_t mask_bits = BitsFromMask(d, m); |
| HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); |
| const size_t count = PopCount(mask_bits); |
| |
| // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). |
| const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); |
| const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); |
| BlendedStore(compressed, FirstN(d, count), d, unaligned); |
| detail::MaybeUnpoison(unaligned, count); |
| return count; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)> |
| HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits, |
| D d, TFromD<D>* HWY_RESTRICT unaligned) { |
| const RebindToUnsigned<decltype(d)> du; |
| |
| uint64_t mask_bits = 0; |
| constexpr size_t kN = MaxLanes(d); |
| constexpr size_t kNumBytes = (kN + 7) / 8; |
| CopyBytes<kNumBytes>(bits, &mask_bits); |
| if (kN < 8) { |
| mask_bits &= (1ull << kN) - 1; |
| } |
| const size_t count = PopCount(mask_bits); |
| |
| // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). |
| const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); |
| const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); |
| StoreU(compressed, d, unaligned); |
| |
| detail::MaybeUnpoison(unaligned, count); |
| return count; |
| } |
| |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| // ------------------------------ Expand |
| |
| // Otherwise, use the generic_ops-inl.h fallback. |
| #if HWY_TARGET <= HWY_AVX3 || HWY_IDE |
| |
| // The native instructions for 8/16-bit actually require VBMI2 (HWY_AVX3_DL), |
| // but we still want to override generic_ops-inl's table-based implementation |
| // whenever we have the 32-bit expand provided by AVX3. |
| #ifdef HWY_NATIVE_EXPAND |
| #undef HWY_NATIVE_EXPAND |
| #else |
| #define HWY_NATIVE_EXPAND |
| #endif |
| |
| namespace detail { |
| |
| #if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2 |
| |
| template <size_t N> |
| HWY_INLINE Vec128<uint8_t, N> NativeExpand(Vec128<uint8_t, N> v, |
| Mask128<uint8_t, N> mask) { |
| return Vec128<uint8_t, N>{_mm_maskz_expand_epi8(mask.raw, v.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_INLINE Vec128<uint16_t, N> NativeExpand(Vec128<uint16_t, N> v, |
| Mask128<uint16_t, N> mask) { |
| return Vec128<uint16_t, N>{_mm_maskz_expand_epi16(mask.raw, v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)> |
| HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */, |
| const uint8_t* HWY_RESTRICT unaligned) { |
| return VFromD<D>{_mm_maskz_expandloadu_epi8(mask.raw, unaligned)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)> |
| HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */, |
| const uint16_t* HWY_RESTRICT unaligned) { |
| return VFromD<D>{_mm_maskz_expandloadu_epi16(mask.raw, unaligned)}; |
| } |
| |
| #endif // HWY_TARGET <= HWY_AVX3_DL |
| |
| template <size_t N> |
| HWY_INLINE Vec128<uint32_t, N> NativeExpand(Vec128<uint32_t, N> v, |
| Mask128<uint32_t, N> mask) { |
| return Vec128<uint32_t, N>{_mm_maskz_expand_epi32(mask.raw, v.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_INLINE Vec128<uint64_t, N> NativeExpand(Vec128<uint64_t, N> v, |
| Mask128<uint64_t, N> mask) { |
| return Vec128<uint64_t, N>{_mm_maskz_expand_epi64(mask.raw, v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)> |
| HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */, |
| const uint32_t* HWY_RESTRICT unaligned) { |
| return VFromD<D>{_mm_maskz_expandloadu_epi32(mask.raw, unaligned)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> |
| HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */, |
| const uint64_t* HWY_RESTRICT unaligned) { |
| return VFromD<D>{_mm_maskz_expandloadu_epi64(mask.raw, unaligned)}; |
| } |
| |
| } // namespace detail |
| |
| // Otherwise, 8/16-bit are implemented in x86_512 using PromoteTo. |
| #if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2 |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))> |
| HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const MFromD<decltype(du)> mu = RebindMask(du, mask); |
| return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); |
| } |
| |
| #endif // HWY_TARGET <= HWY_AVX3_DL |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))> |
| HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const MFromD<decltype(du)> mu = RebindMask(du, mask); |
| return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); |
| } |
| |
| // ------------------------------ LoadExpand |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), |
| HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> |
| HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d, |
| const TFromD<D>* HWY_RESTRICT unaligned) { |
| #if HWY_TARGET <= HWY_AVX3_DL // VBMI2 |
| const RebindToUnsigned<decltype(d)> du; |
| using TU = TFromD<decltype(du)>; |
| const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned); |
| const MFromD<decltype(du)> mu = RebindMask(du, mask); |
| return BitCast(d, detail::NativeLoadExpand(mu, du, pu)); |
| #else |
| return Expand(LoadU(d, unaligned), mask); |
| #endif |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), |
| HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> |
| HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d, |
| const TFromD<D>* HWY_RESTRICT unaligned) { |
| #if HWY_TARGET <= HWY_AVX3 |
| const RebindToUnsigned<decltype(d)> du; |
| using TU = TFromD<decltype(du)>; |
| const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned); |
| const MFromD<decltype(du)> mu = RebindMask(du, mask); |
| return BitCast(d, detail::NativeLoadExpand(mu, du, pu)); |
| #else |
| return Expand(LoadU(d, unaligned), mask); |
| #endif |
| } |
| |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| // ------------------------------ StoreInterleaved2/3/4 |
| |
| // HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in |
| // generic_ops-inl.h. |
| |
| // ------------------------------ Additional mask logical operations |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| namespace detail { |
| |
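| // AVX3Blsi returns the isolated lowest set bit of x (BLSI), e.g. |
| // 0b0110 -> 0b0010. The Clang-CL fallback uses the equivalent expression |
| // x & (0 - x). |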
| template <class T, HWY_IF_LANES_LE(sizeof(T), 4)> |
| static HWY_INLINE uint32_t AVX3Blsi(T x) { |
| using TU = MakeUnsigned<T>; |
| const auto u32_val = static_cast<uint32_t>(static_cast<TU>(x)); |
| #if HWY_COMPILER_CLANGCL |
| return static_cast<uint32_t>(u32_val & (0u - u32_val)); |
| #else |
| return static_cast<uint32_t>(_blsi_u32(u32_val)); |
| #endif |
| } |
| template <class T, HWY_IF_T_SIZE(T, 8)> |
| static HWY_INLINE uint64_t AVX3Blsi(T x) { |
| const auto u64_val = static_cast<uint64_t>(x); |
| #if HWY_COMPILER_CLANGCL || HWY_ARCH_X86_32 |
| return static_cast<uint64_t>(u64_val & (0ULL - u64_val)); |
| #else |
| return static_cast<uint64_t>(_blsi_u64(u64_val)); |
| #endif |
| } |
| |
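| // AVX3Blsmsk returns a mask of every bit up to and including the lowest set |
| // bit of x (BLSMSK), e.g. 0b0110 -> 0b0011. The fallback uses the equivalent |
| // x ^ (x - 1). |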
| template <class T, HWY_IF_LANES_LE(sizeof(T), 4)> |
| static HWY_INLINE uint32_t AVX3Blsmsk(T x) { |
| using TU = MakeUnsigned<T>; |
| const auto u32_val = static_cast<uint32_t>(static_cast<TU>(x)); |
| #if HWY_COMPILER_CLANGCL |
| return static_cast<uint32_t>(u32_val ^ (u32_val - 1u)); |
| #else |
| return static_cast<uint32_t>(_blsmsk_u32(u32_val)); |
| #endif |
| } |
| template <class T, HWY_IF_T_SIZE(T, 8)> |
| static HWY_INLINE uint64_t AVX3Blsmsk(T x) { |
| const auto u64_val = static_cast<uint64_t>(x); |
| #if HWY_COMPILER_CLANGCL || HWY_ARCH_X86_32 |
| return static_cast<uint64_t>(u64_val ^ (u64_val - 1ULL)); |
| #else |
| return static_cast<uint64_t>(_blsmsk_u64(u64_val)); |
| #endif |
| } |
| |
| } // namespace detail |
| |
| template <class T, size_t N> |
| HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) { |
| constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1; |
| return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>( |
| (0u - detail::AVX3Blsi(mask.raw)) & kActiveElemMask)}; |
| } |
| template <class T, size_t N> |
| HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) { |
| constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1; |
| return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>( |
| (detail::AVX3Blsi(mask.raw) - 1u) & kActiveElemMask)}; |
| } |
| template <class T, size_t N> |
| HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) { |
| constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1; |
| return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>( |
| detail::AVX3Blsmsk(mask.raw) & kActiveElemMask)}; |
| } |
| template <class T, size_t N> |
| HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) { |
| return Mask128<T, N>{ |
| static_cast<typename Mask128<T, N>::Raw>(detail::AVX3Blsi(mask.raw))}; |
| } |
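| |
| // Illustrative example of the four ops above (comment only), assuming N = 4 |
| // and mask bits 0b0110, i.e. lanes 1 and 2 active: |
| //   SetOnlyFirst       -> 0b0010  (BLSI) |
| //   SetAtOrAfterFirst  -> 0b1110  ((0 - BLSI) & kActiveElemMask) |
| //   SetBeforeFirst     -> 0b0001  ((BLSI - 1) & kActiveElemMask) |
| //   SetAtOrBeforeFirst -> 0b0011  (BLSMSK & kActiveElemMask) |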
| #else // AVX2 or below |
| template <class T> |
| HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) { |
| return mask; |
| } |
| template <class T> |
| HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) { |
| const FixedTag<T, 2> d; |
| const auto vmask = VecFromMask(d, mask); |
| return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask))); |
| } |
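| // For vectors of at most 8 bytes, reinterpret the lane mask as one int64; |
| // ORing it with its negation sets every bit at or above the lowest set bit, |
| // i.e. all lanes at or after the first active lane. |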
| template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)> |
| HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) { |
| const Simd<T, N, 0> d; |
| const auto vmask = VecFromMask(d, mask); |
| const auto neg_vmask = |
| ResizeBitCast(d, Neg(ResizeBitCast(Full64<int64_t>(), vmask))); |
| return MaskFromVec(Or(vmask, neg_vmask)); |
| } |
| template <class T, HWY_IF_NOT_T_SIZE(T, 8)> |
| HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) { |
| const Full128<T> d; |
| const Repartition<int64_t, decltype(d)> di64; |
| const Repartition<float, decltype(d)> df32; |
| const Repartition<int32_t, decltype(d)> di32; |
| using VF = VFromD<decltype(df32)>; |
| |
| auto vmask = BitCast(di64, VecFromMask(d, mask)); |
| vmask = Or(vmask, Neg(vmask)); |
| |
| // Copy the sign bit of the first int64_t lane to the second int64_t lane |
| const auto vmask2 = BroadcastSignBit( |
| BitCast(di32, VF{_mm_shuffle_ps(Zero(df32).raw, BitCast(df32, vmask).raw, |
| _MM_SHUFFLE(1, 1, 0, 0))})); |
| return MaskFromVec(BitCast(d, Or(vmask, BitCast(di64, vmask2)))); |
| } |
| |
| template <class T, size_t N> |
| HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) { |
| return Not(SetAtOrAfterFirst(mask)); |
| } |
| |
| template <class T> |
| HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) { |
| return mask; |
| } |
| template <class T> |
| HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) { |
| const FixedTag<T, 2> d; |
| const RebindToSigned<decltype(d)> di; |
| |
| const auto vmask = BitCast(di, VecFromMask(d, mask)); |
| const auto zero = Zero(di); |
| const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero); |
| return MaskFromVec(BitCast(d, And(vmask, vmask2))); |
| } |
| template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)> |
| HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) { |
| const Simd<T, N, 0> d; |
| const RebindToSigned<decltype(d)> di; |
| |
| const auto vmask = ResizeBitCast(Full64<int64_t>(), VecFromMask(d, mask)); |
| const auto only_first_vmask = |
| BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask))))); |
| return MaskFromVec(only_first_vmask); |
| } |
| template <class T, HWY_IF_NOT_T_SIZE(T, 8)> |
| HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) { |
| const Full128<T> d; |
| const RebindToSigned<decltype(d)> di; |
| const Repartition<int64_t, decltype(d)> di64; |
| |
| const auto zero = Zero(di64); |
| const auto vmask = BitCast(di64, VecFromMask(d, mask)); |
| const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero); |
| const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask)))); |
| return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2)))); |
| } |
| |
| template <class T> |
| HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) { |
| const FixedTag<T, 1> d; |
| const RebindToSigned<decltype(d)> di; |
| using TI = MakeSigned<T>; |
| |
| return RebindMask(d, MaskFromVec(Set(di, TI(-1)))); |
| } |
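| // Shifting the mask up by one lane moves the first active lane from index i |
| // to i + 1, so SetBeforeFirst of the shifted mask covers lanes 0..i, i.e. |
| // all lanes at or before the first active lane of the original mask. |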
| template <class T, size_t N, HWY_IF_LANES_GT(N, 1)> |
| HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) { |
| const Simd<T, N, 0> d; |
| return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask)))); |
| } |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| // ------------------------------ Reductions |
| |
| // Nothing fully native; generic_ops-inl.h defines SumOfLanes and ReduceSum. |
| |
| // We provide specializations of u8x8 and u8x16, so exclude those. |
| #undef HWY_IF_SUM_OF_LANES_D |
| #define HWY_IF_SUM_OF_LANES_D(D) \ |
| HWY_IF_LANES_GT_D(D, 1), \ |
| hwy::EnableIf<!hwy::IsSame<TFromD<D>, uint8_t>() || \ |
| (HWY_V_SIZE_D(D) != 8 && HWY_V_SIZE_D(D) != 16)>* = \ |
| nullptr |
| |
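| // For u8, SumsOf8 (on x86 typically psadbw against zero) adds each group of |
| // 8 consecutive bytes into a u64 lane; the byte result below is the |
| // truncated total of those u64 sums. |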
| template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 8)> |
| HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) { |
| return Set(d, static_cast<uint8_t>(GetLane(SumsOf8(v)) & 0xFF)); |
| } |
| template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 16)> |
| HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) { |
| const Repartition<uint64_t, decltype(d)> d64; |
| VFromD<decltype(d64)> sums = SumsOf8(v); |
| sums = SumOfLanes(d64, sums); |
| return Broadcast<0>(BitCast(d, sums)); |
| } |
| |
| #if HWY_TARGET <= HWY_SSE4 |
| // We provide specializations of u8x8, u8x16, and u16x8, so exclude those. |
| #undef HWY_IF_MINMAX_OF_LANES_D |
| #define HWY_IF_MINMAX_OF_LANES_D(D) \ |
| HWY_IF_LANES_GT_D(D, 1), \ |
| hwy::EnableIf<(!hwy::IsSame<TFromD<D>, uint8_t>() || \ |
| ((HWY_V_SIZE_D(D) < 8) || (HWY_V_SIZE_D(D) > 16))) && \ |
| (!hwy::IsSame<TFromD<D>, uint16_t>() || \ |
| (HWY_V_SIZE_D(D) != 16))>* = nullptr |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)> |
| HWY_API Vec128<uint16_t> MinOfLanes(D /* tag */, Vec128<uint16_t> v) { |
| return Broadcast<0>(Vec128<uint16_t>{_mm_minpos_epu16(v.raw)}); |
| } |
| |
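| // For unsigned lanes with maximum value M, max(v) == M - min(M - v); the |
| // MaxOfLanes overloads below reuse MinOfLanes (and thus _mm_minpos_epu16). |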
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)> |
| HWY_API Vec128<uint16_t> MaxOfLanes(D d, Vec128<uint16_t> v) { |
| const Vec128<uint16_t> max = Set(d, LimitsMax<uint16_t>()); |
| return max - MinOfLanes(d, max - v); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)> |
| HWY_API Vec64<uint8_t> MinOfLanes(D d, Vec64<uint8_t> v) { |
| const Rebind<uint16_t, decltype(d)> d16; |
| return TruncateTo(d, MinOfLanes(d16, PromoteTo(d16, v))); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)> |
| HWY_API Vec128<uint8_t> MinOfLanes(D d, Vec128<uint8_t> v) { |
| const Half<decltype(d)> dh; |
| Vec64<uint8_t> result = |
| Min(MinOfLanes(dh, UpperHalf(dh, v)), MinOfLanes(dh, LowerHalf(dh, v))); |
| return Combine(d, result, result); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)> |
| HWY_API Vec64<uint8_t> MaxOfLanes(D d, Vec64<uint8_t> v) { |
| const Vec64<uint8_t> m(Set(d, LimitsMax<uint8_t>())); |
| return m - MinOfLanes(d, m - v); |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)> |
| HWY_API Vec128<uint8_t> MaxOfLanes(D d, Vec128<uint8_t> v) { |
| const Vec128<uint8_t> m(Set(d, LimitsMax<uint8_t>())); |
| return m - MinOfLanes(d, m - v); |
| } |
| |
| #endif // HWY_TARGET <= HWY_SSE4 |
| |
| // ------------------------------ BitShuffle |
| #if HWY_TARGET <= HWY_AVX3_DL |
| |
| #ifdef HWY_NATIVE_BITSHUFFLE |
| #undef HWY_NATIVE_BITSHUFFLE |
| #else |
| #define HWY_NATIVE_BITSHUFFLE |
| #endif |
| |
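| // _mm_bitshuffle_epi64_mask gathers, for each u64 lane, the eight bits |
| // selected by the corresponding idx bytes (bit indices 0..63) into one mask |
| // byte; the mask bytes are then zero-extended back to u64 lanes. |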
| template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>), |
| HWY_IF_V_SIZE_LE_V(V, 16), |
| HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)> |
| HWY_API V BitShuffle(V v, VI idx) { |
| const DFromV<decltype(v)> d64; |
| const RebindToUnsigned<decltype(d64)> du64; |
| const Rebind<uint8_t, decltype(d64)> du8; |
| |
| int32_t i32_bit_shuf_result = static_cast<int32_t>( |
| static_cast<uint16_t>(_mm_bitshuffle_epi64_mask(v.raw, idx.raw))); |
| |
| return BitCast(d64, PromoteTo(du64, VFromD<decltype(du8)>{_mm_cvtsi32_si128( |
| i32_bit_shuf_result)})); |
| } |
| #endif // HWY_TARGET <= HWY_AVX3_DL |
| |
| // ------------------------------ MultiRotateRight |
| |
| #if HWY_TARGET <= HWY_AVX3_DL |
| |
| #ifdef HWY_NATIVE_MULTIROTATERIGHT |
| #undef HWY_NATIVE_MULTIROTATERIGHT |
| #else |
| #define HWY_NATIVE_MULTIROTATERIGHT |
| #endif |
| |
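| // VPMULTISHIFTQB: byte i of each u64 result lane is the 8-bit field of the |
| // corresponding v lane starting at the bit offset given by byte i of idx |
| // (wrapping modulo 64), i.e. the lane rotated right by that amount and |
| // truncated to 8 bits. |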
| template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>), |
| HWY_IF_V_SIZE_LE_V(V, 16), |
| HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)> |
| HWY_API V MultiRotateRight(V v, VI idx) { |
| return V{_mm_multishift_epi64_epi8(idx.raw, v.raw)}; |
| } |
| |
| #endif // HWY_TARGET <= HWY_AVX3_DL |
| |
| // ------------------------------ Lt128 |
| |
| namespace detail { |
| |
| // Returns vector-mask for Lt128. Generic for all vector lengths. |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_INLINE VFromD<D> Lt128Vec(const D d, VFromD<D> a, VFromD<D> b) { |
| // Truth table of Eq and Lt for Hi and Lo u64. |
| // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) |
| // =H =L cH cL | out = cH | (=H & cL) |
| // 0 0 0 0 | 0 |
| // 0 0 0 1 | 0 |
| // 0 0 1 0 | 1 |
| // 0 0 1 1 | 1 |
| // 0 1 0 0 | 0 |
| // 0 1 0 1 | 0 |
| // 0 1 1 0 | 1 |
| // 1 0 0 0 | 0 |
| // 1 0 0 1 | 1 |
| // 1 1 0 0 | 0 |
| const auto eqHL = Eq(a, b); |
| const VFromD<D> ltHL = VecFromMask(d, Lt(a, b)); |
| const VFromD<D> ltLX = ShiftLeftLanes<1>(ltHL); |
| const VFromD<D> vecHx = IfThenElse(eqHL, ltLX, ltHL); |
| return InterleaveUpper(d, vecHx, vecHx); |
| } |
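| |
| // Worked example (comment only) for one u64x2 block with lane 0 = low half: |
| // a = {aL, aH}, b = {bL, bH}. ltHL = {aL<bL, aH<bH}, ltLX = {0, aL<bL}, and |
| // vecHx lane 1 = (aH==bH) ? (aL<bL) : (aH<bH); InterleaveUpper then |
| // broadcasts that lane to both lanes of the block. |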
| |
| // Returns vector-mask for Eq128. Generic for all vector lengths. |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_INLINE VFromD<D> Eq128Vec(D d, VFromD<D> a, VFromD<D> b) { |
| const auto eqHL = VecFromMask(d, Eq(a, b)); |
| const auto eqLH = Reverse2(d, eqHL); |
| return And(eqHL, eqLH); |
| } |
| |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_INLINE VFromD<D> Ne128Vec(D d, VFromD<D> a, VFromD<D> b) { |
| const auto neHL = VecFromMask(d, Ne(a, b)); |
| const auto neLH = Reverse2(d, neHL); |
| return Or(neHL, neLH); |
| } |
| |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_INLINE VFromD<D> Lt128UpperVec(D d, VFromD<D> a, VFromD<D> b) { |
| // No specialization required for AVX-512: Mask <-> Vec is fast, and |
| // copying mask bits to their neighbor seems infeasible. |
| const VFromD<D> ltHL = VecFromMask(d, Lt(a, b)); |
| return InterleaveUpper(d, ltHL, ltHL); |
| } |
| |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_INLINE VFromD<D> Eq128UpperVec(D d, VFromD<D> a, VFromD<D> b) { |
| // No specialization required for AVX-512: Mask <-> Vec is fast, and |
| // copying mask bits to their neighbor seems infeasible. |
| const VFromD<D> eqHL = VecFromMask(d, Eq(a, b)); |
| return InterleaveUpper(d, eqHL, eqHL); |
| } |
| |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_INLINE VFromD<D> Ne128UpperVec(D d, VFromD<D> a, VFromD<D> b) { |
| // No specialization required for AVX-512: Mask <-> Vec is fast, and |
| // copying mask bits to their neighbor seems infeasible. |
| const VFromD<D> neHL = VecFromMask(d, Ne(a, b)); |
| return InterleaveUpper(d, neHL, neHL); |
| } |
| |
| } // namespace detail |
| |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_API MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) { |
| return MaskFromVec(detail::Lt128Vec(d, a, b)); |
| } |
| |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_API MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) { |
| return MaskFromVec(detail::Eq128Vec(d, a, b)); |
| } |
| |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_API MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) { |
| return MaskFromVec(detail::Ne128Vec(d, a, b)); |
| } |
| |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_API MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) { |
| return MaskFromVec(detail::Lt128UpperVec(d, a, b)); |
| } |
| |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_API MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) { |
| return MaskFromVec(detail::Eq128UpperVec(d, a, b)); |
| } |
| |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_API MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) { |
| return MaskFromVec(detail::Ne128UpperVec(d, a, b)); |
| } |
| |
| // ------------------------------ Min128, Max128 (Lt128) |
| |
| // Avoids the extra MaskFromVec in Lt128. |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_API VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) { |
| return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b); |
| } |
| |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_API VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) { |
| return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b); |
| } |
| |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_API VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) { |
| return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b); |
| } |
| |
| template <class D, HWY_IF_U64_D(D)> |
| HWY_API VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) { |
| return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b); |
| } |
| |
| // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex |
| |
| #if HWY_TARGET <= HWY_AVX3 |
| |
| #ifdef HWY_NATIVE_LEADING_ZERO_COUNT |
| #undef HWY_NATIVE_LEADING_ZERO_COUNT |
| #else |
| #define HWY_NATIVE_LEADING_ZERO_COUNT |
| #endif |
| |
| template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)> |
| HWY_API V LeadingZeroCount(V v) { |
| return V{_mm_lzcnt_epi32(v.raw)}; |
| } |
| |
| template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)> |
| HWY_API V LeadingZeroCount(V v) { |
| return V{_mm_lzcnt_epi64(v.raw)}; |
| } |
| |
| // HighestSetBitIndex and TrailingZeroCount are implemented in x86_512-inl.h |
| // for AVX3 targets. |
| |
| #endif // HWY_TARGET <= HWY_AVX3 |
| |
| // NOLINTNEXTLINE(google-readability-namespace-comments) |
| } // namespace HWY_NAMESPACE |
| } // namespace hwy |
| HWY_AFTER_NAMESPACE(); |
| |
| #undef HWY_X86_IF_EMULATED_D |
| |
| // Note that the GCC warnings are not suppressed if we only wrap the *intrin.h |
| // includes - the warnings seem to be issued at the call sites of the |
| // intrinsics, i.e. in our code. |
| HWY_DIAGNOSTICS(pop) |