| // Copyright 2019 Google LLC |
| // SPDX-License-Identifier: Apache-2.0 |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // External include guard in highway.h - see comment there. |
| |
| #if HWY_TARGET == HWY_AVX10_2 |
| // For AVX10 targets that only support 256-bit or smaller vectors. Already |
| // includes base.h and shared-inl.h. |
| #include "third_party/highway/hwy/ops/x86_256-inl.h" |
| #else |
// For AVX3/AVX10 targets that support 512-bit vectors. Already includes base.h
| // and shared-inl.h. |
| #include "third_party/highway/hwy/ops/x86_512-inl.h" |
| #endif |
| |
// The AVX3/AVX10 ops defined below depend on ops from x86_512-inl.h when
// HWY_MAX_BYTES >= 64.
| |
// Avoid uninitialized warnings in GCC's avx512fintrin.h - see
// https://github.com/google/highway/issues/710
| HWY_DIAGNOSTICS(push) |
| #if HWY_COMPILER_GCC_ACTUAL |
| HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") |
| HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494, |
| ignored "-Wmaybe-uninitialized") |
| #endif |
| |
| HWY_BEFORE_NAMESPACE(); |
| namespace hwy { |
| namespace HWY_NAMESPACE { |
| |
| #if HWY_TARGET <= HWY_AVX3_DL |
| |
| // ------------------------------ ShiftLeft |
| |
| // Generic for all vector lengths. Must be defined after all GaloisAffine. |
| template <int kBits, class V, HWY_IF_T_SIZE_V(V, 1)> |
| HWY_API V ShiftLeft(const V v) { |
| const Repartition<uint64_t, DFromV<V>> du64; |
| if (kBits == 0) return v; |
| if (kBits == 1) return v + v; |
| constexpr uint64_t kMatrix = (0x0102040810204080ULL >> kBits) & |
| (0x0101010101010101ULL * (0xFF >> kBits)); |
| return detail::GaloisAffine(v, Set(du64, kMatrix)); |
| } |
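
// Usage sketch (illustrative only, not part of this header; assumes the usual
// Highway setup within HWY_NAMESPACE):
//   const ScalableTag<uint8_t> d;
//   const auto v = Set(d, 0x03);     // each byte = 0000'0011
//   const auto r = ShiftLeft<2>(v);  // each byte = 0000'1100 = 0x0C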
| |
| // ------------------------------ ShiftRight |
| |
| // Generic for all vector lengths. Must be defined after all GaloisAffine. |
| template <int kBits, class V, HWY_IF_U8_D(DFromV<V>)> |
| HWY_API V ShiftRight(const V v) { |
| const Repartition<uint64_t, DFromV<V>> du64; |
| if (kBits == 0) return v; |
| constexpr uint64_t kMatrix = |
| (0x0102040810204080ULL << kBits) & |
| (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF)); |
| return detail::GaloisAffine(v, Set(du64, kMatrix)); |
| } |
| |
| // Generic for all vector lengths. Must be defined after all GaloisAffine. |
| template <int kBits, class V, HWY_IF_I8_D(DFromV<V>)> |
| HWY_API V ShiftRight(const V v) { |
| const Repartition<uint64_t, DFromV<V>> du64; |
| if (kBits == 0) return v; |
| constexpr uint64_t kShift = |
| (0x0102040810204080ULL << kBits) & |
| (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF)); |
| constexpr uint64_t kSign = |
| kBits == 0 ? 0 : (0x8080808080808080ULL >> (64 - (8 * kBits))); |
| return detail::GaloisAffine(v, Set(du64, kShift | kSign)); |
| } |
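
// Usage sketch (illustrative only): the kSign term above replicates the sign
// bit into the vacated upper bits, i.e. an arithmetic shift.
//   const ScalableTag<int8_t> d;
//   const auto v = Set(d, int8_t{-128});  // each byte = 1000'0000
//   const auto r = ShiftRight<2>(v);      // each byte = 1110'0000 = -32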
| |
| // ------------------------------ RotateRight |
| |
| // U8 RotateRight is generic for all vector lengths on AVX3_DL |
| template <int kBits, class V, HWY_IF_U8(TFromV<V>)> |
| HWY_API V RotateRight(V v) { |
  static_assert(0 <= kBits && kBits < 8, "Invalid rotate count");
| |
| const Repartition<uint64_t, DFromV<V>> du64; |
| if (kBits == 0) return v; |
| |
| constexpr uint64_t kShrMatrix = |
| (0x0102040810204080ULL << kBits) & |
| (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF)); |
| constexpr int kShlBits = (-kBits) & 7; |
| constexpr uint64_t kShlMatrix = (0x0102040810204080ULL >> kShlBits) & |
| (0x0101010101010101ULL * (0xFF >> kShlBits)); |
| constexpr uint64_t kMatrix = kShrMatrix | kShlMatrix; |
| |
| return detail::GaloisAffine(v, Set(du64, kMatrix)); |
| } |
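
// Usage sketch (illustrative only):
//   const ScalableTag<uint8_t> d;
//   const auto v = Set(d, 0x81);       // each byte = 1000'0001
//   const auto r = RotateRight<3>(v);  // each byte = 0011'0000 = 0x30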
| |
| #endif // HWY_TARGET <= HWY_AVX3_DL |
| |
| // ------------------------------ Compress |
| |
| #pragma push_macro("HWY_X86_SLOW_COMPRESS_STORE") |
| |
| #ifndef HWY_X86_SLOW_COMPRESS_STORE // allow override |
| // Slow on Zen4 and SPR, faster if we emulate via Compress(). |
| #if HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR |
| #define HWY_X86_SLOW_COMPRESS_STORE 1 |
| #else |
| #define HWY_X86_SLOW_COMPRESS_STORE 0 |
| #endif |
| #endif // HWY_X86_SLOW_COMPRESS_STORE |
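
// The #ifndef above allows overriding this heuristic at build time, e.g. to
// force native compress-store even on Zen4/SPR (illustrative; the exact flag
// syntax depends on the build system):
//   -DHWY_X86_SLOW_COMPRESS_STORE=0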
| |
| // Always implement 8-bit here even if we lack VBMI2 because we can do better |
| // than generic_ops (8 at a time) via the native 32-bit compress (16 at a time). |
| #ifdef HWY_NATIVE_COMPRESS8 |
| #undef HWY_NATIVE_COMPRESS8 |
| #else |
| #define HWY_NATIVE_COMPRESS8 |
| #endif |
| |
| namespace detail { |
| |
| #if HWY_TARGET <= HWY_AVX3_DL // VBMI2 |
| template <size_t N> |
| HWY_INLINE Vec128<uint8_t, N> NativeCompress(const Vec128<uint8_t, N> v, |
| const Mask128<uint8_t, N> mask) { |
| return Vec128<uint8_t, N>{_mm_maskz_compress_epi8(mask.raw, v.raw)}; |
| } |
| HWY_INLINE Vec256<uint8_t> NativeCompress(const Vec256<uint8_t> v, |
| const Mask256<uint8_t> mask) { |
| return Vec256<uint8_t>{_mm256_maskz_compress_epi8(mask.raw, v.raw)}; |
| } |
| #if HWY_MAX_BYTES >= 64 |
| HWY_INLINE Vec512<uint8_t> NativeCompress(const Vec512<uint8_t> v, |
| const Mask512<uint8_t> mask) { |
| return Vec512<uint8_t>{_mm512_maskz_compress_epi8(mask.raw, v.raw)}; |
| } |
| #endif |
| |
| template <size_t N> |
| HWY_INLINE Vec128<uint16_t, N> NativeCompress(const Vec128<uint16_t, N> v, |
| const Mask128<uint16_t, N> mask) { |
| return Vec128<uint16_t, N>{_mm_maskz_compress_epi16(mask.raw, v.raw)}; |
| } |
| HWY_INLINE Vec256<uint16_t> NativeCompress(const Vec256<uint16_t> v, |
| const Mask256<uint16_t> mask) { |
| return Vec256<uint16_t>{_mm256_maskz_compress_epi16(mask.raw, v.raw)}; |
| } |
| #if HWY_MAX_BYTES >= 64 |
| HWY_INLINE Vec512<uint16_t> NativeCompress(const Vec512<uint16_t> v, |
| const Mask512<uint16_t> mask) { |
| return Vec512<uint16_t>{_mm512_maskz_compress_epi16(mask.raw, v.raw)}; |
| } |
| #endif |
| |
// When compress-store is slow, do not even define these, to prevent
// accidental usage.
| #if !HWY_X86_SLOW_COMPRESS_STORE |
| |
| template <size_t N> |
| HWY_INLINE void NativeCompressStore(Vec128<uint8_t, N> v, |
| Mask128<uint8_t, N> mask, |
| uint8_t* HWY_RESTRICT unaligned) { |
| _mm_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw); |
| } |
| HWY_INLINE void NativeCompressStore(Vec256<uint8_t> v, Mask256<uint8_t> mask, |
| uint8_t* HWY_RESTRICT unaligned) { |
| _mm256_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw); |
| } |
| #if HWY_MAX_BYTES >= 64 |
| HWY_INLINE void NativeCompressStore(Vec512<uint8_t> v, Mask512<uint8_t> mask, |
| uint8_t* HWY_RESTRICT unaligned) { |
| _mm512_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw); |
| } |
| #endif |
| |
| template <size_t N> |
| HWY_INLINE void NativeCompressStore(Vec128<uint16_t, N> v, |
| Mask128<uint16_t, N> mask, |
| uint16_t* HWY_RESTRICT unaligned) { |
| _mm_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw); |
| } |
| HWY_INLINE void NativeCompressStore(Vec256<uint16_t> v, Mask256<uint16_t> mask, |
| uint16_t* HWY_RESTRICT unaligned) { |
| _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw); |
| } |
| #if HWY_MAX_BYTES >= 64 |
| HWY_INLINE void NativeCompressStore(Vec512<uint16_t> v, Mask512<uint16_t> mask, |
| uint16_t* HWY_RESTRICT unaligned) { |
| _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw); |
| } |
| #endif // HWY_MAX_BYTES >= 64 |
| |
| #endif // HWY_X86_SLOW_COMPRESS_STORE |
| |
| #endif // HWY_TARGET <= HWY_AVX3_DL |
| |
| template <size_t N> |
| HWY_INLINE Vec128<uint32_t, N> NativeCompress(Vec128<uint32_t, N> v, |
| Mask128<uint32_t, N> mask) { |
| return Vec128<uint32_t, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)}; |
| } |
| HWY_INLINE Vec256<uint32_t> NativeCompress(Vec256<uint32_t> v, |
| Mask256<uint32_t> mask) { |
| return Vec256<uint32_t>{_mm256_maskz_compress_epi32(mask.raw, v.raw)}; |
| } |
| |
| #if HWY_MAX_BYTES >= 64 |
| HWY_INLINE Vec512<uint32_t> NativeCompress(Vec512<uint32_t> v, |
| Mask512<uint32_t> mask) { |
| return Vec512<uint32_t>{_mm512_maskz_compress_epi32(mask.raw, v.raw)}; |
| } |
| #endif |
| // We use table-based compress for 64-bit lanes, see CompressIsPartition. |
| |
// When compress-store is slow, do not even define these, to prevent
// accidental usage.
| #if !HWY_X86_SLOW_COMPRESS_STORE |
| |
| template <size_t N> |
| HWY_INLINE void NativeCompressStore(Vec128<uint32_t, N> v, |
| Mask128<uint32_t, N> mask, |
| uint32_t* HWY_RESTRICT unaligned) { |
| _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); |
| } |
| HWY_INLINE void NativeCompressStore(Vec256<uint32_t> v, Mask256<uint32_t> mask, |
| uint32_t* HWY_RESTRICT unaligned) { |
| _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); |
| } |
| #if HWY_MAX_BYTES >= 64 |
| HWY_INLINE void NativeCompressStore(Vec512<uint32_t> v, Mask512<uint32_t> mask, |
| uint32_t* HWY_RESTRICT unaligned) { |
| _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); |
| } |
| #endif |
| |
| template <size_t N> |
| HWY_INLINE void NativeCompressStore(Vec128<uint64_t, N> v, |
| Mask128<uint64_t, N> mask, |
| uint64_t* HWY_RESTRICT unaligned) { |
| _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); |
| } |
| HWY_INLINE void NativeCompressStore(Vec256<uint64_t> v, Mask256<uint64_t> mask, |
| uint64_t* HWY_RESTRICT unaligned) { |
| _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); |
| } |
| #if HWY_MAX_BYTES >= 64 |
| HWY_INLINE void NativeCompressStore(Vec512<uint64_t> v, Mask512<uint64_t> mask, |
| uint64_t* HWY_RESTRICT unaligned) { |
| _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); |
| } |
| #endif |
| |
| template <size_t N> |
| HWY_INLINE void NativeCompressStore(Vec128<float, N> v, Mask128<float, N> mask, |
| float* HWY_RESTRICT unaligned) { |
| _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); |
| } |
| HWY_INLINE void NativeCompressStore(Vec256<float> v, Mask256<float> mask, |
| float* HWY_RESTRICT unaligned) { |
| _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); |
| } |
| #if HWY_MAX_BYTES >= 64 |
| HWY_INLINE void NativeCompressStore(Vec512<float> v, Mask512<float> mask, |
| float* HWY_RESTRICT unaligned) { |
| _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); |
| } |
| #endif |
| |
| template <size_t N> |
| HWY_INLINE void NativeCompressStore(Vec128<double, N> v, |
| Mask128<double, N> mask, |
| double* HWY_RESTRICT unaligned) { |
| _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); |
| } |
| HWY_INLINE void NativeCompressStore(Vec256<double> v, Mask256<double> mask, |
| double* HWY_RESTRICT unaligned) { |
| _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); |
| } |
| #if HWY_MAX_BYTES >= 64 |
| HWY_INLINE void NativeCompressStore(Vec512<double> v, Mask512<double> mask, |
| double* HWY_RESTRICT unaligned) { |
| _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); |
| } |
| #endif |
| |
| #endif // HWY_X86_SLOW_COMPRESS_STORE |
| |
// For vectors of at most HWY_MAX_BYTES / 4 lanes (e.g. u8x16 and <= u16x16
// with 512-bit vectors), we can avoid store+load for Compress because the
// lanes, promoted to u32, fit in a single vector (u32x16). The remaining
// EmuCompress overloads are implemented after the EmuCompressStore they build
// upon.
| template <class V, HWY_IF_U8(TFromV<V>), |
| HWY_IF_LANES_LE_D(DFromV<V>, HWY_MAX_BYTES / 4)> |
| static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD<DFromV<V>> mask) { |
| const DFromV<decltype(v)> d; |
| const Rebind<uint32_t, decltype(d)> d32; |
| const VFromD<decltype(d32)> v0 = PromoteTo(d32, v); |
| |
| using M32 = MFromD<decltype(d32)>; |
| const M32 m0 = PromoteMaskTo(d32, d, mask); |
| return TruncateTo(d, Compress(v0, m0)); |
| } |
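
// Illustrative walk-through of the emulation above, assuming 512-bit vectors
// and a full u8x16 input: PromoteTo widens the 16 bytes to a single u32x16
// vector, Compress packs the selected u32 lanes toward lane 0 via the native
// 32-bit compress, and TruncateTo narrows back to u8x16. For example, with
//   v    = {10, 11, 12, 13, ...}
//   mask = { 0,  1,  1,  0, ...}
// the first two result bytes are 11 and 12.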
| |
| template <class V, HWY_IF_U16(TFromV<V>), |
| HWY_IF_LANES_LE_D(DFromV<V>, HWY_MAX_BYTES / 4)> |
| static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD<DFromV<V>> mask) { |
| const DFromV<decltype(v)> d; |
| const Rebind<int32_t, decltype(d)> di32; |
| const RebindToUnsigned<decltype(di32)> du32; |
| |
| const MFromD<decltype(du32)> mask32 = PromoteMaskTo(du32, d, mask); |
| // DemoteTo is 2 ops, but likely lower latency than TruncateTo on SKX. |
| // Only i32 -> u16 is supported, whereas NativeCompress expects u32. |
| const VFromD<decltype(du32)> v32 = PromoteTo(du32, v); |
| return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32))); |
| } |
| |
| // See above - small-vector EmuCompressStore are implemented via EmuCompress. |
| template <class D, HWY_IF_UNSIGNED_D(D), |
| HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), |
| HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)> |
| static HWY_INLINE HWY_MAYBE_UNUSED void EmuCompressStore( |
| VFromD<D> v, MFromD<D> mask, D d, TFromD<D>* HWY_RESTRICT unaligned) { |
| StoreU(EmuCompress(v, mask), d, unaligned); |
| } |
| |
// Main emulation logic for wider vectors, starting with EmuCompressStore
// because it is most convenient to merge pieces using memory (concatenating
// vectors at byte offsets is difficult).
| template <class D, HWY_IF_UNSIGNED_D(D), |
| HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), |
| HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4)> |
| static HWY_INLINE HWY_MAYBE_UNUSED void EmuCompressStore( |
| VFromD<D> v, MFromD<D> mask, D d, TFromD<D>* HWY_RESTRICT unaligned) { |
| const Half<decltype(d)> dh; |
| |
| const MFromD<decltype(dh)> m0 = LowerHalfOfMask(dh, mask); |
| const MFromD<decltype(dh)> m1 = UpperHalfOfMask(dh, mask); |
| |
| const VFromD<decltype(dh)> v0 = LowerHalf(dh, v); |
| const VFromD<decltype(dh)> v1 = UpperHalf(dh, v); |
| |
| EmuCompressStore(v0, m0, dh, unaligned); |
| EmuCompressStore(v1, m1, dh, unaligned + CountTrue(dh, m0)); |
| } |
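
// Illustrative example of the recursion above, assuming 512-bit vectors and a
// u16x32 input: the lower and upper u16x16 halves are compressed
// independently. If the lower half has 5 active lanes, its output occupies
// unaligned[0, 5) and the upper half's output starts at unaligned + 5, so the
// two stores concatenate without gaps.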
| |
| // Finally, the remaining EmuCompress for wide vectors, using EmuCompressStore. |
| template <class V, HWY_IF_UNSIGNED_V(V), |
| HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)), |
| HWY_IF_LANES_GT_D(DFromV<V>, HWY_MAX_BYTES / 4)> |
| static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD<DFromV<V>> mask) { |
| using D = DFromV<decltype(v)>; |
| using T = TFromD<D>; |
| const D d; |
| |
| alignas(HWY_MAX_LANES_D(D) * sizeof(T)) T buf[2 * HWY_MAX_LANES_D(D)]; |
| EmuCompressStore(v, mask, d, buf); |
| return Load(d, buf); |
| } |
| |
| } // namespace detail |
| |
| template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> |
| HWY_API V Compress(V v, const M mask) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const auto mu = RebindMask(du, mask); |
| #if HWY_TARGET <= HWY_AVX3_DL // VBMI2 |
| return BitCast(d, detail::NativeCompress(BitCast(du, v), mu)); |
| #else |
| return BitCast(d, detail::EmuCompress(BitCast(du, v), mu)); |
| #endif |
| } |
| |
| template <class V, class M, HWY_IF_T_SIZE_V(V, 4)> |
| HWY_API V Compress(V v, const M mask) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const auto mu = RebindMask(du, mask); |
| return BitCast(d, detail::NativeCompress(BitCast(du, v), mu)); |
| } |
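
// Usage sketch (illustrative only): pack the lanes selected by a mask toward
// lane 0; the remaining result lanes are implementation-defined in general,
// so do not rely on them.
//   const ScalableTag<float> d;
//   const auto v = Iota(d, 0.0f);        // 0, 1, 2, ...
//   const auto m = Lt(v, Set(d, 2.0f));  // first two lanes true
//   const auto packed = Compress(v, m);  // 0, 1, ...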
| |
| // ------------------------------ CompressNot |
| |
| template <class V, class M, HWY_IF_NOT_T_SIZE_V(V, 8)> |
| HWY_API V CompressNot(V v, const M mask) { |
| return Compress(v, Not(mask)); |
| } |
| |
// uint64_t lanes. Only implemented for 256- and 512-bit vectors because this
// is a no-op for 128-bit vectors.
| template <class V, class M, HWY_IF_V_SIZE_GT_D(DFromV<V>, 16)> |
| HWY_API V CompressBlocksNot(V v, M mask) { |
| return CompressNot(v, mask); |
| } |
| |
| // ------------------------------ CompressBits |
| template <class V> |
| HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { |
| return Compress(v, LoadMaskBits(DFromV<V>(), bits)); |
| } |
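
// Usage sketch (illustrative only): `bits` holds one mask bit per lane,
// packed LSB-first into bytes, as produced by StoreMaskBits.
//   const ScalableTag<uint32_t> d;
//   const uint8_t bits[8] = {0x05};  // 0000'0101: lanes 0 and 2 selected
//   const auto packed = CompressBits(Iota(d, 0), bits);  // 0, 2, ...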
| |
| // ------------------------------ CompressStore |
| |
| // Generic for all vector lengths. |
| |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> |
| HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d, |
| TFromD<D>* HWY_RESTRICT unaligned) { |
| #if HWY_X86_SLOW_COMPRESS_STORE |
| StoreU(Compress(v, mask), d, unaligned); |
| #else |
| const RebindToUnsigned<decltype(d)> du; |
| const auto mu = RebindMask(du, mask); |
| auto pu = reinterpret_cast<TFromD<decltype(du)> * HWY_RESTRICT>(unaligned); |
| |
| #if HWY_TARGET <= HWY_AVX3_DL // VBMI2 |
| detail::NativeCompressStore(BitCast(du, v), mu, pu); |
| #else |
| detail::EmuCompressStore(BitCast(du, v), mu, du, pu); |
| #endif |
| #endif // HWY_X86_SLOW_COMPRESS_STORE |
| const size_t count = CountTrue(d, mask); |
| detail::MaybeUnpoison(unaligned, count); |
| return count; |
| } |
| |
| template <class D, HWY_IF_NOT_FLOAT_D(D), |
| HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> |
| HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d, |
| TFromD<D>* HWY_RESTRICT unaligned) { |
| #if HWY_X86_SLOW_COMPRESS_STORE |
| StoreU(Compress(v, mask), d, unaligned); |
| #else |
| const RebindToUnsigned<decltype(d)> du; |
| const auto mu = RebindMask(du, mask); |
| using TU = TFromD<decltype(du)>; |
| TU* HWY_RESTRICT pu = reinterpret_cast<TU*>(unaligned); |
| detail::NativeCompressStore(BitCast(du, v), mu, pu); |
| #endif // HWY_X86_SLOW_COMPRESS_STORE |
| const size_t count = CountTrue(d, mask); |
| detail::MaybeUnpoison(unaligned, count); |
| return count; |
| } |
| |
// Additional float/double overloads that avoid casting to unsigned integers
// (the cast might otherwise incur a bypass delay).
| template <class D, HWY_IF_FLOAT3264_D(D)> |
| HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d, |
| TFromD<D>* HWY_RESTRICT unaligned) { |
| #if HWY_X86_SLOW_COMPRESS_STORE |
| StoreU(Compress(v, mask), d, unaligned); |
| #else |
| (void)d; |
| detail::NativeCompressStore(v, mask, unaligned); |
| #endif // HWY_X86_SLOW_COMPRESS_STORE |
| const size_t count = PopCount(uint64_t{mask.raw}); |
| detail::MaybeUnpoison(unaligned, count); |
| return count; |
| } |
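
// Usage sketch for CompressStore (illustrative only): stores the selected
// lanes starting at `unaligned` and returns their count. Elements beyond that
// count, up to Lanes(d), may also be overwritten - unlike
// CompressBlendedStore below.
//   const ScalableTag<int32_t> d;
//   int32_t out[16];  // >= Lanes(d) for the targets covered here
//   const auto v = Iota(d, 0);
//   const size_t n = CompressStore(v, Lt(v, Set(d, 3)), d, out);  // n == 3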
| |
| // ------------------------------ CompressBlendedStore |
| template <class D> |
| HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d, |
| TFromD<D>* HWY_RESTRICT unaligned) { |
  // Native CompressStore already does the blending at no extra cost (latency
  // 11, reciprocal throughput 2 - same as compress plus store).
| |
| HWY_IF_CONSTEXPR(HWY_MAX_LANES_D(D) < (16 / sizeof(TFromD<D>))) { |
| m = And(m, FirstN(d, HWY_MAX_LANES_D(D))); |
| } |
| |
| HWY_IF_CONSTEXPR(!HWY_X86_SLOW_COMPRESS_STORE && |
| (HWY_TARGET <= HWY_AVX3_DL || sizeof(TFromD<D>) > 2)) { |
| return CompressStore(v, m, d, unaligned); |
| } |
| else { |
| const size_t count = CountTrue(d, m); |
| StoreN(Compress(v, m), d, unaligned, count); |
| detail::MaybeUnpoison(unaligned, count); |
| return count; |
| } |
| } |
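
// Usage sketch (illustrative only; `in`, `out` and `count` are hypothetical,
// with count assumed to be a multiple of Lanes(d)): append only the negative
// values to `out` without touching elements past the number written.
//   const ScalableTag<int32_t> d;
//   size_t written = 0;
//   for (size_t i = 0; i < count; i += Lanes(d)) {
//     const auto v = LoadU(d, in + i);
//     written += CompressBlendedStore(v, Lt(v, Zero(d)), d, out + written);
//   }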
| |
| // ------------------------------ CompressBitsStore |
| // Generic for all vector lengths. |
| template <class D> |
| HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits, |
| D d, TFromD<D>* HWY_RESTRICT unaligned) { |
| return CompressStore(v, LoadMaskBits(d, bits), d, unaligned); |
| } |
| |
| #pragma pop_macro("HWY_X86_SLOW_COMPRESS_STORE") |
| |
| // NOLINTNEXTLINE(google-readability-namespace-comments) |
| } // namespace HWY_NAMESPACE |
| } // namespace hwy |
| HWY_AFTER_NAMESPACE(); |
| |
| // Note that the GCC warnings are not suppressed if we only wrap the *intrin.h - |
| // the warning seems to be issued at the call site of intrinsics, i.e. our code. |
| HWY_DIAGNOSTICS(pop) |