// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// External include guard in highway.h - see comment there.

#if HWY_TARGET == HWY_AVX10_2
// For AVX10 targets that only support 256-bit or smaller vectors. Already
// includes base.h and shared-inl.h.
#include "third_party/highway/hwy/ops/x86_256-inl.h"
#else
// For AVX3/AVX10 targets that support 512-bit vectors. Already includes base.h
// and shared-inl.h.
#include "third_party/highway/hwy/ops/x86_512-inl.h"
#endif

// AVX3/AVX10 ops that depend on ops defined in x86_512-inl.h when
// HWY_MAX_BYTES >= 64 are defined below.

// Avoid uninitialized warnings in GCC's avx512fintrin.h - see
// https://github.com/google/highway/issues/710
HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494,
                    ignored "-Wmaybe-uninitialized")
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

#if HWY_TARGET <= HWY_AVX3_DL

// ------------------------------ ShiftLeft

// Generic for all vector lengths. Must be defined after all GaloisAffine.
template <int kBits, class V, HWY_IF_T_SIZE_V(V, 1)>
HWY_API V ShiftLeft(const V v) {
  const Repartition<uint64_t, DFromV<V>> du64;
  if (kBits == 0) return v;
  if (kBits == 1) return v + v;
  constexpr uint64_t kMatrix = (0x0102040810204080ULL >> kBits) &
                               (0x0101010101010101ULL * (0xFF >> kBits));
  return detail::GaloisAffine(v, Set(du64, kMatrix));
}
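
// Worked example (illustrative): for kBits = 2, the expression above yields
//   kMatrix = (0x0102040810204080ULL >> 2) & (0x0101010101010101ULL * 0x3F)
//           = 0x0040810204081020 & 0x3F3F3F3F3F3F3F3F = 0x0000010204081020,
// the 8x8 bit matrix that GF2P8AFFINEQB applies per byte to compute
// (b << 2) & 0xFF, i.e. a left shift by two within each lane.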

// ------------------------------ ShiftRight

// Generic for all vector lengths. Must be defined after all GaloisAffine.
template <int kBits, class V, HWY_IF_U8_D(DFromV<V>)>
HWY_API V ShiftRight(const V v) {
  const Repartition<uint64_t, DFromV<V>> du64;
  if (kBits == 0) return v;
  constexpr uint64_t kMatrix =
      (0x0102040810204080ULL << kBits) &
      (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
  return detail::GaloisAffine(v, Set(du64, kMatrix));
}

// Generic for all vector lengths. Must be defined after all GaloisAffine.
template <int kBits, class V, HWY_IF_I8_D(DFromV<V>)>
HWY_API V ShiftRight(const V v) {
  const Repartition<uint64_t, DFromV<V>> du64;
  if (kBits == 0) return v;
  constexpr uint64_t kShift =
      (0x0102040810204080ULL << kBits) &
      (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
  constexpr uint64_t kSign =
      kBits == 0 ? 0 : (0x8080808080808080ULL >> (64 - (8 * kBits)));
  return detail::GaloisAffine(v, Set(du64, kShift | kSign));
}
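
// Worked example (illustrative): for kBits = 2,
//   kSign = 0x8080808080808080ULL >> 48 = 0x0000000000008080,
// which ORs in two matrix rows that replicate each byte's sign bit into the
// top two bits of the result, turning the logical shift into an arithmetic
// (sign-extending) one.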

// ------------------------------ RotateRight

// U8 RotateRight is generic for all vector lengths on AVX3_DL
template <int kBits, class V, HWY_IF_U8(TFromV<V>)>
HWY_API V RotateRight(V v) {
  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");

  const Repartition<uint64_t, DFromV<V>> du64;
  if (kBits == 0) return v;

  constexpr uint64_t kShrMatrix =
      (0x0102040810204080ULL << kBits) &
      (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
  constexpr int kShlBits = (-kBits) & 7;
  constexpr uint64_t kShlMatrix = (0x0102040810204080ULL >> kShlBits) &
                                  (0x0101010101010101ULL * (0xFF >> kShlBits));
  constexpr uint64_t kMatrix = kShrMatrix | kShlMatrix;

  return detail::GaloisAffine(v, Set(du64, kMatrix));
}
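
// In effect this fuses Or(ShiftRight<kBits>(v), ShiftLeft<8 - kBits>(v)) into
// a single GF2P8AFFINEQB by OR-ing the two shift matrices; e.g.
// RotateRight<3>(v) == Or(ShiftRight<3>(v), ShiftLeft<5>(v)).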

#endif  // HWY_TARGET <= HWY_AVX3_DL

// ------------------------------ Compress

#pragma push_macro("HWY_X86_SLOW_COMPRESS_STORE")

#ifndef HWY_X86_SLOW_COMPRESS_STORE  // allow override
// Slow on Zen4 and SPR, faster if we emulate via Compress().
#if HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR
#define HWY_X86_SLOW_COMPRESS_STORE 1
#else
#define HWY_X86_SLOW_COMPRESS_STORE 0
#endif
#endif  // HWY_X86_SLOW_COMPRESS_STORE
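
// The #ifndef above allows overriding this heuristic at compile time, e.g.
// building with -DHWY_X86_SLOW_COMPRESS_STORE=0 to force the native
// compress-store instructions even on Zen4/SPR.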

// Always implement 8-bit here even if we lack VBMI2 because we can do better
// than generic_ops (8 at a time) via the native 32-bit compress (16 at a time).
#ifdef HWY_NATIVE_COMPRESS8
#undef HWY_NATIVE_COMPRESS8
#else
#define HWY_NATIVE_COMPRESS8
#endif

namespace detail {

#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
template <size_t N>
HWY_INLINE Vec128<uint8_t, N> NativeCompress(const Vec128<uint8_t, N> v,
                                             const Mask128<uint8_t, N> mask) {
  return Vec128<uint8_t, N>{_mm_maskz_compress_epi8(mask.raw, v.raw)};
}
HWY_INLINE Vec256<uint8_t> NativeCompress(const Vec256<uint8_t> v,
                                          const Mask256<uint8_t> mask) {
  return Vec256<uint8_t>{_mm256_maskz_compress_epi8(mask.raw, v.raw)};
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE Vec512<uint8_t> NativeCompress(const Vec512<uint8_t> v,
                                          const Mask512<uint8_t> mask) {
  return Vec512<uint8_t>{_mm512_maskz_compress_epi8(mask.raw, v.raw)};
}
#endif

template <size_t N>
HWY_INLINE Vec128<uint16_t, N> NativeCompress(const Vec128<uint16_t, N> v,
                                              const Mask128<uint16_t, N> mask) {
  return Vec128<uint16_t, N>{_mm_maskz_compress_epi16(mask.raw, v.raw)};
}
HWY_INLINE Vec256<uint16_t> NativeCompress(const Vec256<uint16_t> v,
                                           const Mask256<uint16_t> mask) {
  return Vec256<uint16_t>{_mm256_maskz_compress_epi16(mask.raw, v.raw)};
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE Vec512<uint16_t> NativeCompress(const Vec512<uint16_t> v,
                                           const Mask512<uint16_t> mask) {
  return Vec512<uint16_t>{_mm512_maskz_compress_epi16(mask.raw, v.raw)};
}
#endif

// When compress-store is slow, do not even define these, to prevent
// accidental usage.
#if !HWY_X86_SLOW_COMPRESS_STORE

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<uint8_t, N> v,
                                    Mask128<uint8_t, N> mask,
                                    uint8_t* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<uint8_t> v, Mask256<uint8_t> mask,
                                    uint8_t* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<uint8_t> v, Mask512<uint8_t> mask,
                                    uint8_t* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
}
#endif

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<uint16_t, N> v,
                                    Mask128<uint16_t, N> mask,
                                    uint16_t* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<uint16_t> v, Mask256<uint16_t> mask,
                                    uint16_t* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<uint16_t> v, Mask512<uint16_t> mask,
                                    uint16_t* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
}
#endif  // HWY_MAX_BYTES >= 64

#endif  // HWY_X86_SLOW_COMPRESS_STORE

#endif  // HWY_TARGET <= HWY_AVX3_DL

template <size_t N>
HWY_INLINE Vec128<uint32_t, N> NativeCompress(Vec128<uint32_t, N> v,
                                              Mask128<uint32_t, N> mask) {
  return Vec128<uint32_t, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
}
HWY_INLINE Vec256<uint32_t> NativeCompress(Vec256<uint32_t> v,
                                           Mask256<uint32_t> mask) {
  return Vec256<uint32_t>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
}

#if HWY_MAX_BYTES >= 64
HWY_INLINE Vec512<uint32_t> NativeCompress(Vec512<uint32_t> v,
                                           Mask512<uint32_t> mask) {
  return Vec512<uint32_t>{_mm512_maskz_compress_epi32(mask.raw, v.raw)};
}
#endif
// We use table-based compress for 64-bit lanes, see CompressIsPartition.

// When compress-store is slow, do not even define these, to prevent
// accidental usage.
#if !HWY_X86_SLOW_COMPRESS_STORE

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<uint32_t, N> v,
                                    Mask128<uint32_t, N> mask,
                                    uint32_t* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<uint32_t> v, Mask256<uint32_t> mask,
                                    uint32_t* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<uint32_t> v, Mask512<uint32_t> mask,
                                    uint32_t* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
}
#endif

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<uint64_t, N> v,
                                    Mask128<uint64_t, N> mask,
                                    uint64_t* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<uint64_t> v, Mask256<uint64_t> mask,
                                    uint64_t* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<uint64_t> v, Mask512<uint64_t> mask,
                                    uint64_t* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
}
#endif

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<float, N> v, Mask128<float, N> mask,
                                    float* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<float> v, Mask256<float> mask,
                                    float* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<float> v, Mask512<float> mask,
                                    float* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
}
#endif

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<double, N> v,
                                    Mask128<double, N> mask,
                                    double* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<double> v, Mask256<double> mask,
                                    double* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<double> v, Mask512<double> mask,
                                    double* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
}
#endif

#endif  // HWY_X86_SLOW_COMPRESS_STORE

// For u8x16 and <= u16x16 we can avoid store+load for Compress because there
// is only a single compressed vector (u32x16). Other EmuCompress are
// implemented after the EmuCompressStore they build upon.
template <class V, HWY_IF_U8(TFromV<V>),
          HWY_IF_LANES_LE_D(DFromV<V>, HWY_MAX_BYTES / 4)>
static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD<DFromV<V>> mask) {
  const DFromV<decltype(v)> d;
  const Rebind<uint32_t, decltype(d)> d32;
  const VFromD<decltype(d32)> v0 = PromoteTo(d32, v);

  using M32 = MFromD<decltype(d32)>;
  const M32 m0 = PromoteMaskTo(d32, d, mask);
  return TruncateTo(d, Compress(v0, m0));
}
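
// Illustrative data flow (assuming 512-bit vectors are available): a u8x16
// input is widened to u32x16, compressed with the native 32-bit compress, and
// truncated back to u8x16, avoiding a store+load round trip.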

template <class V, HWY_IF_U16(TFromV<V>),
          HWY_IF_LANES_LE_D(DFromV<V>, HWY_MAX_BYTES / 4)>
static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD<DFromV<V>> mask) {
  const DFromV<decltype(v)> d;
  const Rebind<int32_t, decltype(d)> di32;
  const RebindToUnsigned<decltype(di32)> du32;

  const MFromD<decltype(du32)> mask32 = PromoteMaskTo(du32, d, mask);
  // DemoteTo is 2 ops, but likely lower latency than TruncateTo on SKX.
  // Only i32 -> u16 is supported, whereas NativeCompress expects u32.
  const VFromD<decltype(du32)> v32 = PromoteTo(du32, v);
  return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32)));
}

// See above - small-vector EmuCompressStore are implemented via EmuCompress.
template <class D, HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
static HWY_INLINE HWY_MAYBE_UNUSED void EmuCompressStore(
    VFromD<D> v, MFromD<D> mask, D d, TFromD<D>* HWY_RESTRICT unaligned) {
  StoreU(EmuCompress(v, mask), d, unaligned);
}

// Main emulation logic for wider vectors, starting with EmuCompressStore
// because it is most convenient to merge pieces using memory (concatenating
// vectors at byte offsets is difficult).
template <class D, HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4)>
static HWY_INLINE HWY_MAYBE_UNUSED void EmuCompressStore(
    VFromD<D> v, MFromD<D> mask, D d, TFromD<D>* HWY_RESTRICT unaligned) {
  const Half<decltype(d)> dh;

  const MFromD<decltype(dh)> m0 = LowerHalfOfMask(dh, mask);
  const MFromD<decltype(dh)> m1 = UpperHalfOfMask(dh, mask);

  const VFromD<decltype(dh)> v0 = LowerHalf(dh, v);
  const VFromD<decltype(dh)> v1 = UpperHalf(dh, v);

  EmuCompressStore(v0, m0, dh, unaligned);
  EmuCompressStore(v1, m1, dh, unaligned + CountTrue(dh, m0));
}
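
// For example (illustrative): a u16x32 vector is split into two u16x16
// halves; the lower half is compressed and stored first, then the upper half
// is stored starting at the lower half's CountTrue offset, so the two
// compressed runs end up contiguous in memory.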

// Finally, the remaining EmuCompress for wide vectors, using EmuCompressStore.
template <class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
          HWY_IF_LANES_GT_D(DFromV<V>, HWY_MAX_BYTES / 4)>
static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD<DFromV<V>> mask) {
  using D = DFromV<decltype(v)>;
  using T = TFromD<D>;
  const D d;

  alignas(HWY_MAX_LANES_D(D) * sizeof(T)) T buf[2 * HWY_MAX_LANES_D(D)];
  EmuCompressStore(v, mask, d, buf);
  return Load(d, buf);
}

}  // namespace detail

template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_API V Compress(V v, const M mask) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto mu = RebindMask(du, mask);
#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
  return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
#else
  return BitCast(d, detail::EmuCompress(BitCast(du, v), mu));
#endif
}

template <class V, class M, HWY_IF_T_SIZE_V(V, 4)>
HWY_API V Compress(V v, const M mask) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto mu = RebindMask(du, mask);
  return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
}
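
// Usage sketch (illustrative; `d` is the vector descriptor, `v` a vector of
// `d`):
//   const auto m = Lt(v, Set(d, static_cast<TFromD<decltype(d)>>(10)));
//   const auto packed = Compress(v, m);  // selected lanes move to the front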

// ------------------------------ CompressNot

template <class V, class M, HWY_IF_NOT_T_SIZE_V(V, 8)>
HWY_API V CompressNot(V v, const M mask) {
  return Compress(v, Not(mask));
}

// uint64_t lanes. Only implemented for 256- and 512-bit vectors; a 128-bit
// vector holds a single block, so this would be a no-op there.
template <class V, class M, HWY_IF_V_SIZE_GT_D(DFromV<V>, 16)>
HWY_API V CompressBlocksNot(V v, M mask) {
  return CompressNot(v, mask);
}

// ------------------------------ CompressBits
template <class V>
HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
  return Compress(v, LoadMaskBits(DFromV<V>(), bits));
}

// ------------------------------ CompressStore

// Generic for all vector lengths.

template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
#if HWY_X86_SLOW_COMPRESS_STORE
  StoreU(Compress(v, mask), d, unaligned);
#else
  const RebindToUnsigned<decltype(d)> du;
  const auto mu = RebindMask(du, mask);
  auto pu = reinterpret_cast<TFromD<decltype(du)> * HWY_RESTRICT>(unaligned);

#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
  detail::NativeCompressStore(BitCast(du, v), mu, pu);
#else
  detail::EmuCompressStore(BitCast(du, v), mu, du, pu);
#endif
#endif  // HWY_X86_SLOW_COMPRESS_STORE
  const size_t count = CountTrue(d, mask);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}

template <class D, HWY_IF_NOT_FLOAT_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
#if HWY_X86_SLOW_COMPRESS_STORE
  StoreU(Compress(v, mask), d, unaligned);
#else
  const RebindToUnsigned<decltype(d)> du;
  const auto mu = RebindMask(du, mask);
  using TU = TFromD<decltype(du)>;
  TU* HWY_RESTRICT pu = reinterpret_cast<TU*>(unaligned);
  detail::NativeCompressStore(BitCast(du, v), mu, pu);
#endif  // HWY_X86_SLOW_COMPRESS_STORE
  const size_t count = CountTrue(d, mask);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}

// Additional overloads for float/double avoid casting to uint32_t/uint64_t,
// which might incur a bypass delay.
template <class D, HWY_IF_FLOAT3264_D(D)>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
#if HWY_X86_SLOW_COMPRESS_STORE
  StoreU(Compress(v, mask), d, unaligned);
#else
  (void)d;
  detail::NativeCompressStore(v, mask, unaligned);
#endif  // HWY_X86_SLOW_COMPRESS_STORE
  const size_t count = PopCount(uint64_t{mask.raw});
  detail::MaybeUnpoison(unaligned, count);
  return count;
}

// ------------------------------ CompressBlendedStore
template <class D>
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  // Native CompressStore already does the blending at no extra cost (latency
  // 11, rthroughput 2 - same as compress plus store).

  HWY_IF_CONSTEXPR(HWY_MAX_LANES_D(D) < (16 / sizeof(TFromD<D>))) {
    m = And(m, FirstN(d, HWY_MAX_LANES_D(D)));
  }

  HWY_IF_CONSTEXPR(!HWY_X86_SLOW_COMPRESS_STORE &&
                   (HWY_TARGET <= HWY_AVX3_DL || sizeof(TFromD<D>) > 2)) {
    return CompressStore(v, m, d, unaligned);
  }
  else {
    const size_t count = CountTrue(d, m);
    StoreN(Compress(v, m), d, unaligned, count);
    detail::MaybeUnpoison(unaligned, count);
    return count;
  }
}
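
// Usage sketch (illustrative): append only the selected lanes to an output
// stream, relying on the blended store not writing past `count` lanes:
//   out += CompressBlendedStore(v, m, d, out);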

// ------------------------------ CompressBitsStore
// Generic for all vector lengths.
template <class D>
HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
}

#pragma pop_macro("HWY_X86_SLOW_COMPRESS_STORE")

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h
// - the warning seems to be issued at the call site of intrinsics, i.e. our
// code.
HWY_DIAGNOSTICS(pop)