| // Copyright 2019 Google LLC |
| // SPDX-License-Identifier: Apache-2.0 |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // 128-bit WASM vectors and operations. |
| // External include guard in highway.h - see comment there. |
| |
| #include <wasm_simd128.h> |
| |
| #include "third_party/highway/hwy/base.h" |
| #include "third_party/highway/hwy/ops/shared-inl.h" |
| |
| #ifdef HWY_WASM_OLD_NAMES |
| #define wasm_i8x16_shuffle wasm_v8x16_shuffle |
| #define wasm_i16x8_shuffle wasm_v16x8_shuffle |
| #define wasm_i32x4_shuffle wasm_v32x4_shuffle |
| #define wasm_i64x2_shuffle wasm_v64x2_shuffle |
| #define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16 |
| #define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8 |
| #define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8 |
| #define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16 |
| #define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8 |
| #define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8 |
| #define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4 |
#define wasm_i64x2_trunc_sat_f64x2 wasm_i64x2_trunc_saturate_f64x2
| #define wasm_u8x16_add_sat wasm_u8x16_add_saturate |
| #define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate |
| #define wasm_u16x8_add_sat wasm_u16x8_add_saturate |
| #define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate |
| #define wasm_i8x16_add_sat wasm_i8x16_add_saturate |
| #define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate |
| #define wasm_i16x8_add_sat wasm_i16x8_add_saturate |
| #define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate |
| #endif |
| |
| HWY_BEFORE_NAMESPACE(); |
| namespace hwy { |
| namespace HWY_NAMESPACE { |
| |
| #if HWY_TARGET == HWY_WASM_EMU256 |
| template <typename T> |
| using Full256 = Simd<T, 32 / sizeof(T), 0>; |
| #endif |
| |
| namespace detail { |
| |
| template <typename T> |
| struct Raw128 { |
| using type = __v128_u; |
| }; |
| template <> |
| struct Raw128<float> { |
| using type = __f32x4; |
| }; |
| template <> |
| struct Raw128<double> { |
| using type = __f64x2; |
| }; |
| |
| } // namespace detail |
| |
| template <typename T, size_t N = 16 / sizeof(T)> |
| class Vec128 { |
| using Raw = typename detail::Raw128<T>::type; |
| |
| public: |
| using PrivateT = T; // only for DFromV |
| static constexpr size_t kPrivateN = N; // only for DFromV |
| |
| // Compound assignment. Only usable if there is a corresponding non-member |
| // binary operator overload. For example, only f32 and f64 support division. |
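  //
  // For example (illustrative; `df` denotes a float tag such as
  // Full128<float>):
  //   auto v = Set(df, 1.0f);
  //   v += Set(df, 2.0f);  // every lane is now 3.0f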
| HWY_INLINE Vec128& operator*=(const Vec128 other) { |
| return *this = (*this * other); |
| } |
| HWY_INLINE Vec128& operator/=(const Vec128 other) { |
| return *this = (*this / other); |
| } |
| HWY_INLINE Vec128& operator+=(const Vec128 other) { |
| return *this = (*this + other); |
| } |
| HWY_INLINE Vec128& operator-=(const Vec128 other) { |
| return *this = (*this - other); |
| } |
| HWY_INLINE Vec128& operator%=(const Vec128 other) { |
| return *this = (*this % other); |
| } |
| HWY_INLINE Vec128& operator&=(const Vec128 other) { |
| return *this = (*this & other); |
| } |
| HWY_INLINE Vec128& operator|=(const Vec128 other) { |
| return *this = (*this | other); |
| } |
| HWY_INLINE Vec128& operator^=(const Vec128 other) { |
| return *this = (*this ^ other); |
| } |
| |
| Raw raw; |
| }; |
| |
| template <typename T> |
| using Vec64 = Vec128<T, 8 / sizeof(T)>; |
| |
| template <typename T> |
| using Vec32 = Vec128<T, 4 / sizeof(T)>; |
| |
| template <typename T> |
| using Vec16 = Vec128<T, 2 / sizeof(T)>; |
| |
| // FF..FF or 0. |
| template <typename T, size_t N = 16 / sizeof(T)> |
| struct Mask128 { |
| using PrivateT = T; // only for DFromM |
| static constexpr size_t kPrivateN = N; // only for DFromM |
| |
| typename detail::Raw128<T>::type raw; |
| }; |
| |
| template <class V> |
| using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>; |
| |
| template <class M> |
| using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>; |
| |
| template <class V> |
| using TFromV = typename V::PrivateT; |
| |
| // ------------------------------ Zero |
| |
| // Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero. |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)> |
| HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { |
| return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{wasm_i32x4_splat(0)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { |
| return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{wasm_f32x4_splat(0.0f)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { |
| return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{wasm_f64x2_splat(0.0)}; |
| } |
| |
| template <class D> |
| using VFromD = decltype(Zero(D())); |
| |
| // ------------------------------ BitCast |
| |
| namespace detail { |
| |
| HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; } |
| HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) { |
| return static_cast<__v128_u>(v); |
| } |
| HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) { |
| return static_cast<__v128_u>(v); |
| } |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) { |
| return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)}; |
| } |
| |
| // Cannot rely on function overloading because return types differ. |
| template <typename T> |
| struct BitCastFromInteger128 { |
| HWY_INLINE __v128_u operator()(__v128_u v) { return v; } |
| }; |
| template <> |
| struct BitCastFromInteger128<float> { |
| HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); } |
| }; |
| template <> |
| struct BitCastFromInteger128<double> { |
| HWY_INLINE __f64x2 operator()(__v128_u v) { return static_cast<__f64x2>(v); } |
| }; |
| |
| template <class D> |
| HWY_INLINE VFromD<D> BitCastFromByte(D d, Vec128<uint8_t, d.MaxBytes()> v) { |
| return VFromD<D>{BitCastFromInteger128<TFromD<D>>()(v.raw)}; |
| } |
| |
| } // namespace detail |
| |
| template <class D, typename FromT> |
| HWY_API VFromD<D> BitCast(D d, |
| Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) { |
| return detail::BitCastFromByte(d, detail::BitCastToByte(v)); |
| } |
| |
| // ------------------------------ ResizeBitCast |
| |
| template <class D, typename FromV, HWY_IF_V_SIZE_LE_V(FromV, 16), |
| HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { |
| const Repartition<uint8_t, decltype(d)> du8_to; |
| return BitCast(d, VFromD<decltype(du8_to)>{detail::BitCastToInteger(v.raw)}); |
| } |
| |
| // ------------------------------ Set |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { |
| return VFromD<D>{wasm_i8x16_splat(static_cast<int8_t>(t))}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)> |
| HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { |
| return VFromD<D>{wasm_i16x8_splat(static_cast<int16_t>(t))}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> |
| HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { |
| return VFromD<D>{wasm_i32x4_splat(static_cast<int32_t>(t))}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> |
| HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { |
| return VFromD<D>{wasm_i64x2_splat(static_cast<int64_t>(t))}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_SPECIAL_FLOAT_D(D)> |
| HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { |
| return VFromD<D>{wasm_i16x8_splat(BitCastScalar<int16_t>(t))}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { |
| return VFromD<D>{wasm_f32x4_splat(t)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { |
| return VFromD<D>{wasm_f64x2_splat(t)}; |
| } |
| |
| HWY_DIAGNOSTICS(push) |
| HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") |
| |
| // For all vector sizes. |
| template <class D> |
| HWY_API VFromD<D> Undefined(D d) { |
| return Zero(d); |
| } |
| |
| HWY_DIAGNOSTICS(pop) |
| |
| // For all vector sizes. |
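// For example, Iota(d, 3) yields lanes {3, 4, 5, ...}, wrapping around for
// narrow lane types.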
| template <class D, typename T = TFromD<D>, typename T2> |
| HWY_API VFromD<D> Iota(D d, const T2 first) { |
| HWY_ALIGN T lanes[MaxLanes(d)]; |
| for (size_t i = 0; i < MaxLanes(d); ++i) { |
| lanes[i] = AddWithWraparound(static_cast<T>(first), i); |
| } |
| return Load(d, lanes); |
| } |
| |
| // ------------------------------ Dup128VecFromValues |
| template <class D, HWY_IF_I8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, |
| TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, |
| TFromD<D> t5, TFromD<D> t6, TFromD<D> t7, |
| TFromD<D> t8, TFromD<D> t9, TFromD<D> t10, |
| TFromD<D> t11, TFromD<D> t12, |
| TFromD<D> t13, TFromD<D> t14, |
| TFromD<D> t15) { |
| return VFromD<D>{wasm_i8x16_make(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, |
| t11, t12, t13, t14, t15)}; |
| } |
| |
| template <class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, |
| TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, |
| TFromD<D> t5, TFromD<D> t6, TFromD<D> t7, |
| TFromD<D> t8, TFromD<D> t9, TFromD<D> t10, |
| TFromD<D> t11, TFromD<D> t12, |
| TFromD<D> t13, TFromD<D> t14, |
| TFromD<D> t15) { |
| return VFromD<D>{wasm_u8x16_make(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, |
| t11, t12, t13, t14, t15)}; |
| } |
| |
| template <class D, HWY_IF_I16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, |
| TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, |
| TFromD<D> t5, TFromD<D> t6, |
| TFromD<D> t7) { |
| return VFromD<D>{wasm_i16x8_make(t0, t1, t2, t3, t4, t5, t6, t7)}; |
| } |
| |
| template <class D, HWY_IF_U16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, |
| TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, |
| TFromD<D> t5, TFromD<D> t6, |
| TFromD<D> t7) { |
| return VFromD<D>{wasm_u16x8_make(t0, t1, t2, t3, t4, t5, t6, t7)}; |
| } |
| |
| template <class D, HWY_IF_SPECIAL_FLOAT_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, |
| TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, |
| TFromD<D> t5, TFromD<D> t6, |
| TFromD<D> t7) { |
| const RebindToSigned<decltype(d)> di; |
| return BitCast(d, |
| Dup128VecFromValues( |
| di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1), |
| BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3), |
| BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5), |
| BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7))); |
| } |
| |
| template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, |
| TFromD<D> t2, TFromD<D> t3) { |
| return VFromD<D>{wasm_i32x4_make(t0, t1, t2, t3)}; |
| } |
| |
| template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, |
| TFromD<D> t2, TFromD<D> t3) { |
| return VFromD<D>{wasm_u32x4_make(t0, t1, t2, t3)}; |
| } |
| |
| template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, |
| TFromD<D> t2, TFromD<D> t3) { |
| return VFromD<D>{wasm_f32x4_make(t0, t1, t2, t3)}; |
| } |
| |
| template <class D, HWY_IF_I64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) { |
| return VFromD<D>{wasm_i64x2_make(t0, t1)}; |
| } |
| |
| template <class D, HWY_IF_U64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) { |
| return VFromD<D>{wasm_u64x2_make(t0, t1)}; |
| } |
| |
| template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) { |
| return VFromD<D>{wasm_f64x2_make(t0, t1)}; |
| } |
| |
| // ================================================== ARITHMETIC |
| |
| // ------------------------------ Addition |
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Vec128<uint8_t, N>{wasm_i8x16_add(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{wasm_i16x8_add(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| return Vec128<uint32_t, N>{wasm_i32x4_add(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a, |
| const Vec128<uint64_t, N> b) { |
| return Vec128<uint64_t, N>{wasm_i64x2_add(a.raw, b.raw)}; |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a, |
| const Vec128<int8_t, N> b) { |
| return Vec128<int8_t, N>{wasm_i8x16_add(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{wasm_i16x8_add(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| return Vec128<int32_t, N>{wasm_i32x4_add(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a, |
| const Vec128<int64_t, N> b) { |
| return Vec128<int64_t, N>{wasm_i64x2_add(a.raw, b.raw)}; |
| } |
| |
| // Float |
| template <size_t N> |
| HWY_API Vec128<float, N> operator+(const Vec128<float, N> a, |
| const Vec128<float, N> b) { |
| return Vec128<float, N>{wasm_f32x4_add(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> operator+(const Vec128<double, N> a, |
| const Vec128<double, N> b) { |
| return Vec128<double, N>{wasm_f64x2_add(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ Subtraction |
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Vec128<uint8_t, N>{wasm_i8x16_sub(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a, |
| Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{wasm_i16x8_sub(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| return Vec128<uint32_t, N>{wasm_i32x4_sub(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a, |
| const Vec128<uint64_t, N> b) { |
| return Vec128<uint64_t, N>{wasm_i64x2_sub(a.raw, b.raw)}; |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a, |
| const Vec128<int8_t, N> b) { |
| return Vec128<int8_t, N>{wasm_i8x16_sub(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{wasm_i16x8_sub(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| return Vec128<int32_t, N>{wasm_i32x4_sub(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a, |
| const Vec128<int64_t, N> b) { |
| return Vec128<int64_t, N>{wasm_i64x2_sub(a.raw, b.raw)}; |
| } |
| |
| // Float |
| template <size_t N> |
| HWY_API Vec128<float, N> operator-(const Vec128<float, N> a, |
| const Vec128<float, N> b) { |
| return Vec128<float, N>{wasm_f32x4_sub(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> operator-(const Vec128<double, N> a, |
| const Vec128<double, N> b) { |
| return Vec128<double, N>{wasm_f64x2_sub(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ SaturatedAdd |
| |
| // Returns a + b clamped to the destination range. |
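// For example, with uint8_t lanes, SaturatedAdd(200, 100) yields 255 rather
// than wrapping to 44.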
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)}; |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a, |
| const Vec128<int8_t, N> b) { |
| return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ SaturatedSub |
| |
| // Returns a - b clamped to the destination range. |
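// For example, with uint8_t lanes, SaturatedSub(100, 200) yields 0 rather than
// wrapping to 156.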
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)}; |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a, |
| const Vec128<int8_t, N> b) { |
| return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ Average |
| |
| // Returns (a + b + 1) / 2 |
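// For example, AverageRound(3, 4) = 4: the 0.5 rounds up.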
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Vec128<uint8_t, N>{wasm_u8x16_avgr(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{wasm_u16x8_avgr(a.raw, b.raw)}; |
| } |
| |
| template <class V, HWY_IF_SIGNED_V(V), |
| HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> |
| HWY_API V AverageRound(V a, V b) { |
| const DFromV<decltype(a)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const V sign_bit = SignBit(d); |
| return Xor(BitCast(d, AverageRound(BitCast(du, Xor(a, sign_bit)), |
| BitCast(du, Xor(b, sign_bit)))), |
| sign_bit); |
| } |
| |
| // ------------------------------ Absolute value |
| |
| // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. |
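// For example, for int8_t, Abs(-128) remains -128 because +128 is not
// representable.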
| template <size_t N> |
| HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) { |
| return Vec128<int8_t, N>{wasm_i8x16_abs(v.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) { |
| return Vec128<int16_t, N>{wasm_i16x8_abs(v.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) { |
| return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) { |
| return Vec128<int64_t, N>{wasm_i64x2_abs(v.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) { |
| return Vec128<float, N>{wasm_f32x4_abs(v.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Abs(const Vec128<double, N> v) { |
| return Vec128<double, N>{wasm_f64x2_abs(v.raw)}; |
| } |
| |
| // ------------------------------ Shift lanes by constant #bits |
| |
| // Unsigned |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) { |
| return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, kBits)}; |
| } |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) { |
| return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, kBits)}; |
| } |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) { |
| return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, kBits)}; |
| } |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) { |
| return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, kBits)}; |
| } |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) { |
| return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, kBits)}; |
| } |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) { |
| return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, kBits)}; |
| } |
| |
| // Signed |
| template <int kBits, size_t N> |
| HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) { |
| return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, kBits)}; |
| } |
| template <int kBits, size_t N> |
| HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) { |
| return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, kBits)}; |
| } |
| template <int kBits, size_t N> |
| HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) { |
| return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, kBits)}; |
| } |
| template <int kBits, size_t N> |
| HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) { |
| return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, kBits)}; |
| } |
| template <int kBits, size_t N> |
| HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) { |
| return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)}; |
| } |
| template <int kBits, size_t N> |
| HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) { |
| return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, kBits)}; |
| } |
| |
| // 8-bit |
| template <int kBits, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) { |
| const DFromV<decltype(v)> d8; |
| // Use raw instead of BitCast to support N=1. |
| const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw}; |
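  // The 16-bit shift lets bits cross into the neighboring byte, so mask off
  // the bits shifted in from below. For kBits == 1, v + v is equivalent and
  // avoids the mask.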
| return kBits == 1 |
| ? (v + v) |
| : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF))); |
| } |
| |
| template <int kBits, size_t N> |
| HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) { |
| const DFromV<decltype(v)> d8; |
| // Use raw instead of BitCast to support N=1. |
| const Vec128<uint8_t, N> shifted{ |
| ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw}; |
| return shifted & Set(d8, 0xFF >> kBits); |
| } |
| |
| template <int kBits, size_t N> |
| HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) { |
| const DFromV<decltype(v)> di; |
| const RebindToUnsigned<decltype(di)> du; |
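  // Shift as unsigned, then sign-extend: XOR with the shifted-down sign bit
  // and subtract it to propagate the sign into the vacated upper bits.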
| const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v))); |
| const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); |
| return (shifted ^ shifted_sign) - shifted_sign; |
| } |
| |
| // ------------------------------ RotateRight (ShiftRight, Or) |
| template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> |
| HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| |
| constexpr size_t kSizeInBits = sizeof(T) * 8; |
| static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); |
| |
| if (kBits == 0) return v; |
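  // HWY_MIN clamps the left-shift count to a valid value; the kBits == 0 case
  // (which would require shifting by kSizeInBits) already returned above.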
| return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))), |
| ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v)); |
| } |
| |
| // ------------------------------ Shift lanes by same variable #bits |
| |
// After https://reviews.llvm.org/D108415, the shift argument became unsigned.
| HWY_DIAGNOSTICS(push) |
| HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") |
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v, |
| const int bits) { |
| return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, bits)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v, |
| const int bits) { |
| return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, bits)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v, |
| const int bits) { |
| return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, bits)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v, |
| const int bits) { |
| return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v, |
| const int bits) { |
| return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, bits)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v, |
| const int bits) { |
| return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, bits)}; |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v, |
| const int bits) { |
| return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, bits)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v, |
| const int bits) { |
| return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, bits)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v, |
| const int bits) { |
| return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, bits)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v, |
| const int bits) { |
| return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, bits)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v, |
| const int bits) { |
| return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, bits)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v, |
| const int bits) { |
| return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, bits)}; |
| } |
| |
| // 8-bit |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) { |
| const DFromV<decltype(v)> d8; |
| // Use raw instead of BitCast to support N=1. |
| const Vec128<T, N> shifted{ |
| ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw}; |
| return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF)); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v, |
| const int bits) { |
| const DFromV<decltype(v)> d8; |
| // Use raw instead of BitCast to support N=1. |
| const Vec128<uint8_t, N> shifted{ |
| ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw}; |
| return shifted & Set(d8, 0xFF >> bits); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) { |
| const DFromV<decltype(v)> di; |
| const RebindToUnsigned<decltype(di)> du; |
| const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); |
| const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); |
| return (shifted ^ shifted_sign) - shifted_sign; |
| } |
| |
// End of -Wsign-conversion suppression.
| HWY_DIAGNOSTICS(pop) |
| |
| // ------------------------------ Minimum |
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { |
| return Vec128<uint8_t, N>{wasm_u8x16_min(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{wasm_u16x8_min(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) { |
| return Vec128<uint32_t, N>{wasm_u32x4_min(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) { |
| // Avoid wasm_u64x2_extract_lane - not all implementations have it yet. |
| const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)); |
| const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)); |
| const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)); |
| const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)); |
| alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)}; |
| return Vec128<uint64_t, N>{wasm_v128_load(min)}; |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { |
| return Vec128<int8_t, N>{wasm_i8x16_min(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{wasm_i16x8_min(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) { |
| return Vec128<int32_t, N>{wasm_i32x4_min(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) { |
  alignas(16) int64_t min[2];
| min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0), |
| wasm_i64x2_extract_lane(b.raw, 0)); |
| min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1), |
| wasm_i64x2_extract_lane(b.raw, 1)); |
| return Vec128<int64_t, N>{wasm_v128_load(min)}; |
| } |
| |
| // Float |
| template <size_t N> |
| HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) { |
| // Equivalent to a < b ? a : b (taking into account our swapped arg order, |
| // so that Min(NaN, x) is x to match x86). |
| return Vec128<float, N>{wasm_f32x4_pmin(b.raw, a.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Min(Vec128<double, N> a, Vec128<double, N> b) { |
| // Equivalent to a < b ? a : b (taking into account our swapped arg order, |
| // so that Min(NaN, x) is x to match x86). |
| return Vec128<double, N>{wasm_f64x2_pmin(b.raw, a.raw)}; |
| } |
| |
| // ------------------------------ Maximum |
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { |
| return Vec128<uint8_t, N>{wasm_u8x16_max(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{wasm_u16x8_max(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) { |
| return Vec128<uint32_t, N>{wasm_u32x4_max(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) { |
| // Avoid wasm_u64x2_extract_lane - not all implementations have it yet. |
| const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)); |
| const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)); |
| const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)); |
| const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)); |
| alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)}; |
| return Vec128<uint64_t, N>{wasm_v128_load(max)}; |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { |
| return Vec128<int8_t, N>{wasm_i8x16_max(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{wasm_i16x8_max(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) { |
| return Vec128<int32_t, N>{wasm_i32x4_max(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) { |
| alignas(16) int64_t max[2]; |
| max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0), |
| wasm_i64x2_extract_lane(b.raw, 0)); |
| max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1), |
| wasm_i64x2_extract_lane(b.raw, 1)); |
| return Vec128<int64_t, N>{wasm_v128_load(max)}; |
| } |
| |
| // Float |
| template <size_t N> |
| HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) { |
| // Equivalent to b < a ? a : b (taking into account our swapped arg order, |
| // so that Max(NaN, x) is x to match x86). |
| return Vec128<float, N>{wasm_f32x4_pmax(b.raw, a.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) { |
| // Equivalent to b < a ? a : b (taking into account our swapped arg order, |
| // so that Max(NaN, x) is x to match x86). |
| return Vec128<double, N>{wasm_f64x2_pmax(b.raw, a.raw)}; |
| } |
| |
| // ------------------------------ Integer multiplication |
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{wasm_i16x8_mul(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| return Vec128<uint32_t, N>{wasm_i32x4_mul(a.raw, b.raw)}; |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{wasm_i16x8_mul(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)}; |
| } |
| |
| // Returns the upper sizeof(T)*8 bits of a * b in each lane. |
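// For example, for uint16_t lanes this is (uint32_t{a} * b) >> 16.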
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> MulHigh(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| const auto l = wasm_u16x8_extmul_low_u8x16(a.raw, b.raw); |
| const auto h = wasm_u16x8_extmul_high_u8x16(a.raw, b.raw); |
| // TODO(eustas): shift-right + narrow? |
| return Vec128<uint8_t, N>{wasm_i8x16_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15, |
| 17, 19, 21, 23, 25, 27, 29, 31)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> MulHigh(const Vec128<int8_t, N> a, |
| const Vec128<int8_t, N> b) { |
| const auto l = wasm_i16x8_extmul_low_i8x16(a.raw, b.raw); |
| const auto h = wasm_i16x8_extmul_high_i8x16(a.raw, b.raw); |
| // TODO(eustas): shift-right + narrow? |
| return Vec128<int8_t, N>{wasm_i8x16_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15, |
| 17, 19, 21, 23, 25, 27, 29, 31)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| const auto l = wasm_u32x4_extmul_low_u16x8(a.raw, b.raw); |
| const auto h = wasm_u32x4_extmul_high_u16x8(a.raw, b.raw); |
| // TODO(eustas): shift-right + narrow? |
| return Vec128<uint16_t, N>{ |
| wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| const auto l = wasm_i32x4_extmul_low_i16x8(a.raw, b.raw); |
| const auto h = wasm_i32x4_extmul_high_i16x8(a.raw, b.raw); |
| // TODO(eustas): shift-right + narrow? |
| return Vec128<int16_t, N>{ |
| wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> MulHigh(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| const auto l = wasm_u64x2_extmul_low_u32x4(a.raw, b.raw); |
| const auto h = wasm_u64x2_extmul_high_u32x4(a.raw, b.raw); |
| // TODO(eustas): shift-right + narrow? |
| return Vec128<uint32_t, N>{wasm_i32x4_shuffle(l, h, 1, 3, 5, 7)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> MulHigh(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| const auto l = wasm_i64x2_extmul_low_i32x4(a.raw, b.raw); |
| const auto h = wasm_i64x2_extmul_high_i32x4(a.raw, b.raw); |
| // TODO(eustas): shift-right + narrow? |
| return Vec128<int32_t, N>{wasm_i32x4_shuffle(l, h, 1, 3, 5, 7)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a, |
| Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{wasm_i16x8_q15mulr_sat(a.raw, b.raw)}; |
| } |
| |
| // Multiplies even lanes (0, 2 ..) and returns the double-width result. |
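// For example, for uint32_t inputs, lane i of the result is the full uint64_t
// product of lanes a[2 * i] and b[2 * i].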
| template <class T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)), |
| HWY_IF_SIGNED(T)> |
| HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| const DFromV<decltype(a)> d; |
| const RepartitionToWide<decltype(d)> dw; |
| constexpr int kSrcBits = sizeof(T) * 8; |
| |
| const auto ae = |
| ShiftRight<kSrcBits>(ShiftLeft<kSrcBits>(ResizeBitCast(dw, a))); |
| const auto be = |
| ShiftRight<kSrcBits>(ShiftLeft<kSrcBits>(ResizeBitCast(dw, b))); |
| return ae * be; |
| } |
| template <class T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)), |
| HWY_IF_UNSIGNED(T)> |
| HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| const DFromV<decltype(a)> d; |
| const RepartitionToWide<decltype(d)> dw; |
| const auto kEvenMask = Set(dw, LimitsMax<T>()); |
| |
| const auto ae = And(ResizeBitCast(dw, a), kEvenMask); |
| const auto be = And(ResizeBitCast(dw, b), kEvenMask); |
| return ae * be; |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| const DFromV<decltype(a)> d; |
| const RepartitionToWide<decltype(d)> dw; |
| const auto ae = ShiftRight<32>(ShiftLeft<32>(ResizeBitCast(dw, a))).raw; |
| const auto be = ShiftRight<32>(ShiftLeft<32>(ResizeBitCast(dw, b))).raw; |
| return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0); |
| const auto ae = wasm_v128_and(a.raw, kEvenMask); |
| const auto be = wasm_v128_and(b.raw, kEvenMask); |
| return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)}; |
| } |
| |
| // Multiplies odd lanes (1, 3 ..) and returns the double-width result. |
| template <class T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)), |
| HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> |
| HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| const DFromV<decltype(a)> d; |
| const RepartitionToWide<decltype(d)> dw; |
| constexpr int kSrcBits = sizeof(T) * 8; |
| |
| const auto ao = ShiftRight<kSrcBits>(BitCast(dw, a)); |
| const auto bo = ShiftRight<kSrcBits>(BitCast(dw, b)); |
| return ao * bo; |
| } |
| template <class T, size_t N, HWY_IF_UI32(T)> |
| HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| const DFromV<decltype(a)> d; |
| const RepartitionToWide<decltype(d)> dw; |
| |
| const auto ao = ShiftRight<32>(BitCast(dw, a)); |
| const auto bo = ShiftRight<32>(BitCast(dw, b)); |
| return Vec128<MakeWide<T>, (N + 1) / 2>{wasm_i64x2_mul(ao.raw, bo.raw)}; |
| } |
| |
| // ------------------------------ Negate |
| |
| template <typename T, size_t N, HWY_IF_FLOAT_OR_SPECIAL(T)> |
| HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) { |
| return Xor(v, SignBit(DFromV<decltype(v)>())); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> Neg(const Vec128<int8_t, N> v) { |
| return Vec128<int8_t, N>{wasm_i8x16_neg(v.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> Neg(const Vec128<int16_t, N> v) { |
| return Vec128<int16_t, N>{wasm_i16x8_neg(v.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> Neg(const Vec128<int32_t, N> v) { |
| return Vec128<int32_t, N>{wasm_i32x4_neg(v.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> Neg(const Vec128<int64_t, N> v) { |
| return Vec128<int64_t, N>{wasm_i64x2_neg(v.raw)}; |
| } |
| |
| // ------------------------------ Floating-point mul / div |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) { |
| return Vec128<float, N>{wasm_f32x4_mul(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> operator*(Vec128<double, N> a, Vec128<double, N> b) { |
| return Vec128<double, N>{wasm_f64x2_mul(a.raw, b.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> operator/(const Vec128<float, N> a, |
| const Vec128<float, N> b) { |
| return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> operator/(const Vec128<double, N> a, |
| const Vec128<double, N> b) { |
| return Vec128<double, N>{wasm_f64x2_div(a.raw, b.raw)}; |
| } |
| |
| template <class V, HWY_IF_F32(TFromV<V>)> |
| HWY_API V ApproximateReciprocal(const V v) { |
| return Set(DFromV<decltype(v)>(), 1.0f) / v; |
| } |
| |
| // Integer overload defined in generic_ops-inl.h. |
| template <typename T, size_t N, HWY_IF_FLOAT(T)> |
| HWY_API Vec128<T, N> AbsDiff(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return Abs(a - b); |
| } |
| |
| // ------------------------------ Floating-point multiply-add variants |
| |
| template <typename T, size_t N, HWY_IF_FLOAT(T)> |
| HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x, |
| Vec128<T, N> add) { |
| return mul * x + add; |
| } |
| |
| template <typename T, size_t N, HWY_IF_FLOAT(T)> |
| HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x, |
| Vec128<T, N> add) { |
| return add - mul * x; |
| } |
| |
| template <typename T, size_t N, HWY_IF_FLOAT(T)> |
| HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x, |
| Vec128<T, N> sub) { |
| return mul * x - sub; |
| } |
| |
| template <typename T, size_t N, HWY_IF_FLOAT(T)> |
| HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x, |
| Vec128<T, N> sub) { |
| return Neg(mul) * x - sub; |
| } |
| |
| // ------------------------------ Floating-point square root |
| |
| // Full precision square root |
| template <size_t N> |
| HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) { |
| return Vec128<float, N>{wasm_f32x4_sqrt(v.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Sqrt(const Vec128<double, N> v) { |
| return Vec128<double, N>{wasm_f64x2_sqrt(v.raw)}; |
| } |
| |
| // Approximate reciprocal square root |
| template <class V, HWY_IF_F32(TFromV<V>)> |
| HWY_API V ApproximateReciprocalSqrt(V v) { |
  // TODO(eustas): find a cheaper way to calculate this.
| return Set(DFromV<decltype(v)>(), static_cast<TFromV<V>>(1.0)) / Sqrt(v); |
| } |
| |
| // ------------------------------ Floating-point rounding |
| |
| // Toward nearest integer, ties to even |
| template <size_t N> |
| HWY_API Vec128<float, N> Round(const Vec128<float, N> v) { |
| return Vec128<float, N>{wasm_f32x4_nearest(v.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Round(const Vec128<double, N> v) { |
| return Vec128<double, N>{wasm_f64x2_nearest(v.raw)}; |
| } |
| |
| // Toward zero, aka truncate |
| template <size_t N> |
| HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) { |
| return Vec128<float, N>{wasm_f32x4_trunc(v.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) { |
| return Vec128<double, N>{wasm_f64x2_trunc(v.raw)}; |
| } |
| |
| // Toward +infinity, aka ceiling |
| template <size_t N> |
| HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) { |
| return Vec128<float, N>{wasm_f32x4_ceil(v.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) { |
| return Vec128<double, N>{wasm_f64x2_ceil(v.raw)}; |
| } |
| |
| // Toward -infinity, aka floor |
| template <size_t N> |
| HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) { |
| return Vec128<float, N>{wasm_f32x4_floor(v.raw)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) { |
| return Vec128<double, N>{wasm_f64x2_floor(v.raw)}; |
| } |
| |
| // ------------------------------ Floating-point classification |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) { |
| return v != v; |
| } |
| |
| template <typename T, size_t N, HWY_IF_FLOAT(T)> |
| HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const VFromD<decltype(du)> vu = BitCast(du, v); |
| // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. |
| return RebindMask(d, Eq(Add(vu, vu), Set(du, hwy::MaxExponentTimes2<T>()))); |
| } |
| |
| // Returns whether normal/subnormal/zero. |
| template <typename T, size_t N, HWY_IF_FLOAT(T)> |
| HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison |
| const VFromD<decltype(du)> vu = BitCast(du, v); |
| // 'Shift left' to clear the sign bit, then right so we can compare with the |
| // max exponent (cannot compare with MaxExponentTimes2 directly because it is |
| // negative and non-negative floats would be greater). |
| const VFromD<decltype(di)> exp = |
| BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu))); |
| return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>()))); |
| } |
| |
| // ================================================== COMPARE |
| |
| // Comparisons fill a lane with 1-bits if the condition is true, else 0. |
| |
| // Mask and Vec are the same (true = FF..FF). |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) { |
| return Mask128<T, N>{v.raw}; |
| } |
| |
| template <class D> |
| using MFromD = decltype(MaskFromVec(VFromD<D>())); |
| |
| template <typename TFrom, size_t NFrom, class DTo> |
| HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) { |
| static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size"); |
| return MFromD<DTo>{m.raw}; |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) { |
| static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); |
| return (v & bit) == bit; |
| } |
| |
| // ------------------------------ Equality |
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Mask128<uint8_t, N>{wasm_i8x16_eq(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Mask128<uint16_t, N>{wasm_i16x8_eq(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| return Mask128<uint32_t, N>{wasm_i32x4_eq(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a, |
| const Vec128<uint64_t, N> b) { |
| return Mask128<uint64_t, N>{wasm_i64x2_eq(a.raw, b.raw)}; |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a, |
| const Vec128<int8_t, N> b) { |
| return Mask128<int8_t, N>{wasm_i8x16_eq(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a, |
| Vec128<int16_t, N> b) { |
| return Mask128<int16_t, N>{wasm_i16x8_eq(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| return Mask128<int32_t, N>{wasm_i32x4_eq(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a, |
| const Vec128<int64_t, N> b) { |
| return Mask128<int64_t, N>{wasm_i64x2_eq(a.raw, b.raw)}; |
| } |
| |
| // Float |
| template <size_t N> |
| HWY_API Mask128<float, N> operator==(const Vec128<float, N> a, |
| const Vec128<float, N> b) { |
| return Mask128<float, N>{wasm_f32x4_eq(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<double, N> operator==(const Vec128<double, N> a, |
| const Vec128<double, N> b) { |
| return Mask128<double, N>{wasm_f64x2_eq(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ Inequality |
| |
| // Unsigned |
| template <size_t N> |
| HWY_API Mask128<uint8_t, N> operator!=(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Mask128<uint8_t, N>{wasm_i8x16_ne(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint16_t, N> operator!=(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Mask128<uint16_t, N>{wasm_i16x8_ne(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint32_t, N> operator!=(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| return Mask128<uint32_t, N>{wasm_i32x4_ne(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint64_t, N> operator!=(const Vec128<uint64_t, N> a, |
| const Vec128<uint64_t, N> b) { |
| return Mask128<uint64_t, N>{wasm_i64x2_ne(a.raw, b.raw)}; |
| } |
| |
| // Signed |
| template <size_t N> |
| HWY_API Mask128<int8_t, N> operator!=(const Vec128<int8_t, N> a, |
| const Vec128<int8_t, N> b) { |
| return Mask128<int8_t, N>{wasm_i8x16_ne(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int16_t, N> operator!=(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Mask128<int16_t, N>{wasm_i16x8_ne(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int32_t, N> operator!=(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| return Mask128<int32_t, N>{wasm_i32x4_ne(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int64_t, N> operator!=(const Vec128<int64_t, N> a, |
| const Vec128<int64_t, N> b) { |
| return Mask128<int64_t, N>{wasm_i64x2_ne(a.raw, b.raw)}; |
| } |
| |
| // Float |
| template <size_t N> |
| HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a, |
| const Vec128<float, N> b) { |
| return Mask128<float, N>{wasm_f32x4_ne(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<double, N> operator!=(const Vec128<double, N> a, |
| const Vec128<double, N> b) { |
| return Mask128<double, N>{wasm_f64x2_ne(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ Strict inequality |
| |
| template <size_t N> |
| HWY_API Mask128<int8_t, N> operator>(const Vec128<int8_t, N> a, |
| const Vec128<int8_t, N> b) { |
| return Mask128<int8_t, N>{wasm_i8x16_gt(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int16_t, N> operator>(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Mask128<int16_t, N>{wasm_i16x8_gt(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int32_t, N> operator>(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| return Mask128<int32_t, N>{wasm_i32x4_gt(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a, |
| const Vec128<int64_t, N> b) { |
| return Mask128<int64_t, N>{wasm_i64x2_gt(a.raw, b.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Mask128<uint8_t, N> operator>(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Mask128<uint8_t, N>{wasm_u8x16_gt(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint16_t, N> operator>(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Mask128<uint16_t, N>{wasm_u16x8_gt(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint32_t, N> operator>(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| return Mask128<uint32_t, N>{wasm_u32x4_gt(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint64_t, N> operator>(const Vec128<uint64_t, N> a, |
| const Vec128<uint64_t, N> b) { |
| const DFromV<decltype(a)> d; |
| const Repartition<uint32_t, decltype(d)> d32; |
| const auto a32 = BitCast(d32, a); |
| const auto b32 = BitCast(d32, b); |
| // If the upper halves are not equal, this is the answer. |
| const auto m_gt = a32 > b32; |
| |
| // Otherwise, the lower half decides. |
| const auto m_eq = a32 == b32; |
| const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2); |
| const auto lo_gt = And(m_eq, MaskFromVec(VFromD<decltype(d32)>{lo_in_hi})); |
| |
| const auto gt = Or(lo_gt, m_gt); |
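  // The upper 32 bits of each lane now hold hi_gt | (hi_eq & lo_gt), which is
  // the 64-bit comparison result.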
  // Copy the result from the upper 32 bits to the lower 32 bits of each lane.
| return Mask128<uint64_t, N>{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)}; |
| } |
| |
| template <size_t N> |
| HWY_API Mask128<float, N> operator>(const Vec128<float, N> a, |
| const Vec128<float, N> b) { |
| return Mask128<float, N>{wasm_f32x4_gt(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<double, N> operator>(const Vec128<double, N> a, |
| const Vec128<double, N> b) { |
| return Mask128<double, N>{wasm_f64x2_gt(a.raw, b.raw)}; |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return operator>(b, a); |
| } |
| |
| // ------------------------------ Weak inequality |
| |
| // Float >= |
| template <size_t N> |
| HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a, |
| const Vec128<float, N> b) { |
| return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<double, N> operator>=(const Vec128<double, N> a, |
| const Vec128<double, N> b) { |
| return Mask128<double, N>{wasm_f64x2_ge(a.raw, b.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Mask128<int8_t, N> operator>=(const Vec128<int8_t, N> a, |
| const Vec128<int8_t, N> b) { |
| return Mask128<int8_t, N>{wasm_i8x16_ge(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int16_t, N> operator>=(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Mask128<int16_t, N>{wasm_i16x8_ge(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int32_t, N> operator>=(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| return Mask128<int32_t, N>{wasm_i32x4_ge(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<int64_t, N> operator>=(const Vec128<int64_t, N> a, |
| const Vec128<int64_t, N> b) { |
| return Mask128<int64_t, N>{wasm_i64x2_ge(a.raw, b.raw)}; |
| } |
| |
| template <size_t N> |
| HWY_API Mask128<uint8_t, N> operator>=(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Mask128<uint8_t, N>{wasm_u8x16_ge(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint16_t, N> operator>=(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Mask128<uint16_t, N>{wasm_u16x8_ge(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint32_t, N> operator>=(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| return Mask128<uint32_t, N>{wasm_u32x4_ge(a.raw, b.raw)}; |
| } |
| template <size_t N> |
| HWY_API Mask128<uint64_t, N> operator>=(const Vec128<uint64_t, N> a, |
| const Vec128<uint64_t, N> b) { |
| return Not(b > a); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> operator<=(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return operator>=(b, a); |
| } |
| |
| // ------------------------------ FirstN (Iota, Lt) |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API MFromD<D> FirstN(D d, size_t num) { |
| const RebindToSigned<decltype(d)> di; // Signed comparisons may be cheaper. |
| using TI = TFromD<decltype(di)>; |
| return RebindMask(d, Iota(di, 0) < Set(di, static_cast<TI>(num))); |
| } |
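| |
| // Illustration: on a 4-lane d, FirstN(d, 2) is true for lanes 0 and 1 and |
| // false for lanes 2 and 3; num >= Lanes(d) yields an all-true mask. |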
| |
| // ================================================== LOGICAL |
| |
| // ------------------------------ Not |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Not(Vec128<T, N> v) { |
| return Vec128<T, N>{wasm_v128_not(v.raw)}; |
| } |
| |
| // ------------------------------ And |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ AndNot |
| |
| // Returns ~not_mask & mask. |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) { |
| return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)}; |
| } |
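| |
| // For example, AndNot(SignBit(d), v) clears the sign bit of every lane, |
| // which is one way to compute the absolute value of float lanes. |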
| |
| // ------------------------------ Or |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ Xor |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) { |
| return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)}; |
| } |
| |
| // ------------------------------ Xor3 |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) { |
| return Xor(x1, Xor(x2, x3)); |
| } |
| |
| // ------------------------------ Or3 |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) { |
| return Or(o1, Or(o2, o3)); |
| } |
| |
| // ------------------------------ OrAnd |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) { |
| return Or(o, And(a1, a2)); |
| } |
| |
| // ------------------------------ IfVecThenElse |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes, |
| Vec128<T, N> no) { |
| return IfThenElse(MaskFromVec(mask), yes, no); |
| } |
| |
| // ------------------------------ Operator overloads (internal-only if float) |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return And(a, b); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return Or(a, b); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return Xor(a, b); |
| } |
| |
| // ------------------------------ CopySign |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn, |
| const Vec128<T, N> sign) { |
| static_assert(IsFloat<T>(), "Only makes sense for floating-point"); |
| const DFromV<decltype(magn)> d; |
| return BitwiseIfThenElse(SignBit(d), sign, magn); |
| } |
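| |
| // Example: CopySign(Set(d, 1.0f), Set(d, -2.0f)) yields -1.0f in every |
| // lane: the magnitude comes from the first argument, the sign from the |
| // second. |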
| |
| // ------------------------------ CopySignToAbs |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs, |
| const Vec128<T, N> sign) { |
| static_assert(IsFloat<T>(), "Only makes sense for floating-point"); |
| const DFromV<decltype(abs)> d; |
| return OrAnd(abs, SignBit(d), sign); |
| } |
| |
| // ------------------------------ BroadcastSignBit (compare) |
| |
| template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) { |
| return ShiftRight<sizeof(T) * 8 - 1>(v); |
| } |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) { |
| const DFromV<decltype(v)> d; |
| return VecFromMask(d, v < Zero(d)); |
| } |
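| |
| // Example: for int32_t lanes, an input of -5 (0xFFFFFFFB) becomes |
| // 0xFFFFFFFF (-1) and an input of +5 becomes 0, i.e. each lane is replaced |
| // by 0 or -1 according to its sign. |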
| |
| // ------------------------------ Mask |
| |
| template <class D> |
| HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) { |
| return VFromD<D>{v.raw}; |
| } |
| |
| // mask ? yes : no |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes, |
| Vec128<T, N> no) { |
| return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)}; |
| } |
| |
| // mask ? yes : 0 |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) { |
| return yes & VecFromMask(DFromV<decltype(yes)>(), mask); |
| } |
| |
| // mask ? 0 : no |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) { |
| return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no); |
| } |
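| |
| // Usage sketch: given m = (v < Zero(d)) for signed lanes, |
| // IfThenElseZero(m, v) keeps the negative lanes and zeros the rest, while |
| // IfThenZeroElse(m, v) zeros the negative lanes and keeps the rest. |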
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes, |
| Vec128<T, N> no) { |
| static_assert(IsSigned<T>(), "Only works for signed/float"); |
| const DFromV<decltype(v)> d; |
| const RebindToSigned<decltype(d)> di; |
| |
| v = BitCast(d, BroadcastSignBit(BitCast(di, v))); |
| return IfThenElse(MaskFromVec(v), yes, no); |
| } |
| |
| // ------------------------------ Mask logical |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> Not(const Mask128<T, N> m) { |
| const DFromM<decltype(m)> d; |
| return MaskFromVec(Not(VecFromMask(d, m))); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) { |
| const DFromM<decltype(a)> d; |
| return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) { |
| const DFromM<decltype(a)> d; |
| return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) { |
| const DFromM<decltype(a)> d; |
| return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) { |
| const DFromM<decltype(a)> d; |
| return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) { |
| const DFromM<decltype(a)> d; |
| return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); |
| } |
| |
| // ------------------------------ Shl (BroadcastSignBit, IfThenElse) |
| |
| // The x86 multiply-by-Pow2() trick will not work because WASM saturates |
| // float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a |
| // scalar count operand, per-lane shift instructions would require extract_lane |
| // for each lane, and hoping that shuffle is correctly mapped to a native |
| // instruction. Using non-vector shifts would incur a store-load forwarding |
| // stall when loading the result vector. We instead test bits of the shift |
| // count to "predicate" a shift of the entire vector by a constant. |
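| // |
| // Scalar sketch of the same idea (illustrative only): to compute x << count |
| // for an 8-bit lane with count in [0, 8) without a variable shift: |
| //   if (count & 4) x <<= 4; |
| //   if (count & 2) x <<= 2; |
| //   if (count & 1) x <<= 1; |
| // Below, each "if" becomes an IfThenElse whose mask is one bit of the |
| // per-lane count, broadcast across the lane via BroadcastSignBit. |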
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) { |
| const DFromV<decltype(v)> d; |
| Mask128<T, N> mask; |
| // Need a signed type for BroadcastSignBit. |
| auto test = BitCast(RebindToSigned<decltype(d)>(), bits); |
| // Move the highest valid bit of the shift count into the sign bit. |
| test = ShiftLeft<5>(test); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftLeft<4>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftLeft<2>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| return IfThenElse(mask, ShiftLeft<1>(v), v); |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), |
| HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> |
| HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) { |
| const DFromV<decltype(v)> d; |
| Mask128<T, N> mask; |
| // Need a signed type for BroadcastSignBit. |
| auto test = BitCast(RebindToSigned<decltype(d)>(), bits); |
| // Move the highest valid bit of the shift count into the sign bit. |
| test = ShiftLeft<12>(test); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftLeft<8>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftLeft<4>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftLeft<2>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| return IfThenElse(mask, ShiftLeft<1>(v), v); |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI32(T)> |
| HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) { |
| const DFromV<decltype(v)> d; |
| Mask128<T, N> mask; |
| // Need a signed type for BroadcastSignBit. |
| auto test = BitCast(RebindToSigned<decltype(d)>(), bits); |
| // Move the highest valid bit of the shift count into the sign bit. |
| test = ShiftLeft<27>(test); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftLeft<16>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftLeft<8>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftLeft<4>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftLeft<2>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| return IfThenElse(mask, ShiftLeft<1>(v), v); |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI64(T)> |
| HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| using TU = MakeUnsigned<T>; |
| alignas(16) TU lanes[2] = {}; |
| alignas(16) TU bits_lanes[2] = {}; |
| Store(BitCast(du, v), du, lanes); |
| Store(BitCast(du, bits), du, bits_lanes); |
| lanes[0] <<= (bits_lanes[0] & 63); |
| lanes[1] <<= (bits_lanes[1] & 63); |
| return BitCast(d, Load(du, lanes)); |
| } |
| |
| // ------------------------------ Shr (BroadcastSignBit, IfThenElse) |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) { |
| const DFromV<decltype(v)> d; |
| Mask128<T, N> mask; |
| // Need a signed type for BroadcastSignBit. |
| auto test = BitCast(RebindToSigned<decltype(d)>(), bits); |
| // Move the highest valid bit of the shift count into the sign bit. |
| test = ShiftLeft<5>(test); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftRight<4>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftRight<2>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| return IfThenElse(mask, ShiftRight<1>(v), v); |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), |
| HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> |
| HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) { |
| const DFromV<decltype(v)> d; |
| Mask128<T, N> mask; |
| // Need a signed type for BroadcastSignBit. |
| auto test = BitCast(RebindToSigned<decltype(d)>(), bits); |
| // Move the highest valid bit of the shift count into the sign bit. |
| test = ShiftLeft<12>(test); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftRight<8>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftRight<4>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftRight<2>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| return IfThenElse(mask, ShiftRight<1>(v), v); |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI32(T)> |
| HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) { |
| const DFromV<decltype(v)> d; |
| Mask128<T, N> mask; |
| // Need a signed type for BroadcastSignBit. |
| auto test = BitCast(RebindToSigned<decltype(d)>(), bits); |
| // Move the highest valid bit of the shift count into the sign bit. |
| test = ShiftLeft<27>(test); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftRight<16>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftRight<8>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftRight<4>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| test = ShiftLeft<1>(test); // next bit (descending order) |
| v = IfThenElse(mask, ShiftRight<2>(v), v); |
| |
| mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); |
| return IfThenElse(mask, ShiftRight<1>(v), v); |
| } |
| |
| template <typename T, size_t N, HWY_IF_UI64(T)> |
| HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) { |
| const DFromV<decltype(v)> d; |
| alignas(16) T lanes[2] = {}; |
| alignas(16) T bits_lanes[2] = {}; |
| Store(v, d, lanes); |
| Store(bits, d, bits_lanes); |
| lanes[0] >>= (bits_lanes[0] & 63); |
| lanes[1] >>= (bits_lanes[1] & 63); |
| return Load(d, lanes); |
| } |
| |
| // ================================================== MEMORY |
| |
| // ------------------------------ Load |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>> |
| HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) { |
| return Vec128<T>{wasm_v128_load(aligned)}; |
| } |
| |
| // Partial |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) { |
| VFromD<D> v; |
| CopyBytes<d.MaxBytes()>(p, &v); |
| return v; |
| } |
| |
| // LoadU == Load. |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) { |
| return Load(d, p); |
| } |
| |
| // 128-bit SIMD => nothing to duplicate, same as an unaligned load. |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) { |
| return Load(d, p); |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const T* HWY_RESTRICT aligned) { |
| return IfThenElseZero(m, Load(d, aligned)); |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d, |
| const T* HWY_RESTRICT aligned) { |
| return IfThenElse(m, Load(d, aligned), v); |
| } |
| |
| // ------------------------------ Store |
| |
| namespace detail { |
| |
| template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_INLINE T ExtractLane(const Vec128<T, N> v) { |
| return static_cast<T>(wasm_i8x16_extract_lane(v.raw, kLane)); |
| } |
| template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2), |
| HWY_IF_NOT_SPECIAL_FLOAT(T)> |
| HWY_INLINE T ExtractLane(const Vec128<T, N> v) { |
| const int16_t lane = wasm_i16x8_extract_lane(v.raw, kLane); |
| return static_cast<T>(lane); |
| } |
| template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2), |
| HWY_IF_SPECIAL_FLOAT(T)> |
| HWY_INLINE T ExtractLane(const Vec128<T, N> v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| |
| const uint16_t bits = ExtractLane<kLane>(BitCast(du, v)); |
| return BitCastScalar<T>(bits); |
| } |
| template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)> |
| HWY_INLINE T ExtractLane(const Vec128<T, N> v) { |
| return static_cast<T>(wasm_i32x4_extract_lane(v.raw, kLane)); |
| } |
| template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)> |
| HWY_INLINE T ExtractLane(const Vec128<T, N> v) { |
| return static_cast<T>(wasm_i64x2_extract_lane(v.raw, kLane)); |
| } |
| |
| template <size_t kLane, size_t N> |
| HWY_INLINE float ExtractLane(const Vec128<float, N> v) { |
| return wasm_f32x4_extract_lane(v.raw, kLane); |
| } |
| template <size_t kLane, size_t N> |
| HWY_INLINE double ExtractLane(const Vec128<double, N> v) { |
| return wasm_f64x2_extract_lane(v.raw, kLane); |
| } |
| |
| } // namespace detail |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16)> |
| HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) { |
| wasm_v128_store(aligned, v.raw); |
| } |
| |
| // Partial |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)> |
| HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) { |
| CopyBytes<d.MaxBytes()>(&v, p); |
| } |
| |
| template <class D, HWY_IF_LANES_D(D, 1)> |
| HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) { |
| *p = detail::ExtractLane<0>(v); |
| } |
| |
| // StoreU == Store. |
| template <class D> |
| HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) { |
| Store(v, d, p); |
| } |
| |
| template <class D> |
| HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d, |
| TFromD<D>* HWY_RESTRICT p) { |
| StoreU(IfThenElse(m, v, LoadU(d, p)), d, p); |
| } |
| |
| // ------------------------------ Non-temporal stores |
| |
| // Same as aligned stores on non-x86. |
| |
| template <class D> |
| HWY_API void Stream(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) { |
| wasm_v128_store(aligned, v.raw); |
| } |
| |
| // ------------------------------ Scatter in generic_ops-inl.h |
| // ------------------------------ Gather in generic_ops-inl.h |
| |
| // ================================================== SWIZZLE |
| |
| // ------------------------------ ExtractLane |
| |
| // One overload per vector length just in case *_extract_lane raise compile |
| // errors if their argument is out of bounds (even if that would never be |
| // reached at runtime). |
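| // |
| // Usage sketch: ExtractLane(v, 2) returns lane 2. If the index is a |
| // compile-time constant (detected via __builtin_constant_p below), a single |
| // extract_lane is used; otherwise the vector is stored to the stack and one |
| // lane is reloaded. |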
| template <typename T> |
| HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) { |
| HWY_DASSERT(i == 0); |
| (void)i; |
| return detail::ExtractLane<0>(v); |
| } |
| |
| template <typename T> |
| HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(i)) { |
| switch (i) { |
| case 0: |
| return detail::ExtractLane<0>(v); |
| case 1: |
| return detail::ExtractLane<1>(v); |
| } |
| } |
| #endif |
| alignas(16) T lanes[2]; |
| Store(v, DFromV<decltype(v)>(), lanes); |
| return lanes[i]; |
| } |
| |
| template <typename T> |
| HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(i)) { |
| switch (i) { |
| case 0: |
| return detail::ExtractLane<0>(v); |
| case 1: |
| return detail::ExtractLane<1>(v); |
| case 2: |
| return detail::ExtractLane<2>(v); |
| case 3: |
| return detail::ExtractLane<3>(v); |
| } |
| } |
| #endif |
| alignas(16) T lanes[4]; |
| Store(v, DFromV<decltype(v)>(), lanes); |
| return lanes[i]; |
| } |
| |
| template <typename T> |
| HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(i)) { |
| switch (i) { |
| case 0: |
| return detail::ExtractLane<0>(v); |
| case 1: |
| return detail::ExtractLane<1>(v); |
| case 2: |
| return detail::ExtractLane<2>(v); |
| case 3: |
| return detail::ExtractLane<3>(v); |
| case 4: |
| return detail::ExtractLane<4>(v); |
| case 5: |
| return detail::ExtractLane<5>(v); |
| case 6: |
| return detail::ExtractLane<6>(v); |
| case 7: |
| return detail::ExtractLane<7>(v); |
| } |
| } |
| #endif |
| alignas(16) T lanes[8]; |
| Store(v, DFromV<decltype(v)>(), lanes); |
| return lanes[i]; |
| } |
| |
| template <typename T> |
| HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(i)) { |
| switch (i) { |
| case 0: |
| return detail::ExtractLane<0>(v); |
| case 1: |
| return detail::ExtractLane<1>(v); |
| case 2: |
| return detail::ExtractLane<2>(v); |
| case 3: |
| return detail::ExtractLane<3>(v); |
| case 4: |
| return detail::ExtractLane<4>(v); |
| case 5: |
| return detail::ExtractLane<5>(v); |
| case 6: |
| return detail::ExtractLane<6>(v); |
| case 7: |
| return detail::ExtractLane<7>(v); |
| case 8: |
| return detail::ExtractLane<8>(v); |
| case 9: |
| return detail::ExtractLane<9>(v); |
| case 10: |
| return detail::ExtractLane<10>(v); |
| case 11: |
| return detail::ExtractLane<11>(v); |
| case 12: |
| return detail::ExtractLane<12>(v); |
| case 13: |
| return detail::ExtractLane<13>(v); |
| case 14: |
| return detail::ExtractLane<14>(v); |
| case 15: |
| return detail::ExtractLane<15>(v); |
| } |
| } |
| #endif |
| alignas(16) T lanes[16]; |
| Store(v, DFromV<decltype(v)>(), lanes); |
| return lanes[i]; |
| } |
| |
| // ------------------------------ GetLane |
| template <typename T, size_t N> |
| HWY_API T GetLane(const Vec128<T, N> v) { |
| return detail::ExtractLane<0>(v); |
| } |
| |
| // ------------------------------ InsertLane |
| |
| namespace detail { |
| |
| template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { |
| static_assert(kLane < N, "Lane index out of bounds"); |
| return Vec128<T, N>{ |
| wasm_i8x16_replace_lane(v.raw, kLane, static_cast<int8_t>(t))}; |
| } |
| |
| template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)> |
| HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { |
| static_assert(kLane < N, "Lane index out of bounds"); |
| return Vec128<T, N>{ |
| wasm_i16x8_replace_lane(v.raw, kLane, BitCastScalar<int16_t>(t))}; |
| } |
| |
| template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)> |
| HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { |
| static_assert(kLane < N, "Lane index out of bounds"); |
| return Vec128<T, N>{ |
| wasm_i32x4_replace_lane(v.raw, kLane, static_cast<int32_t>(t))}; |
| } |
| |
| template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)> |
| HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { |
| static_assert(kLane < N, "Lane index out of bounds"); |
| return Vec128<T, N>{ |
| wasm_i64x2_replace_lane(v.raw, kLane, static_cast<int64_t>(t))}; |
| } |
| |
| template <size_t kLane, size_t N> |
| HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) { |
| static_assert(kLane < N, "Lane index out of bounds"); |
| return Vec128<float, N>{wasm_f32x4_replace_lane(v.raw, kLane, t)}; |
| } |
| |
| template <size_t kLane, size_t N> |
| HWY_INLINE Vec128<double, N> InsertLane(const Vec128<double, N> v, double t) { |
| static_assert(kLane < 2, "Lane index out of bounds"); |
| return Vec128<double, N>{wasm_f64x2_replace_lane(v.raw, kLane, t)}; |
| } |
| |
| } // namespace detail |
| |
| // Requires one overload per vector length because InsertLane<3> may be a |
| // compile error if it calls wasm_f64x2_replace_lane. |
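| // |
| // Usage sketch: InsertLane(v, 1, T{42}) returns a copy of v whose lane 1 is |
| // 42. As with ExtractLane, a constant index maps to a single replace_lane; |
| // otherwise the vector goes through memory. |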
| |
| template <typename T> |
| HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) { |
| HWY_DASSERT(i == 0); |
| (void)i; |
| return Set(DFromV<decltype(v)>(), t); |
| } |
| |
| template <typename T> |
| HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(i)) { |
| switch (i) { |
| case 0: |
| return detail::InsertLane<0>(v, t); |
| case 1: |
| return detail::InsertLane<1>(v, t); |
| } |
| } |
| #endif |
| const DFromV<decltype(v)> d; |
| alignas(16) T lanes[2]; |
| Store(v, d, lanes); |
| lanes[i] = t; |
| return Load(d, lanes); |
| } |
| |
| template <typename T> |
| HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(i)) { |
| switch (i) { |
| case 0: |
| return detail::InsertLane<0>(v, t); |
| case 1: |
| return detail::InsertLane<1>(v, t); |
| case 2: |
| return detail::InsertLane<2>(v, t); |
| case 3: |
| return detail::InsertLane<3>(v, t); |
| } |
| } |
| #endif |
| const DFromV<decltype(v)> d; |
| alignas(16) T lanes[4]; |
| Store(v, d, lanes); |
| lanes[i] = t; |
| return Load(d, lanes); |
| } |
| |
| template <typename T> |
| HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(i)) { |
| switch (i) { |
| case 0: |
| return detail::InsertLane<0>(v, t); |
| case 1: |
| return detail::InsertLane<1>(v, t); |
| case 2: |
| return detail::InsertLane<2>(v, t); |
| case 3: |
| return detail::InsertLane<3>(v, t); |
| case 4: |
| return detail::InsertLane<4>(v, t); |
| case 5: |
| return detail::InsertLane<5>(v, t); |
| case 6: |
| return detail::InsertLane<6>(v, t); |
| case 7: |
| return detail::InsertLane<7>(v, t); |
| } |
| } |
| #endif |
| const DFromV<decltype(v)> d; |
| alignas(16) T lanes[8]; |
| Store(v, d, lanes); |
| lanes[i] = t; |
| return Load(d, lanes); |
| } |
| |
| template <typename T> |
| HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) { |
| #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang |
| if (__builtin_constant_p(i)) { |
| switch (i) { |
| case 0: |
| return detail::InsertLane<0>(v, t); |
| case 1: |
| return detail::InsertLane<1>(v, t); |
| case 2: |
| return detail::InsertLane<2>(v, t); |
| case 3: |
| return detail::InsertLane<3>(v, t); |
| case 4: |
| return detail::InsertLane<4>(v, t); |
| case 5: |
| return detail::InsertLane<5>(v, t); |
| case 6: |
| return detail::InsertLane<6>(v, t); |
| case 7: |
| return detail::InsertLane<7>(v, t); |
| case 8: |
| return detail::InsertLane<8>(v, t); |
| case 9: |
| return detail::InsertLane<9>(v, t); |
| case 10: |
| return detail::InsertLane<10>(v, t); |
| case 11: |
| return detail::InsertLane<11>(v, t); |
| case 12: |
| return detail::InsertLane<12>(v, t); |
| case 13: |
| return detail::InsertLane<13>(v, t); |
| case 14: |
| return detail::InsertLane<14>(v, t); |
| case 15: |
| return detail::InsertLane<15>(v, t); |
| } |
| } |
| #endif |
| const DFromV<decltype(v)> d; |
| alignas(16) T lanes[16]; |
| Store(v, d, lanes); |
| lanes[i] = t; |
| return Load(d, lanes); |
| } |
| |
| // ------------------------------ LowerHalf |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) { |
| return VFromD<D>{v.raw}; |
| } |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) { |
| return Vec128<T, N / 2>{v.raw}; |
| } |
| |
| // ------------------------------ ShiftLeftBytes |
| |
| // 0x01..0F, kBytes = 1 => 0x02..0F00 |
| template <int kBytes, class D> |
| HWY_API VFromD<D> ShiftLeftBytes(D /* tag */, VFromD<D> v) { |
| static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); |
| const __i8x16 zero = wasm_i8x16_splat(0); |
| switch (kBytes) { |
| case 0: |
| return v; |
| |
| case 1: |
| return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6, |
| 7, 8, 9, 10, 11, 12, 13, 14)}; |
| |
| case 2: |
| return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5, |
| 6, 7, 8, 9, 10, 11, 12, 13)}; |
| |
| case 3: |
| return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3, |
| 4, 5, 6, 7, 8, 9, 10, 11, 12)}; |
| |
| case 4: |
| return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2, |
| 3, 4, 5, 6, 7, 8, 9, 10, 11)}; |
| |
| case 5: |
| return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1, |
| 2, 3, 4, 5, 6, 7, 8, 9, 10)}; |
| |
| case 6: |
| return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)}; |
| |
| case 7: |
| return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, |
| 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)}; |
| |
| case 8: |
| return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, |
| 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)}; |
| |
| case 9: |
| return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, |
| 16, 16, 16, 0, 1, 2, 3, 4, 5, 6)}; |
| |
| case 10: |
| return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, |
| 16, 16, 16, 16, 0, 1, 2, 3, 4, 5)}; |
| |
| case 11: |
| return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, |
| 16, 16, 16, 16, 16, 0, 1, 2, 3, 4)}; |
| |
| case 12: |
| return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, |
| 16, 16, 16, 16, 16, 16, 0, 1, 2, 3)}; |
| |
| case 13: |
| return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, |
| 16, 16, 16, 16, 16, 16, 16, 0, 1, 2)}; |
| |
| case 14: |
| return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, |
| 16, 16, 16, 16, 16, 16, 16, 16, 0, |
| 1)}; |
| |
| case 15: |
| return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, |
| 16, 16, 16, 16, 16, 16, 16, 16, 16, |
| 0)}; |
| } |
| return VFromD<D>{zero}; |
| } |
| |
| template <int kBytes, typename T, size_t N> |
| HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) { |
| return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v); |
| } |
| |
| // ------------------------------ ShiftLeftLanes |
| |
| template <int kLanes, class D> |
| HWY_API VFromD<D> ShiftLeftLanes(D d, const VFromD<D> v) { |
| const Repartition<uint8_t, decltype(d)> d8; |
| constexpr size_t kBytes = kLanes * sizeof(TFromD<D>); |
| return BitCast(d, ShiftLeftBytes<kBytes>(BitCast(d8, v))); |
| } |
| |
| template <int kLanes, typename T, size_t N> |
| HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) { |
| return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v); |
| } |
| |
| // ------------------------------ ShiftRightBytes |
| namespace detail { |
| |
| // Helper function allows zeroing invalid lanes in caller. |
| template <int kBytes, typename T, size_t N> |
| HWY_API __i8x16 ShrBytes(const Vec128<T, N> v) { |
| static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); |
| const __i8x16 zero = wasm_i8x16_splat(0); |
| |
| switch (kBytes) { |
| case 0: |
| return v.raw; |
| |
| case 1: |
| return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, |
| 12, 13, 14, 15, 16); |
| |
| case 2: |
| return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, |
| 13, 14, 15, 16, 16); |
| |
| case 3: |
| return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, |
| 13, 14, 15, 16, 16, 16); |
| |
| case 4: |
| return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, |
| 14, 15, 16, 16, 16, 16); |
| |
| case 5: |
| return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, |
| 15, 16, 16, 16, 16, 16); |
| |
| case 6: |
| return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
| 16, 16, 16, 16, 16, 16); |
| |
| case 7: |
| return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
| 16, 16, 16, 16, 16, 16, 16); |
| |
| case 8: |
| return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16, |
| 16, 16, 16, 16, 16, 16, 16); |
| |
| case 9: |
| return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16, |
| 16, 16, 16, 16, 16, 16, 16); |
| |
| case 10: |
| return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16, |
| 16, 16, 16, 16, 16, 16, 16); |
| |
| case 11: |
| return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16, |
| 16, 16, 16, 16, 16, 16, 16); |
| |
| case 12: |
| return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16, |
| 16, 16, 16, 16, 16, 16, 16); |
| |
| case 13: |
| return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16, |
| 16, 16, 16, 16, 16, 16, 16); |
| |
| case 14: |
| return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16, |
| 16, 16, 16, 16, 16, 16, 16); |
| |
| case 15: |
| return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16, |
| 16, 16, 16, 16, 16, 16, 16); |
| case 16: |
| return zero; |
| } |
| } |
| |
| } // namespace detail |
| |
| // 0x01..0F, kBytes = 1 => 0x0001..0E |
| template <int kBytes, class D> |
| HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) { |
| // For partial vectors, clear upper lanes so we shift in zeros. |
| if (d.MaxBytes() != 16) { |
| const Full128<TFromD<D>> dfull; |
| const VFromD<decltype(dfull)> vfull{v.raw}; |
| v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw}; |
| } |
| return VFromD<D>{detail::ShrBytes<kBytes>(v)}; |
| } |
| |
| // ------------------------------ ShiftRightLanes |
| template <int kLanes, class D> |
| HWY_API VFromD<D> ShiftRightLanes(D d, const VFromD<D> v) { |
| const Repartition<uint8_t, decltype(d)> d8; |
| constexpr size_t kBytes = kLanes * sizeof(TFromD<D>); |
| return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v))); |
| } |
| |
| // ------------------------------ UpperHalf (ShiftRightBytes) |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec64<T> UpperHalf(D /* tag */, const Vec128<T> v) { |
| return Vec64<T>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; |
| } |
| |
| // Partial |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) { |
| return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v)); |
| } |
| |
| // ------------------------------ CombineShiftRightBytes |
| |
| template <int kBytes, class D, typename T = TFromD<D>> |
| HWY_API Vec128<T> CombineShiftRightBytes(D /* tag */, Vec128<T> hi, |
| Vec128<T> lo) { |
| static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); |
| switch (kBytes) { |
| case 0: |
| return lo; |
| |
| case 1: |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, |
| 8, 9, 10, 11, 12, 13, 14, 15, 16)}; |
| |
| case 2: |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, |
| 9, 10, 11, 12, 13, 14, 15, 16, 17)}; |
| |
| case 3: |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, |
| 10, 11, 12, 13, 14, 15, 16, 17, 18)}; |
| |
| case 4: |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, |
| 11, 12, 13, 14, 15, 16, 17, 18, 19)}; |
| |
| case 5: |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, |
| 12, 13, 14, 15, 16, 17, 18, 19, 20)}; |
| |
| case 6: |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, |
| 12, 13, 14, 15, 16, 17, 18, 19, 20, |
| 21)}; |
| |
| case 7: |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, |
| 13, 14, 15, 16, 17, 18, 19, 20, 21, |
| 22)}; |
| |
| case 8: |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, |
| 14, 15, 16, 17, 18, 19, 20, 21, 22, |
| 23)}; |
| |
| case 9: |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, |
| 15, 16, 17, 18, 19, 20, 21, 22, 23, |
| 24)}; |
| |
| case 10: |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, |
| 15, 16, 17, 18, 19, 20, 21, 22, 23, |
| 24, 25)}; |
| |
| case 11: |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, |
| 16, 17, 18, 19, 20, 21, 22, 23, 24, |
| 25, 26)}; |
| |
| case 12: |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, |
| 17, 18, 19, 20, 21, 22, 23, 24, 25, |
| 26, 27)}; |
| |
| case 13: |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, |
| 18, 19, 20, 21, 22, 23, 24, 25, 26, |
| 27, 28)}; |
| |
| case 14: |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, |
| 19, 20, 21, 22, 23, 24, 25, 26, 27, |
| 28, 29)}; |
| |
| case 15: |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, |
| 20, 21, 22, 23, 24, 25, 26, 27, 28, |
| 29, 30)}; |
| } |
| return hi; |
| } |
| |
| template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) { |
| constexpr size_t kSize = d.MaxBytes(); |
| static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); |
| const Repartition<uint8_t, decltype(d)> d8; |
| using V8 = Vec128<uint8_t>; |
| const DFromV<V8> dfull8; |
| const Repartition<TFromD<D>, decltype(dfull8)> dfull; |
| const V8 hi8{BitCast(d8, hi).raw}; |
| // Move into most-significant bytes |
| const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw}); |
| const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8); |
| return VFromD<D>{BitCast(dfull, r).raw}; |
| } |
| |
| // ------------------------------ Broadcast/splat any lane |
| |
| template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<T, N>{wasm_i8x16_shuffle( |
| v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane, |
| kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)}; |
| } |
| |
| template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane, |
| kLane, kLane, kLane, kLane, kLane)}; |
| } |
| |
| template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)> |
| HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<T, N>{ |
| wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; |
| } |
| |
| template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)> |
| HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<T, N>{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)}; |
| } |
| |
| // ------------------------------ TableLookupBytes |
| |
| // Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e. |
| // lane indices in [0, 16). |
| template <typename T, size_t N, typename TI, size_t NI> |
| HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes, |
| const Vec128<TI, NI> from) { |
| return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)}; |
| } |
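| |
| // Example: if the bytes of `bytes` are {10, 11, 12, 13, ...} and `from` is |
| // {2, 2, 0, 1, ...}, the result is {12, 12, 10, 11, ...}. |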
| |
| template <typename T, size_t N, typename TI, size_t NI> |
| HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> bytes, |
| const Vec128<TI, NI> from) { |
| const DFromV<decltype(from)> d; |
| // Mask size must match vector type, so cast everything to this type. |
| const Repartition<int8_t, decltype(d)> di8; |
| const Repartition<int8_t, DFromV<decltype(bytes)>> d_bytes8; |
| const auto msb = BitCast(di8, from) < Zero(di8); |
| const auto lookup = |
| TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from)); |
| return BitCast(d, IfThenZeroElse(msb, lookup)); |
| } |
| |
| // ------------------------------ Hard-coded shuffles |
| |
| // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant). |
| // Shuffle0321 rotates one lane to the right (the previous least-significant |
| // lane is now most-significant). These could also be implemented via |
| // CombineShiftRightBytes but the shuffle_abcd notation is more convenient. |
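| // |
| // Example: if v has lanes {3, 2, 1, 0} (most- to least-significant), then |
| // Shuffle0321(v) = {0, 3, 2, 1} and Shuffle2103(v) = {2, 1, 0, 3}. |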
| |
| // Swap 32-bit halves in 64-bit halves. |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) { |
| static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); |
| static_assert(N == 2 || N == 4, "Does not make sense for N=1"); |
| return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; |
| } |
| |
| // These are used by generic_ops-inl to implement LoadInterleaved3. |
| namespace detail { |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> ShuffleTwo2301(const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| static_assert(N == 2 || N == 4, "Does not make sense for N=1"); |
| return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 0, 3 + 16, 2 + 16, |
| 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, |
| 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; |
| } |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec128<T, N> ShuffleTwo2301(const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| static_assert(N == 2 || N == 4, "Does not make sense for N=1"); |
| return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 0, 3 + 8, 2 + 8, |
| 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; |
| } |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> |
| HWY_API Vec128<T, N> ShuffleTwo2301(const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| static_assert(N == 2 || N == 4, "Does not make sense for N=1"); |
| return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> ShuffleTwo1230(const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| static_assert(N == 2 || N == 4, "Does not make sense for N=1"); |
| return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 3, 2 + 16, 1 + 16, |
| 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, |
| 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; |
| } |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec128<T, N> ShuffleTwo1230(const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| static_assert(N == 2 || N == 4, "Does not make sense for N=1"); |
| return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 3, 2 + 8, 1 + 8, |
| 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; |
| } |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> |
| HWY_API Vec128<T, N> ShuffleTwo1230(const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| static_assert(N == 2 || N == 4, "Does not make sense for N=1"); |
| return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> ShuffleTwo3012(const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| static_assert(N == 2 || N == 4, "Does not make sense for N=1"); |
| return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 2, 1, 0 + 16, 3 + 16, |
| 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, |
| 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; |
| } |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec128<T, N> ShuffleTwo3012(const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| static_assert(N == 2 || N == 4, "Does not make sense for N=1"); |
| return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 2, 1, 0 + 8, 3 + 8, |
| 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; |
| } |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> |
| HWY_API Vec128<T, N> ShuffleTwo3012(const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| static_assert(N == 2 || N == 4, "Does not make sense for N=1"); |
| return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)}; |
| } |
| |
| } // namespace detail |
| |
| // Swap 64-bit halves |
| template <typename T> |
| HWY_API Vec128<T> Shuffle01(const Vec128<T> v) { |
| static_assert(sizeof(T) == 8, "Only for 64-bit lanes"); |
| return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; |
| } |
| template <typename T> |
| HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) { |
| static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); |
| return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; |
| } |
| |
| // Rotate right 32 bits |
| template <typename T> |
| HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) { |
| static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); |
| return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; |
| } |
| |
| // Rotate left 32 bits |
| template <typename T> |
| HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) { |
| static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); |
| return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; |
| } |
| |
| // Reverse |
| template <typename T> |
| HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) { |
| static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); |
| return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; |
| } |
| |
| // ------------------------------ TableLookupLanes |
| |
| // Returned by SetTableIndices for use by TableLookupLanes. |
| template <typename T, size_t N = 16 / sizeof(T)> |
| struct Indices128 { |
| __v128_u raw; |
| }; |
| |
| namespace detail { |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 1)> |
| HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( |
| D d) { |
| const Repartition<uint8_t, decltype(d)> d8; |
| return Iota(d8, 0); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 2)> |
| HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( |
| D d) { |
| const Repartition<uint8_t, decltype(d)> d8; |
| alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { |
| 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; |
| return Load(d8, kBroadcastLaneBytes); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 4)> |
| HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( |
| D d) { |
| const Repartition<uint8_t, decltype(d)> d8; |
| alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { |
| 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; |
| return Load(d8, kBroadcastLaneBytes); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 8)> |
| HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( |
| D d) { |
| const Repartition<uint8_t, decltype(d)> d8; |
| alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { |
| 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; |
| return Load(d8, kBroadcastLaneBytes); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 1)> |
| HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) { |
| const Repartition<uint8_t, decltype(d)> d8; |
| return Zero(d8); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 2)> |
| HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) { |
| const Repartition<uint8_t, decltype(d)> d8; |
| alignas(16) static constexpr uint8_t kByteOffsets[16] = { |
| 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; |
| return Load(d8, kByteOffsets); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 4)> |
| HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) { |
| const Repartition<uint8_t, decltype(d)> d8; |
| alignas(16) static constexpr uint8_t kByteOffsets[16] = { |
| 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; |
| return Load(d8, kByteOffsets); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 8)> |
| HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) { |
| const Repartition<uint8_t, decltype(d)> d8; |
| alignas(16) static constexpr uint8_t kByteOffsets[16] = { |
| 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}; |
| return Load(d8, kByteOffsets); |
| } |
| |
| } // namespace detail |
| |
| template <class D, typename TI, HWY_IF_V_SIZE_LE_D(D, 16), |
| HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec( |
| D d, Vec128<TI, MaxLanes(D())> vec) { |
| using T = TFromD<D>; |
| static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); |
| #if HWY_IS_DEBUG_BUILD |
| const RebindToUnsigned<decltype(d)> du; |
| using TU = TFromD<decltype(du)>; |
| HWY_DASSERT(AllTrue( |
| du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2))))); |
| #endif |
| |
| (void)d; |
| return Indices128<TFromD<D>, MaxLanes(D())>{vec.raw}; |
| } |
| |
| template <class D, typename TI, HWY_IF_V_SIZE_LE_D(D, 16), |
| HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))> |
| HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec( |
| D d, Vec128<TI, MaxLanes(D())> vec) { |
| using T = TFromD<D>; |
| static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); |
| #if HWY_IS_DEBUG_BUILD |
| const RebindToUnsigned<decltype(d)> du; |
| using TU = TFromD<decltype(du)>; |
| HWY_DASSERT(AllTrue( |
| du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2))))); |
| #endif |
| |
| const Repartition<uint8_t, decltype(d)> d8; |
| using V8 = VFromD<decltype(d8)>; |
| |
| // Broadcast each lane index to all bytes of T and shift to bytes |
| const V8 lane_indices = TableLookupBytes( |
| BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d)); |
| constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T))); |
| const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices); |
| const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d)); |
| return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d, sum).raw}; |
| } |
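| |
| // Worked example for uint32_t lanes (illustrative): a lane index of 2 is |
| // broadcast to all four bytes of its lane ({2, 2, 2, 2}), shifted left by |
| // FloorLog2(4) = 2 to give {8, 8, 8, 8}, then the byte offsets {0, 1, 2, 3} |
| // are added, yielding byte indices {8, 9, 10, 11}: the bytes of lane 2. |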
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename TI> |
| HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices( |
| D d, const TI* idx) { |
| const Rebind<TI, decltype(d)> di; |
| return IndicesFromVec(d, LoadU(di, idx)); |
| } |
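| |
| // Usage sketch (kIdx is a hypothetical array): with int32_t lanes and |
| //   constexpr int32_t kIdx[4] = {3, 0, 1, 2}; |
| // TableLookupLanes(v, SetTableIndices(d, kIdx)) returns the lanes |
| // {v[3], v[0], v[1], v[2]} (lane 0 listed first). |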
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) { |
| using TI = MakeSigned<T>; |
| const DFromV<decltype(v)> d; |
| const Rebind<TI, decltype(d)> di; |
| return BitCast(d, TableLookupBytes(BitCast(di, v), Vec128<TI, N>{idx.raw})); |
| } |
| |
| template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> |
| HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b, |
| Indices128<T, N> idx) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| // TableLookupLanes currently requires table and index vectors to be the same |
| // size, though a half-length index vector would be sufficient here. |
| #if HWY_IS_MSAN |
| const Vec128<T, N> idx_vec{idx.raw}; |
| const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw}; |
| #else |
| // Only the lower half of idx is guaranteed valid; that is fine because we |
| // only keep the LowerHalf of the result, which uses just those indices. |
| const Indices128<T, N * 2> idx2{idx.raw}; |
| #endif |
| return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2)); |
| } |
| |
| template <typename T> |
| HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, |
| Indices128<T> idx) { |
| const DFromV<decltype(a)> d; |
| const Repartition<uint8_t, decltype(d)> du8; |
| |
| const VFromD<decltype(du8)> byte_idx{idx.raw}; |
| const auto byte_idx_mod = byte_idx & Set(du8, uint8_t{0x0F}); |
| // If ANDing did not change the index, it is for the lower half. |
| const auto is_lo = (byte_idx == byte_idx_mod); |
| |
| return BitCast(d, IfThenElse(is_lo, TableLookupBytes(a, byte_idx_mod), |
| TableLookupBytes(b, byte_idx_mod))); |
| } |
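| |
| // Here idx holds byte indices in [0, 32): values below 16 select from a, |
| // larger values select from b. For example, byte index 20 becomes |
| // 20 & 15 = 4; since the AND changed it, byte 4 of b is chosen. |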
| |
| // ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01) |
| |
| // Single lane: no change |
| template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)> |
| HWY_API Vec128<T, 1> Reverse(D /* tag */, Vec128<T, 1> v) { |
| return v; |
| } |
| |
| // 32-bit x2: shuffle |
| template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> |
| HWY_API Vec64<T> Reverse(D /* tag */, const Vec64<T> v) { |
| return Vec64<T>{Shuffle2301(Vec128<T>{v.raw}).raw}; |
| } |
| |
| // 64-bit x2: shuffle |
| template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> |
| HWY_API Vec128<T> Reverse(D /* tag */, const Vec128<T> v) { |
| return Shuffle01(v); |
| } |
| |
| // 32-bit x4: shuffle |
| template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> |
| HWY_API Vec128<T> Reverse(D /* tag */, const Vec128<T> v) { |
| return Shuffle0123(v); |
| } |
| |
| // 16-bit |
| template <class D, HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) { |
| const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32; |
| return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v)))); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) { |
| static constexpr int kN = 16 + Lanes(d); |
| return VFromD<D>{wasm_i8x16_shuffle( |
| v.raw, v.raw, |
| // kN is adjusted to ensure we have valid indices for all lengths. |
| kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8, kN - 9, |
| kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16)}; |
| } |
| |
| // ------------------------------ Reverse2 |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { |
| const RepartitionToWide<RebindToUnsigned<decltype(d)>> dw; |
| return BitCast(d, RotateRight<16>(BitCast(dw, v))); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 4)> |
| HWY_API VFromD<D> Reverse2(D /* tag */, const VFromD<D> v) { |
| return Shuffle2301(v); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 8)> |
| HWY_API VFromD<D> Reverse2(D /* tag */, const VFromD<D> v) { |
| return Shuffle01(v); |
| } |
| |
| // ------------------------------ Reverse4 |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) { |
| return VFromD<D>{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2, 1, 0, 7, 6, 5, 4)}; |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 4)> |
| HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) { |
| return Shuffle0123(v); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 8)> |
| HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D>) { |
| HWY_ASSERT(0); // don't have 4 u64 lanes |
| } |
| |
| // ------------------------------ Reverse8 |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) { |
| return Reverse(d, v); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> |
| HWY_API VFromD<D> Reverse8(D /* tag */, const VFromD<D>) { |
| HWY_ASSERT(0); // don't have 8 lanes for > 16-bit lanes |
| } |
| |
| // ------------------------------ InterleaveLower |
| |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> InterleaveLower(Vec128<uint8_t, N> a, |
| Vec128<uint8_t, N> b) { |
| return Vec128<uint8_t, N>{wasm_i8x16_shuffle( |
| a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> InterleaveLower(Vec128<uint16_t, N> a, |
| Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{ |
| wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> InterleaveLower(Vec128<uint32_t, N> a, |
| Vec128<uint32_t, N> b) { |
| return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> InterleaveLower(Vec128<uint64_t, N> a, |
| Vec128<uint64_t, N> b) { |
| return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> InterleaveLower(Vec128<int8_t, N> a, |
| Vec128<int8_t, N> b) { |
| return Vec128<int8_t, N>{wasm_i8x16_shuffle( |
| a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> InterleaveLower(Vec128<int16_t, N> a, |
| Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{ |
| wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> InterleaveLower(Vec128<int32_t, N> a, |
| Vec128<int32_t, N> b) { |
| return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> InterleaveLower(Vec128<int64_t, N> a, |
| Vec128<int64_t, N> b) { |
| return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a, |
| Vec128<float, N> b) { |
| return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a, |
| Vec128<double, N> b) { |
| return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; |
| } |
| |
| template <class T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_SPECIAL_FLOAT(T)> |
| HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { |
| const DFromV<decltype(a)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| return BitCast(d, InterleaveLower(BitCast(du, a), BitCast(du, b))); |
| } |
| |
| // Additional overload for the optional tag (all vector lengths). |
| template <class D> |
| HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) { |
| return InterleaveLower(a, b); |
| } |
| |
| // ------------------------------ InterleaveUpper (UpperHalf) |
| |
| // All functions inside detail lack the required D parameter. |
| namespace detail { |
| |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> InterleaveUpper(Vec128<uint8_t, N> a, |
| Vec128<uint8_t, N> b) { |
| return Vec128<uint8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, |
| 26, 11, 27, 12, 28, 13, 29, 14, |
| 30, 15, 31)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint16_t, N> InterleaveUpper(Vec128<uint16_t, N> a, |
| Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>{ |
| wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> InterleaveUpper(Vec128<uint32_t, N> a, |
| Vec128<uint32_t, N> b) { |
| return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N> InterleaveUpper(Vec128<uint64_t, N> a, |
| Vec128<uint64_t, N> b) { |
| return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int8_t, N> InterleaveUpper(Vec128<int8_t, N> a, |
| Vec128<int8_t, N> b) { |
| return Vec128<int8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, |
| 26, 11, 27, 12, 28, 13, 29, 14, |
| 30, 15, 31)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int16_t, N> InterleaveUpper(Vec128<int16_t, N> a, |
| Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>{ |
| wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> InterleaveUpper(Vec128<int32_t, N> a, |
| Vec128<int32_t, N> b) { |
| return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<int64_t, N> InterleaveUpper(Vec128<int64_t, N> a, |
| Vec128<int64_t, N> b) { |
| return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float16_t, N> InterleaveUpper(Vec128<float16_t, N> a, |
| Vec128<float16_t, N> b) { |
| return Vec128<float16_t, N>{ |
| wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; |
| } |
| template <size_t N> |
| HWY_API Vec128<bfloat16_t, N> InterleaveUpper(Vec128<bfloat16_t, N> a, |
| Vec128<bfloat16_t, N> b) { |
| return Vec128<bfloat16_t, N>{ |
| wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> InterleaveUpper(Vec128<float, N> a, |
| Vec128<float, N> b) { |
| return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<double, N> InterleaveUpper(Vec128<double, N> a, |
| Vec128<double, N> b) { |
| return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; |
| } |
| |
| } // namespace detail |
| |
| // Full |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec128<T> InterleaveUpper(D /* tag */, Vec128<T> a, Vec128<T> b) { |
| return detail::InterleaveUpper(a, b); |
| } |
| |
| // Partial |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) { |
| const Half<decltype(d)> d2; |
| return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw}, |
| VFromD<D>{UpperHalf(d2, b).raw}); |
| } |
| |
| // ------------------------------ ZipLower/ZipUpper (InterleaveLower) |
| |
| // Same as Interleave*, except that the return lanes are double-width integers; |
| // this is necessary because the single-lane scalar cannot return two values. |
| template <class V, class DW = RepartitionToWide<DFromV<V>>> |
| HWY_API VFromD<DW> ZipLower(V a, V b) { |
| return BitCast(DW(), InterleaveLower(a, b)); |
| } |
| template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> |
| HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) { |
| return BitCast(dw, InterleaveLower(D(), a, b)); |
| } |
| |
| template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> |
| HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) { |
| return BitCast(dw, InterleaveUpper(D(), a, b)); |
| } |
| |
| // ------------------------------ Per4LaneBlockShuffle |
| namespace detail { |
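
// kIdx3210 packs four 2-bit source indices: within each block of 4 lanes,
// result lane i takes source lane (kIdx3210 >> (2 * i)) & 3. For example,
// 0xE4 (0b11'10'01'00) is the identity and 0x1B (0b00'01'10'11) reverses
// each block.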
| |
| template <size_t kIdx3210, size_t kVectSize, class V, |
| HWY_IF_LANES_LE(kVectSize, 16)> |
| HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/, |
| hwy::SizeTag<1> /*lane_size_tag*/, |
| hwy::SizeTag<kVectSize> /*vect_size_tag*/, |
| V v) { |
| constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3); |
| constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3); |
| constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3); |
| constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3); |
| return V{wasm_i8x16_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3, |
| kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4, |
| kIdx0 + 8, kIdx1 + 8, kIdx2 + 8, kIdx3 + 8, |
| kIdx0 + 12, kIdx1 + 12, kIdx2 + 12, kIdx3 + 12)}; |
| } |
| |
| template <size_t kIdx3210, size_t kVectSize, class V, |
| HWY_IF_LANES_LE(kVectSize, 16)> |
| HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/, |
| hwy::SizeTag<2> /*lane_size_tag*/, |
| hwy::SizeTag<kVectSize> /*vect_size_tag*/, |
| V v) { |
| constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3); |
| constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3); |
| constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3); |
| constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3); |
| return V{wasm_i16x8_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3, |
| kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4)}; |
| } |
| |
| template <size_t kIdx3210, size_t kVectSize, class V, |
| HWY_IF_LANES_LE(kVectSize, 16)> |
| HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/, |
| hwy::SizeTag<4> /*lane_size_tag*/, |
| hwy::SizeTag<kVectSize> /*vect_size_tag*/, |
| V v) { |
| constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3); |
| constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3); |
| constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3); |
| constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3); |
| return V{wasm_i32x4_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3)}; |
| } |
| |
| } // namespace detail |
| |
| // ------------------------------ SlideUpLanes |
| |
| namespace detail { |
| |
| template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> |
| HWY_INLINE V SlideUpLanes(V v, size_t amt) { |
| const DFromV<decltype(v)> d; |
| const Full64<uint64_t> du64; |
| const auto vu64 = ResizeBitCast(du64, v); |
| return ResizeBitCast( |
| d, ShiftLeftSame(vu64, static_cast<int>(amt * sizeof(TFromV<V>) * 8))); |
| } |
| |
| template <class V, HWY_IF_V_SIZE_V(V, 16)> |
| HWY_INLINE V SlideUpLanes(V v, size_t amt) { |
| const DFromV<decltype(v)> d; |
| const Repartition<uint8_t, decltype(d)> du8; |
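  // Iota starts at uint8_t(0 - amt * sizeof(T)), so the first amt*sizeof(T)
  // byte indices wrap around to >= 0x80 and are zeroed by
  // TableLookupBytesOr0; the remaining lanes select bytes 0, 1, ... of v,
  // i.e. v shifted up by amt lanes.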
| const auto idx = |
| Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromV<V>))); |
| return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx)); |
| } |
| |
| } // namespace detail |
| |
| template <class D, HWY_IF_LANES_D(D, 1)> |
| HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) { |
| return v; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)> |
| HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { |
| #if !HWY_IS_DEBUG_BUILD |
| if (__builtin_constant_p(amt)) { |
| switch (amt) { |
| case 0: |
| return v; |
| case 1: |
| return ShiftLeftLanes<1>(d, v); |
| } |
| } |
| #else |
| (void)d; |
| #endif |
| |
| return detail::SlideUpLanes(v, amt); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)> |
| HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { |
| #if !HWY_IS_DEBUG_BUILD |
| if (__builtin_constant_p(amt)) { |
| switch (amt) { |
| case 0: |
| return v; |
| case 1: |
| return ShiftLeftLanes<1>(d, v); |
| case 2: |
| return ShiftLeftLanes<2>(d, v); |
| case 3: |
| return ShiftLeftLanes<3>(d, v); |
| } |
| } |
| #else |
| (void)d; |
| #endif |
| |
| return detail::SlideUpLanes(v, amt); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)> |
| HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { |
| #if !HWY_IS_DEBUG_BUILD |
| if (__builtin_constant_p(amt)) { |
| switch (amt) { |
| case 0: |
| return v; |
| case 1: |
| return ShiftLeftLanes<1>(d, v); |
| case 2: |
| return ShiftLeftLanes<2>(d, v); |
| case 3: |
| return ShiftLeftLanes<3>(d, v); |
| case 4: |
| return ShiftLeftLanes<4>(d, v); |
| case 5: |
| return ShiftLeftLanes<5>(d, v); |
| case 6: |
| return ShiftLeftLanes<6>(d, v); |
| case 7: |
| return ShiftLeftLanes<7>(d, v); |
| } |
| } |
| #else |
| (void)d; |
| #endif |
| |
| return detail::SlideUpLanes(v, amt); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)> |
| HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { |
| #if !HWY_IS_DEBUG_BUILD |
| if (__builtin_constant_p(amt)) { |
| switch (amt) { |
| case 0: |
| return v; |
| case 1: |
| return ShiftLeftLanes<1>(d, v); |
| case 2: |
| return ShiftLeftLanes<2>(d, v); |
| case 3: |
| return ShiftLeftLanes<3>(d, v); |
| case 4: |
| return ShiftLeftLanes<4>(d, v); |
| case 5: |
| return ShiftLeftLanes<5>(d, v); |
| case 6: |
| return ShiftLeftLanes<6>(d, v); |
| case 7: |
| return ShiftLeftLanes<7>(d, v); |
| case 8: |
| return ShiftLeftLanes<8>(d, v); |
| case 9: |
| return ShiftLeftLanes<9>(d, v); |
| case 10: |
| return ShiftLeftLanes<10>(d, v); |
| case 11: |
| return ShiftLeftLanes<11>(d, v); |
| case 12: |
| return ShiftLeftLanes<12>(d, v); |
| case 13: |
| return ShiftLeftLanes<13>(d, v); |
| case 14: |
| return ShiftLeftLanes<14>(d, v); |
| case 15: |
| return ShiftLeftLanes<15>(d, v); |
| } |
| } |
| #else |
| (void)d; |
| #endif |
| |
| return detail::SlideUpLanes(v, amt); |
| } |
| |
| // ------------------------------ SlideDownLanes |
| |
| namespace detail { |
| |
| template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> |
| HWY_INLINE V SlideDownLanes(V v, size_t amt) { |
| const DFromV<decltype(v)> d; |
| const Repartition<UnsignedFromSize<d.MaxBytes()>, decltype(d)> dv; |
| return BitCast(d, |
| ShiftRightSame(BitCast(dv, v), |
| static_cast<int>(amt * sizeof(TFromV<V>) * 8))); |
| } |
| |
| template <class V, HWY_IF_V_SIZE_V(V, 16)> |
| HWY_INLINE V SlideDownLanes(V v, size_t amt) { |
| const DFromV<decltype(v)> d; |
| const Repartition<int8_t, decltype(d)> di8; |
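  // Byte i of the result selects source byte i + amt * sizeof(T); the Or
  // below forces out-of-range indices (> 15) to 0xFF, whose MSB makes
  // TableLookupBytesOr0 zero those lanes.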
| auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>))); |
| idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15}))); |
| return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx)); |
| } |
| |
| } // namespace detail |
| |
| template <class D, HWY_IF_LANES_D(D, 1)> |
| HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) { |
| return v; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)> |
| HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { |
| #if !HWY_IS_DEBUG_BUILD |
| if (__builtin_constant_p(amt)) { |
| switch (amt) { |
| case 0: |
| return v; |
| case 1: |
| return ShiftRightLanes<1>(d, v); |
| } |
| } |
| #else |
| (void)d; |
| #endif |
| |
| return detail::SlideDownLanes(v, amt); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)> |
| HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { |
| #if !HWY_IS_DEBUG_BUILD |
| if (__builtin_constant_p(amt)) { |
| switch (amt) { |
| case 0: |
| return v; |
| case 1: |
| return ShiftRightLanes<1>(d, v); |
| case 2: |
| return ShiftRightLanes<2>(d, v); |
| case 3: |
| return ShiftRightLanes<3>(d, v); |
| } |
| } |
| #else |
| (void)d; |
| #endif |
| |
| return detail::SlideDownLanes(v, amt); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)> |
| HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { |
| #if !HWY_IS_DEBUG_BUILD |
| if (__builtin_constant_p(amt)) { |
| switch (amt) { |
| case 0: |
| return v; |
| case 1: |
| return ShiftRightLanes<1>(d, v); |
| case 2: |
| return ShiftRightLanes<2>(d, v); |
| case 3: |
| return ShiftRightLanes<3>(d, v); |
| case 4: |
| return ShiftRightLanes<4>(d, v); |
| case 5: |
| return ShiftRightLanes<5>(d, v); |
| case 6: |
| return ShiftRightLanes<6>(d, v); |
| case 7: |
| return ShiftRightLanes<7>(d, v); |
| } |
| } |
| #else |
| (void)d; |
| #endif |
| |
| return detail::SlideDownLanes(v, amt); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)> |
| HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { |
| #if !HWY_IS_DEBUG_BUILD |
| if (__builtin_constant_p(amt)) { |
| switch (amt) { |
| case 0: |
| return v; |
| case 1: |
| return ShiftRightLanes<1>(d, v); |
| case 2: |
| return ShiftRightLanes<2>(d, v); |
| case 3: |
| return ShiftRightLanes<3>(d, v); |
| case 4: |
| return ShiftRightLanes<4>(d, v); |
| case 5: |
| return ShiftRightLanes<5>(d, v); |
| case 6: |
| return ShiftRightLanes<6>(d, v); |
| case 7: |
| return ShiftRightLanes<7>(d, v); |
| case 8: |
| return ShiftRightLanes<8>(d, v); |
| case 9: |
| return ShiftRightLanes<9>(d, v); |
| case 10: |
| return ShiftRightLanes<10>(d, v); |
| case 11: |
| return ShiftRightLanes<11>(d, v); |
| case 12: |
| return ShiftRightLanes<12>(d, v); |
| case 13: |
| return ShiftRightLanes<13>(d, v); |
| case 14: |
| return ShiftRightLanes<14>(d, v); |
| case 15: |
| return ShiftRightLanes<15>(d, v); |
| } |
| } |
| #else |
| (void)d; |
| #endif |
| |
| return detail::SlideDownLanes(v, amt); |
| } |
| |
| // ================================================== COMBINE |
| |
| // ------------------------------ Combine (InterleaveLower) |
| |
| // N = N/2 + N/2 (upper half undefined) |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>> |
| HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) { |
| const Half<decltype(d)> dh; |
| const RebindToUnsigned<decltype(dh)> duh; |
| // Treat half-width input as one lane, and expand to two lanes. |
| using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>; |
| const VU lo{BitCast(duh, lo_half).raw}; |
| const VU hi{BitCast(duh, hi_half).raw}; |
| return BitCast(d, InterleaveLower(lo, hi)); |
| } |
| |
| // ------------------------------ ZeroExtendVector (IfThenElseZero) |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) { |
| const Half<D> dh; |
| return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw}); |
| } |
| |
| // ------------------------------ ConcatLowerLower |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec128<T> ConcatLowerLower(D /* tag */, Vec128<T> hi, Vec128<T> lo) { |
| return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)}; |
| } |
| |
| // ------------------------------ ConcatUpperUpper |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec128<T> ConcatUpperUpper(D /* tag */, Vec128<T> hi, Vec128<T> lo) { |
| return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)}; |
| } |
| |
| // ------------------------------ ConcatLowerUpper |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec128<T> ConcatLowerUpper(D d, Vec128<T> hi, Vec128<T> lo) { |
| return CombineShiftRightBytes<8>(d, hi, lo); |
| } |
| |
| // ------------------------------ ConcatUpperLower |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec128<T> ConcatUpperLower(D d, Vec128<T> hi, Vec128<T> lo) { |
| return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi); |
| } |
| |
| // ------------------------------ Concat partial (Combine, LowerHalf) |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) { |
| const Half<decltype(d)> d2; |
| return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) { |
| const Half<decltype(d)> d2; |
| return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> ConcatLowerUpper(D d, const VFromD<D> hi, |
| const VFromD<D> lo) { |
| const Half<decltype(d)> d2; |
| return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) { |
| const Half<decltype(d)> d2; |
| return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo)); |
| } |
| |
| // ------------------------------ ConcatOdd |
| |
| // 8-bit full |
| template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T> ConcatOdd(D /* tag */, Vec128<T> hi, Vec128<T> lo) { |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15, |
| 17, 19, 21, 23, 25, 27, 29, 31)}; |
| } |
| |
| // 8-bit x8 |
| template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec64<T> ConcatOdd(D /* tag */, Vec64<T> hi, Vec64<T> lo) { |
| // Don't care about upper half. |
| return Vec128<T, 8>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 17, 19, 21, |
| 23, 1, 3, 5, 7, 17, 19, 21, 23)}; |
| } |
| |
| // 8-bit x4 |
| template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec32<T> ConcatOdd(D /* tag */, Vec32<T> hi, Vec32<T> lo) { |
| // Don't care about upper 3/4. |
| return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17, |
| 19, 1, 3, 17, 19, 1, 3, 17, 19)}; |
| } |
| |
| // 16-bit full |
| template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec128<T> ConcatOdd(D /* tag */, Vec128<T> hi, Vec128<T> lo) { |
| return Vec128<T>{ |
| wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)}; |
| } |
| |
| // 16-bit x4 |
| template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec64<T> ConcatOdd(D /* tag */, Vec64<T> hi, Vec64<T> lo) { |
| // Don't care about upper half. |
| return Vec128<T, 4>{ |
| wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)}; |
| } |
| |
| // 32-bit full |
| template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> |
| HWY_API Vec128<T> ConcatOdd(D /* tag */, Vec128<T> hi, Vec128<T> lo) { |
| return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)}; |
| } |
| |
| // Any T x2 |
| template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)> |
| HWY_API Vec128<T, 2> ConcatOdd(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) { |
| return InterleaveUpper(d, lo, hi); |
| } |
| |
| // ------------------------------ ConcatEven (InterleaveLower) |
| |
| // 8-bit full |
| template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T> ConcatEven(D /* tag */, Vec128<T> hi, Vec128<T> lo) { |
| return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14, |
| 16, 18, 20, 22, 24, 26, 28, 30)}; |
| } |
| |
| // 8-bit x8 |
| template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec64<T> ConcatEven(D /* tag */, Vec64<T> hi, Vec64<T> lo) { |
| // Don't care about upper half. |
| return Vec64<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 16, 18, 20, 22, |
| 0, 2, 4, 6, 16, 18, 20, 22)}; |
| } |
| |
| // 8-bit x4 |
| template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec32<T> ConcatEven(D /* tag */, Vec32<T> hi, Vec32<T> lo) { |
| // Don't care about upper 3/4. |
| return Vec32<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16, 18, |
| 0, 2, 16, 18, 0, 2, 16, 18)}; |
| } |
| |
| // 16-bit full |
| template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec128<T> ConcatEven(D /* tag */, Vec128<T> hi, Vec128<T> lo) { |
| return Vec128<T>{ |
| wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)}; |
| } |
| |
| // 16-bit x4 |
| template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec64<T> ConcatEven(D /* tag */, Vec64<T> hi, Vec64<T> lo) { |
| // Don't care about upper half. |
| return Vec64<T>{wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)}; |
| } |
| |
| // 32-bit full |
| template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> |
| HWY_API Vec128<T> ConcatEven(D /* tag */, Vec128<T> hi, Vec128<T> lo) { |
| return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)}; |
| } |
| |
| // Any T x2 |
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
| HWY_API Vec128<T, 2> ConcatEven(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) { |
| return InterleaveLower(d, lo, hi); |
| } |
| |
| // ------------------------------ DupEven (InterleaveLower) |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { |
| return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, v.raw, 0, 0, 2, 2, 4, 4, 6, 6, |
| 8, 8, 10, 10, 12, 12, 14, 14)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { |
| return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, 0, 0, 2, 2, 4, 4, 6, 6)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> |
| HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { |
| return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> |
| HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) { |
| return InterleaveLower(DFromV<decltype(v)>(), v, v); |
| } |
| |
| // ------------------------------ DupOdd (InterleaveUpper) |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { |
| return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, v.raw, 1, 1, 3, 3, 5, 5, 7, 7, |
| 9, 9, 11, 11, 13, 13, 15, 15)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> |
| HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { |
| return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, 1, 1, 3, 3, 5, 5, 7, 7)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> |
| HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { |
| return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)}; |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> |
| HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) { |
| return InterleaveUpper(DFromV<decltype(v)>(), v, v); |
| } |
| |
| // ------------------------------ OddEven |
| |
| namespace detail { |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| const DFromV<decltype(a)> d; |
| const Repartition<uint8_t, decltype(d)> d8; |
| alignas(16) static constexpr uint8_t mask[16] = { |
| 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; |
| return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| return Vec128<T, N>{ |
| wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; |
| } |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a, |
| const Vec128<T, N> b) { |
| return Vec128<T, N>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)}; |
| } |
| |
| } // namespace detail |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b); |
| } |
| template <size_t N> |
| HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a, |
| const Vec128<float, N> b) { |
| return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; |
| } |
| |
| // ------------------------------ InterleaveEven |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 2, 18, 4, 20, 6, 22, |
| 8, 24, 10, 26, 12, 28, 14, 30)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 2, 10, 4, 12, 6, 14)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)> |
| HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 2, 6)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)> |
| HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| return InterleaveLower(a, b); |
| } |
| |
| // ------------------------------ InterleaveOdd |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 17, 3, 19, 5, 21, 7, 23, |
| 9, 25, 11, 27, 13, 29, 15, 31)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 9, 3, 11, 5, 13, 7, 15)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)> |
| HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) { |
| return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 5, 3, 7)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)> |
| HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) { |
| return InterleaveUpper(d, a, b); |
| } |
| |
| // ------------------------------ OddEvenBlocks |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) { |
| return even; |
| } |
| |
| // ------------------------------ SwapAdjacentBlocks |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) { |
| return v; |
| } |
| |
| // ------------------------------ InterleaveEvenBlocks |
| template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) { |
| return a; |
| } |
| // ------------------------------ InterleaveOddBlocks |
| template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) { |
| return a; |
| } |
| |
| // ------------------------------ ReverseBlocks |
| template <class D> |
| HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) { |
| return v; // Single block: no change |
| } |
| |
| // ================================================== CONVERT |
| |
| // ------------------------------ Promotions (part w/ narrow lanes -> full) |
| |
| // Unsigned: zero-extend. |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { |
| return VFromD<D>{wasm_u16x8_extend_low_u8x16(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { |
| return VFromD<D>{wasm_u32x4_extend_low_u16x8(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { |
| return VFromD<D>{wasm_u64x2_extend_low_u32x4(v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { |
| return VFromD<D>{ |
| wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { |
| return VFromD<D>{wasm_u16x8_extend_low_u8x16(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { |
| return VFromD<D>{wasm_u32x4_extend_low_u16x8(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { |
| return VFromD<D>{wasm_u64x2_extend_low_u32x4(v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { |
| return VFromD<D>{ |
| wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; |
| } |
| |
// U8/U16 to U64/I64: first zero-extend to U32, then zero-extend to
// TFromD<D>.
| template <class D, class V, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D), |
| HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V)), HWY_IF_UNSIGNED_V(V), |
| HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> |
| HWY_API VFromD<D> PromoteTo(D d, V v) { |
| const Rebind<uint32_t, decltype(d)> du32; |
| return PromoteTo(d, PromoteTo(du32, v)); |
| } |
| |
| // Signed: replicate sign bit. |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) { |
| return VFromD<D>{wasm_i16x8_extend_low_i8x16(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { |
| return VFromD<D>{wasm_i32x4_extend_low_i16x8(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { |
| return VFromD<D>{wasm_i64x2_extend_low_i32x4(v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) { |
| return VFromD<D>{ |
| wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))}; |
| } |
| |
// I8/I16 to I64: first promote to I32, then promote to I64.
| template <class D, class V, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D), |
| HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V)), HWY_IF_SIGNED_V(V), |
| HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> |
| HWY_API VFromD<D> PromoteTo(D d, V v) { |
| const Rebind<int32_t, decltype(d)> di32; |
| return PromoteTo(d, PromoteTo(di32, v)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) { |
| const Rebind<uint16_t, decltype(df32)> du16; |
| const RebindToSigned<decltype(df32)> di32; |
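  // bfloat16 is the upper half of a binary32, so zero-extending to 32 bits
  // and shifting left by 16 reconstructs the full float bit pattern.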
| return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { |
| return VFromD<D>{wasm_f64x2_convert_low_i32x4(v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { |
| return VFromD<D>{wasm_f64x2_convert_low_u32x4(v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) { |
| return VFromD<D>{wasm_f64x2_promote_low_f32x4(v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) { |
| const Rebind<int32_t, decltype(di64)> di32; |
| const RebindToFloat<decltype(di32)> df32; |
| const RebindToUnsigned<decltype(di32)> du32; |
| const Repartition<uint8_t, decltype(du32)> du32_as_du8; |
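  // Sketch of the approach: for inputs too large for int32, reduce the f32
  // exponent by up to 32 (157 is the exponent bias 127 plus 30) so that
  // ConvertTo(di32, ...) does not overflow, then shift the promoted result
  // back left by the same amount. lo64_or_mask fills the low 32 bits of
  // lanes whose i32 conversion saturated, so they end up at the i64 maximum.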
| |
| const auto exponent_adj = BitCast( |
| du32, |
| Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))), |
| BitCast(du32_as_du8, Set(du32, uint32_t{157}))), |
| BitCast(du32_as_du8, Set(du32, uint32_t{32})))); |
| const auto adj_v = |
| BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj)); |
| |
| const auto f32_to_i32_result = ConvertTo(di32, adj_v); |
| const auto lo64_or_mask = PromoteTo( |
| di64, |
| BitCast(du32, VecFromMask(di32, Eq(f32_to_i32_result, |
| Set(di32, LimitsMax<int32_t>()))))); |
| |
| return Or(PromoteTo(di64, BitCast(di32, f32_to_i32_result)) |
| << PromoteTo(di64, exponent_adj), |
| lo64_or_mask); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> |
| HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) { |
| const Rebind<uint32_t, decltype(du64)> du32; |
| const RebindToFloat<decltype(du32)> df32; |
| const Repartition<uint8_t, decltype(du32)> du32_as_du8; |
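  // Same idea as the signed variant above, with 158 (= 127 + 31) because
  // uint32 holds values up to just below 2^32.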
| |
| const auto exponent_adj = BitCast( |
| du32, |
| Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))), |
| BitCast(du32_as_du8, Set(du32, uint32_t{158}))), |
| BitCast(du32_as_du8, Set(du32, uint32_t{32})))); |
| |
| const auto adj_v = |
| BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj)); |
| const auto f32_to_u32_result = ConvertTo(du32, adj_v); |
| const auto lo32_or_mask = PromoteTo( |
| du64, |
| VecFromMask(du32, f32_to_u32_result == Set(du32, LimitsMax<uint32_t>()))); |
| |
| return Or(PromoteTo(du64, f32_to_u32_result) << PromoteTo(du64, exponent_adj), |
| lo32_or_mask); |
| } |
| |
| // ------------------------------ PromoteUpperTo |
| |
| // Per-target flag to prevent generic_ops-inl.h from defining PromoteUpperTo. |
| #ifdef HWY_NATIVE_PROMOTE_UPPER_TO |
| #undef HWY_NATIVE_PROMOTE_UPPER_TO |
| #else |
| #define HWY_NATIVE_PROMOTE_UPPER_TO |
| #endif |
| |
| // Unsigned: zero-extend. |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> PromoteUpperTo(D /* tag */, |
| VFromD<Repartition<uint8_t, D>> v) { |
| return VFromD<D>{wasm_u16x8_extend_high_u8x16(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)> |
| HWY_API VFromD<D> PromoteUpperTo(D /* tag */, |
| VFromD<Repartition<uint16_t, D>> v) { |
| return VFromD<D>{wasm_u32x4_extend_high_u16x8(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)> |
| HWY_API VFromD<D> PromoteUpperTo(D /* tag */, |
| VFromD<Repartition<uint32_t, D>> v) { |
| return VFromD<D>{wasm_u64x2_extend_high_u32x4(v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)> |
| HWY_API VFromD<D> PromoteUpperTo(D /* tag */, |
| VFromD<Repartition<uint8_t, D>> v) { |
| return VFromD<D>{wasm_u16x8_extend_high_u8x16(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)> |
| HWY_API VFromD<D> PromoteUpperTo(D /* tag */, |
| VFromD<Repartition<uint16_t, D>> v) { |
| return VFromD<D>{wasm_u32x4_extend_high_u16x8(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)> |
| HWY_API VFromD<D> PromoteUpperTo(D /* tag */, |
| VFromD<Repartition<uint32_t, D>> v) { |
| return VFromD<D>{wasm_u64x2_extend_high_u32x4(v.raw)}; |
| } |
| |
| // Signed: replicate sign bit. |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)> |
| HWY_API VFromD<D> PromoteUpperTo(D /* tag */, |
| VFromD<Repartition<int8_t, D>> v) { |
| return VFromD<D>{wasm_i16x8_extend_high_i8x16(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)> |
| HWY_API VFromD<D> PromoteUpperTo(D /* tag */, |
| VFromD<Repartition<int16_t, D>> v) { |
| return VFromD<D>{wasm_i32x4_extend_high_i16x8(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)> |
| HWY_API VFromD<D> PromoteUpperTo(D /* tag */, |
| VFromD<Repartition<int32_t, D>> v) { |
| return VFromD<D>{wasm_i64x2_extend_high_i32x4(v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> PromoteUpperTo(D df32, VFromD<Repartition<float16_t, D>> v) { |
| const Rebind<float16_t, decltype(df32)> dh; |
| return PromoteTo(df32, UpperHalf(dh, v)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> PromoteUpperTo(D df32, VFromD<Repartition<bfloat16_t, D>> v) { |
| const Repartition<uint16_t, decltype(df32)> du16; |
| const RebindToSigned<decltype(df32)> di32; |
| return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v)))); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> PromoteUpperTo(D dd, VFromD<Repartition<int32_t, D>> v) { |
| // There is no wasm_f64x2_convert_high_i32x4. |
| return PromoteTo(dd, UpperHalf(Rebind<int32_t, D>(), v)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> PromoteUpperTo(D dd, VFromD<Repartition<uint32_t, D>> v) { |
| // There is no wasm_f64x2_convert_high_u32x4. |
| return PromoteTo(dd, UpperHalf(Rebind<uint32_t, D>(), v)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> PromoteUpperTo(D dd, VFromD<Repartition<float, D>> v) { |
| // There is no wasm_f64x2_promote_high_f32x4. |
| return PromoteTo(dd, UpperHalf(Rebind<float, D>(), v)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)> |
| HWY_API VFromD<D> PromoteUpperTo(D d64, VFromD<Repartition<float, D>> v) { |
| return PromoteTo(d64, UpperHalf(Rebind<float, D>(), v)); |
| } |
| |
| // Generic version for <=64 bit input/output (_high is only for full vectors). |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), class V> |
| HWY_API VFromD<D> PromoteUpperTo(D d, V v) { |
| const Rebind<TFromV<V>, decltype(d)> dh; |
| return PromoteTo(d, UpperHalf(dh, v)); |
| } |
| |
| // ------------------------------ PromoteEvenTo/PromoteOddTo |
| #include "third_party/highway/hwy/ops/inside-inl.h" |
| |
| // ------------------------------ Demotions (full -> part w/ narrow lanes) |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { |
| return VFromD<D>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { |
| return VFromD<D>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { |
| const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); |
| return VFromD<D>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { |
| return VFromD<D>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { |
| const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); |
| return VFromD<D>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { |
| return VFromD<D>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)}; |
| } |
| |
| template <class D, HWY_IF_UNSIGNED_D(D), |
| HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> |
| HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint32_t, D>> v) { |
| const DFromV<decltype(v)> du32; |
| const RebindToSigned<decltype(du32)> di32; |
| return DemoteTo(dn, BitCast(di32, Min(v, Set(du32, 0x7FFFFFFF)))); |
| } |
| |
| template <class D, HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint16_t, D>> v) { |
| const DFromV<decltype(v)> du16; |
| const RebindToSigned<decltype(du16)> di16; |
| return DemoteTo(du8, BitCast(di16, Min(v, Set(du16, 0x7FFF)))); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) { |
| return VFromD<D>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) { |
| return VFromD<D>{wasm_u32x4_trunc_sat_f64x2_zero(v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) { |
| return VFromD<D>{wasm_f32x4_demote_f64x2_zero(v.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<int64_t, D>> v) { |
| const Rebind<double, decltype(df32)> df64; |
| const RebindToUnsigned<decltype(df64)> du64; |
| const RebindToSigned<decltype(df32)> di32; |
| const RebindToUnsigned<decltype(df32)> du32; |
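  // 27670116110564327424.0 is 2^64 + 2^63; the Xor/subtract below yields the
  // value of v with its low 12 bits cleared, exactly representable in f64.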
| |
| const auto k2p64_63 = Set(df64, 27670116110564327424.0); |
| const auto f64_hi52 = |
| Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63; |
| const auto f64_lo12 = |
| PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)), |
| Set(du32, uint32_t{0x00000FFF})))); |
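  // f64_carry is the part of f64_lo12 lost when forming f64_sum; the
  // adjustments below nudge the last mantissa bit (round-to-odd) so that the
  // final f64 -> f32 demotion does not double-round.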
| |
| const auto f64_sum = f64_hi52 + f64_lo12; |
| const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12; |
| |
| const auto f64_sum_is_inexact = |
| ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64)))); |
| const auto f64_bits_decrement = |
| And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))), |
| f64_sum_is_inexact); |
| |
| const auto adj_f64_val = BitCast( |
| df64, |
| Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact)); |
| |
| return DemoteTo(df32, adj_f64_val); |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) { |
| const Rebind<double, decltype(df32)> df64; |
| const RebindToUnsigned<decltype(df64)> du64; |
| const RebindToSigned<decltype(df32)> di32; |
| const RebindToUnsigned<decltype(df32)> du32; |
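  // 18446744073709551616.0 is 2^64; as in the signed variant, the Or/subtract
  // below yields v with its low 12 bits cleared, exactly representable in f64.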
| |
| const auto k2p64 = Set(df64, 18446744073709551616.0); |
| const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64; |
| const auto f64_lo12 = |
| PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)), |
| Set(du32, uint32_t{0x00000FFF})))); |
| |
| const auto f64_sum = f64_hi52 + f64_lo12; |
| const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12; |
| const auto f64_sum_is_inexact = |
| ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64)))); |
| |
| const auto adj_f64_val = BitCast( |
| df64, |
| Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)), |
| f64_sum_is_inexact)); |
| |
| return DemoteTo(df32, adj_f64_val); |
| } |
| |
| // Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes |
| // above 2*N. |
| template <class D, HWY_IF_I16_D(D)> |
| HWY_API Vec32<int16_t> ReorderDemote2To(D dn, Vec32<int32_t> a, |
| Vec32<int32_t> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| template <class D, HWY_IF_I16_D(D)> |
| HWY_API Vec64<int16_t> ReorderDemote2To(D dn, Vec64<int32_t> a, |
| Vec64<int32_t> b) { |
| const Twice<decltype(dn)> dn_full; |
| const Repartition<uint32_t, decltype(dn_full)> du32_full; |
| |
| const Vec128<int16_t> v_full{wasm_i16x8_narrow_i32x4(a.raw, b.raw)}; |
| const auto vu32_full = BitCast(du32_full, v_full); |
| return LowerHalf( |
| BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); |
| } |
| template <class D, HWY_IF_I16_D(D)> |
| HWY_API Vec128<int16_t> ReorderDemote2To(D /* tag */, Vec128<int32_t> a, |
| Vec128<int32_t> b) { |
| return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(a.raw, b.raw)}; |
| } |
| |
| template <class D, HWY_IF_U16_D(D)> |
| HWY_API Vec32<uint16_t> ReorderDemote2To(D dn, Vec32<int32_t> a, |
| Vec32<int32_t> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| template <class D, HWY_IF_U16_D(D)> |
| HWY_API Vec64<uint16_t> ReorderDemote2To(D dn, Vec64<int32_t> a, |
| Vec64<int32_t> b) { |
| const Twice<decltype(dn)> dn_full; |
| const Repartition<uint32_t, decltype(dn_full)> du32_full; |
| |
| const Vec128<int16_t> v_full{wasm_u16x8_narrow_i32x4(a.raw, b.raw)}; |
| const auto vu32_full = BitCast(du32_full, v_full); |
| return LowerHalf( |
| BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); |
| } |
| template <class D, HWY_IF_U16_D(D)> |
| HWY_API Vec128<uint16_t> ReorderDemote2To(D /* tag */, Vec128<int32_t> a, |
| Vec128<int32_t> b) { |
| return Vec128<uint16_t>{wasm_u16x8_narrow_i32x4(a.raw, b.raw)}; |
| } |
| |
| template <class D, HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint32_t> a, |
| Vec128<uint32_t> b) { |
| const DFromV<decltype(a)> du32; |
| const RebindToSigned<decltype(du32)> di32; |
| const auto max_i32 = Set(du32, 0x7FFFFFFFu); |
| |
| const auto clamped_a = BitCast(di32, Min(a, max_i32)); |
| const auto clamped_b = BitCast(di32, Min(b, max_i32)); |
| return ReorderDemote2To(dn, clamped_a, clamped_b); |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint32_t, D>> a, |
| VFromD<Repartition<uint32_t, D>> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| |
| // Specializations for partial vectors because i8x16_narrow_i16x8 sets lanes |
| // above 2*N. |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a, |
| VFromD<Repartition<int16_t, D>> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| template <class D, HWY_IF_I8_D(D)> |
| HWY_API Vec64<int8_t> ReorderDemote2To(D dn, Vec64<int16_t> a, |
| Vec64<int16_t> b) { |
| const Twice<decltype(dn)> dn_full; |
| const Repartition<uint32_t, decltype(dn_full)> du32_full; |
| |
| const Vec128<int8_t> v_full{wasm_i8x16_narrow_i16x8(a.raw, b.raw)}; |
| const auto vu32_full = BitCast(du32_full, v_full); |
| return LowerHalf( |
| BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); |
| } |
| template <class D, HWY_IF_I8_D(D)> |
| HWY_API Vec128<int8_t> ReorderDemote2To(D /* tag */, Vec128<int16_t> a, |
| Vec128<int16_t> b) { |
| return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(a.raw, b.raw)}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a, |
| VFromD<Repartition<int16_t, D>> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| template <class D, HWY_IF_U8_D(D)> |
| HWY_API Vec64<uint8_t> ReorderDemote2To(D dn, Vec64<int16_t> a, |
| Vec64<int16_t> b) { |
| const Twice<decltype(dn)> dn_full; |
| const Repartition<uint32_t, decltype(dn_full)> du32_full; |
| |
| const Vec128<uint8_t> v_full{wasm_u8x16_narrow_i16x8(a.raw, b.raw)}; |
| const auto vu32_full = BitCast(du32_full, v_full); |
| return LowerHalf( |
| BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); |
| } |
| template <class D, HWY_IF_U8_D(D)> |
| HWY_API Vec128<uint8_t> ReorderDemote2To(D /* tag */, Vec128<int16_t> a, |
| Vec128<int16_t> b) { |
| return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(a.raw, b.raw)}; |
| } |
| |
| template <class D, HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint16_t> a, |
| Vec128<uint16_t> b) { |
| const DFromV<decltype(a)> du16; |
| const RebindToSigned<decltype(du16)> di16; |
| const auto max_i16 = Set(du16, 0x7FFFu); |
| |
| const auto clamped_a = BitCast(di16, Min(a, max_i16)); |
| const auto clamped_b = BitCast(di16, Min(b, max_i16)); |
| return ReorderDemote2To(dn, clamped_a, clamped_b); |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint16_t, D>> a, |
| VFromD<Repartition<uint16_t, D>> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| |
| // For already range-limited input [0, 255]. |
| template <size_t N> |
| HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) { |
| const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); |
| return Vec128<uint8_t, N>{ |
| wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; |
| } |
| |
| // ------------------------------ Truncations |
| |
| template <typename From, class DTo, HWY_IF_LANES_D(DTo, 1)> |
| HWY_API VFromD<DTo> TruncateTo(DTo /* tag */, Vec128<From, 1> v) { |
| // BitCast requires the same size; DTo might be u8x1 and v u16x1. |
| const Repartition<TFromD<DTo>, DFromV<decltype(v)>> dto; |
| return VFromD<DTo>{BitCast(dto, v).raw}; |
| } |
| |
| template <class D, HWY_IF_U8_D(D)> |
| HWY_API Vec16<uint8_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) { |
| const Full128<uint8_t> d; |
| const auto v1 = BitCast(d, v); |
| const auto v2 = ConcatEven(d, v1, v1); |
| const auto v4 = ConcatEven(d, v2, v2); |
| return LowerHalf(LowerHalf(LowerHalf(ConcatEven(d, v4, v4)))); |
| } |
| |
| template <class D, HWY_IF_U16_D(D)> |
| HWY_API Vec32<uint16_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) { |
| const Full128<uint16_t> d; |
| const auto v1 = BitCast(d, v); |
| const auto v2 = ConcatEven(d, v1, v1); |
| return LowerHalf(LowerHalf(ConcatEven(d, v2, v2))); |
| } |
| |
| template <class D, HWY_IF_U32_D(D)> |
| HWY_API Vec64<uint32_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) { |
| const Full128<uint32_t> d; |
| const auto v1 = BitCast(d, v); |
| return LowerHalf(ConcatEven(d, v1, v1)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { |
| const Repartition<uint8_t, DFromV<decltype(v)>> d; |
| const auto v1 = Vec128<uint8_t>{v.raw}; |
| const auto v2 = ConcatEven(d, v1, v1); |
| const auto v3 = ConcatEven(d, v2, v2); |
| return VFromD<D>{v3.raw}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)> |
| HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { |
| const Repartition<uint16_t, DFromV<decltype(v)>> d; |
| const auto v1 = Vec128<uint16_t>{v.raw}; |
| const auto v2 = ConcatEven(d, v1, v1); |
| return VFromD<D>{v2.raw}; |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)> |
| HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { |
| const Repartition<uint8_t, DFromV<decltype(v)>> d; |
| const auto v1 = Vec128<uint8_t>{v.raw}; |
| const auto v2 = ConcatEven(d, v1, v1); |
| return VFromD<D>{v2.raw}; |
| } |
| |
| // ------------------------------ Demotions to/from i64 |
| |
| namespace detail { |
| template <class D, HWY_IF_UNSIGNED_D(D)> |
| HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult( |
| D /*dn*/, VFromD<Rebind<uint64_t, D>> v) { |
| return v; |
| } |
| |
| template <class D, HWY_IF_SIGNED_D(D)> |
| HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult( |
| D /*dn*/, VFromD<Rebind<uint64_t, D>> v) { |
| const DFromV<decltype(v)> du64; |
| return And(v, |
| Set(du64, static_cast<uint64_t>(hwy::HighestValue<TFromD<D>>()))); |
| } |
| |
| template <class D> |
| HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64Saturate( |
| D dn, VFromD<Rebind<uint64_t, D>> v) { |
| const Rebind<uint64_t, D> du64; |
| const RebindToSigned<decltype(du64)> di64; |
| constexpr int kShiftAmt = static_cast<int>(sizeof(TFromD<D>) * 8) - |
| static_cast<int>(hwy::IsSigned<TFromD<D>>()); |
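  // too_big is all-ones for lanes with any bits set at or above kShiftAmt,
  // i.e. values that cannot be represented in TFromD<D>; Or-ing saturates
  // them, and for signed targets DemoteFromU64MaskOutResult then masks the
  // result down to the signed maximum.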
| |
| const auto too_big = BitCast( |
| du64, VecFromMask( |
| di64, Gt(BitCast(di64, ShiftRight<kShiftAmt>(v)), Zero(di64)))); |
| return DemoteFromU64MaskOutResult(dn, Or(v, too_big)); |
| } |
| |
| template <class D, class V> |
| HWY_INLINE VFromD<D> ReorderDemote2From64To32Combine(D dn, V a, V b) { |
| return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a)); |
| } |
| |
| } // namespace detail |
| |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), |
| HWY_IF_SIGNED_D(D)> |
| HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) { |
| const DFromV<decltype(v)> di64; |
| const RebindToUnsigned<decltype(di64)> du64; |
| const RebindToUnsigned<decltype(dn)> dn_u; |
| |
  // Negative values are saturated by first saturating their bitwise inverse
  // and then inverting the saturation result.
| const auto invert_mask = BitCast(du64, BroadcastSignBit(v)); |
| const auto saturated_vals = Xor( |
| invert_mask, |
| detail::DemoteFromU64Saturate(dn, Xor(invert_mask, BitCast(du64, v)))); |
| return BitCast(dn, TruncateTo(dn_u, saturated_vals)); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), |
| HWY_IF_UNSIGNED_D(D)> |
| HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) { |
| const DFromV<decltype(v)> di64; |
| const RebindToUnsigned<decltype(di64)> du64; |
| |
| const auto non_neg_vals = BitCast(du64, AndNot(BroadcastSignBit(v), v)); |
| return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals)); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), |
| HWY_IF_UNSIGNED_D(D)> |
| HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) { |
| return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, v)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 4), |
| HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int64_t, D>> a, |
| VFromD<Repartition<int64_t, D>> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> |
| HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a, |
| VFromD<Repartition<uint64_t, D>> b) { |
| const DFromV<decltype(a)> d; |
| const Twice<decltype(d)> dt; |
| return DemoteTo(dn, Combine(dt, b, a)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)> |
| HWY_API Vec128<int32_t> ReorderDemote2To(D dn, Vec128<int64_t> a, |
| Vec128<int64_t> b) { |
| const DFromV<decltype(a)> di64; |
| const RebindToUnsigned<decltype(di64)> du64; |
| const Half<decltype(dn)> dnh; |
| |
  // Negative values are saturated by first saturating their bitwise inverse
  // and then inverting the saturation result.
| const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a)); |
| const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b)); |
| const auto saturated_a = Xor( |
| invert_mask_a, |
| detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a)))); |
| const auto saturated_b = Xor( |
| invert_mask_b, |
| detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b)))); |
| |
| return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)> |
| HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<int64_t> a, |
| Vec128<int64_t> b) { |
| const DFromV<decltype(a)> di64; |
| const RebindToUnsigned<decltype(di64)> du64; |
| const Half<decltype(dn)> dnh; |
| |
| const auto saturated_a = detail::DemoteFromU64Saturate( |
| dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a))); |
| const auto saturated_b = detail::DemoteFromU64Saturate( |
| dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b))); |
| |
| return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)> |
| HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<uint64_t> a, |
| Vec128<uint64_t> b) { |
| const Half<decltype(dn)> dnh; |
| |
| const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a); |
| const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b); |
| |
| return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); |
| } |
| |
| template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), class V, |
| HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), |
| HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2), |
| HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)> |
| HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) { |
| return ReorderDemote2To(d, a, b); |
| } |
| |
| // ------------------------------ ConvertTo |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { |
| return VFromD<D>{wasm_f32x4_convert_i32x4(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> |
| HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { |
| return VFromD<D>{wasm_f32x4_convert_u32x4(v.raw)}; |
| } |
| |
| template <class D, HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<int64_t, D>> v) { |
| // Based on wim's approach (https://stackoverflow.com/questions/41144668/) |
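  // The upper 32 bits of each lane, with the sign bit toggled, become the
  // mantissa of a double whose exponent encodes 2^84; the lower 32 bits become
  // the mantissa of 2^52. Subtracting 2^84 + 2^63 + 2^52 removes the exponent
  // offsets (the 2^63 undoes the sign-bit toggle), and adding the two halves
  // reconstructs upper * 2^32 + lower exactly.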
| const Repartition<uint32_t, decltype(dd)> d32; |
| const Repartition<uint64_t, decltype(dd)> d64; |
| |
| // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 |
| const auto k84_63 = Set(d64, 0x4530000080000000ULL); |
| const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); |
| |
| // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) |
| const auto k52 = Set(d32, 0x43300000); |
| const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); |
| |
| const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); |
| return (v_upper - k84_63_52) + v_lower; // order matters! |
| } |
| |
| namespace detail { |
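// Converts u64 values known to be less than 2^52 to f64: OR-ing the value
// into the mantissa of 2^52 forms the double 2^52 + w exactly, and
// subtracting 2^52 recovers w.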
| template <class VW> |
| HWY_INLINE VFromD<Rebind<double, DFromV<VW>>> U64ToF64VecFast(VW w) { |
| const DFromV<decltype(w)> d64; |
| const RebindToFloat<decltype(d64)> dd; |
| const auto cnst2_52_dbl = Set(dd, 0x0010000000000000); // 2^52 |
| return BitCast(dd, Or(w, BitCast(d64, cnst2_52_dbl))) - cnst2_52_dbl; |
| } |
| } // namespace detail |
| |
| template <class D, HWY_IF_F64_D(D)> |
| HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<uint64_t, D>> v) { |
| // Based on wim's approach (https://stackoverflow.com/questions/41144668/) |
| const RebindToUnsigned<decltype(dd)> d64; |
| using VU = VFromD<decltype(d64)>; |
| |
| const VU msk_lo = Set(d64, 0xFFFFFFFF); |
| const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32 |
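  // v equals v_hi * 2^32 + v_lo; each 32-bit half is below 2^52, so both
  // halves convert exactly and MulAdd reassembles the full value.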
| |
| // Extract the 32 lowest/highest significant bits of v |
| const VU v_lo = And(v, msk_lo); |
| const VU v_hi = ShiftRight<32>(v); |
| |
| const auto v_lo_dbl = detail::U64ToF64VecFast(v_lo); |
| return MulAdd(cnst2_32_dbl, detail::U64ToF64VecFast(v_hi), v_lo_dbl); |
| } |
| |
| // Truncates (rounds toward zero). |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> |
| HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<float, D>> v) { |
| return VFromD<D>{wasm_i32x4_trunc_sat_f32x4(v.raw)}; |
| } |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)> |
| HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<float, D>> v) { |
| return VFromD<D>{wasm_u32x4_trunc_sat_f32x4(v.raw)}; |
| } |
| |
| template <class DI, HWY_IF_I64_D(DI)> |
| HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) { |
| using VI = VFromD<decltype(di)>; |
| using MI = MFromD<decltype(di)>; |
| const RebindToUnsigned<decltype(di)> du; |
| using VU = VFromD<decltype(du)>; |
| const Repartition<uint16_t, decltype(di)> du16; |
| const VI k1075 = Set(di, 1075); // biased exponent of 2^52 |
| |
| // Exponent indicates whether the number can be represented as int64_t. |
| const VU biased_exp = ShiftRight<52>(BitCast(du, v)) & Set(du, 0x7FF); |
| const MI in_range = BitCast(di, biased_exp) < Set(di, 1086); |
| |
| // If we were to cap the exponent at 51 and add 2^52, the number would be in |
| // [2^52, 2^53) and mantissa bits could be read out directly. We need to |
| // round-to-0 (truncate). |
| // Use 16-bit saturated unsigned subtraction to compute shift_mnt and |
| // shift_int since biased_exp[i] is a non-negative integer that is less than |
| // or equal to 2047. |
| // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be |
| // zero as the upper 48 bits of both k1075 and biased_exp are zero. |
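  // Example: v = 3.7 has biased_exp = 1024, hence shift_mnt = 51 and
  // shift_int = 0; (mantissa | 2^52) >> 51 == 3.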
| |
| const VU shift_mnt = BitCast( |
| du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp))); |
| const VU shift_int = BitCast( |
| du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075))); |
| const VU mantissa = BitCast(du, v) & Set(du, (1ULL << 52) - 1); |
| // Include implicit 1-bit |
| VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt; |
  // WASM does not guarantee a zero result for shift counts above 63, so zero
  // those lanes explicitly.
| const MI tiny = BitCast(di, shift_mnt) > Set(di, 63); |
| int53 = IfThenZeroElse(RebindMask(du, tiny), int53); |
| |
| // For inputs larger than 2^53 - 1, insert zeros at the bottom. |
| // For inputs less than 2^63, the implicit 1-bit is guaranteed not to be |
| // shifted out of the left shift result below as shift_int[i] <= 10 is true |
| // for any inputs that are less than 2^63. |
| const VU shifted = int53 << shift_int; |
| |
| // Saturate to LimitsMin (unchanged when negating below) or LimitsMax. |
| const VI sign_mask = BroadcastSignBit(BitCast(di, v)); |
| const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask; |
| const VI magnitude = IfThenElse(in_range, BitCast(di, shifted), limit); |
| |
| // If the input was negative, negate the integer (two's complement). |
| return (magnitude ^ sign_mask) - sign_mask; |
| } |
| |
| template <class DU, HWY_IF_U64_D(DU)> |
| HWY_API VFromD<DU> ConvertTo(DU du, VFromD<Rebind<double, DU>> v) { |
| const RebindToSigned<decltype(du)> di; |
| using MI = MFromD<decltype(di)>; |
| using VU = VFromD<decltype(du)>; |
| const Repartition<uint16_t, decltype(di)> du16; |
| const VU k1075 = Set(du, 1075); /* biased exponent of 2^52 */ |
| |
| const auto non_neg_v = ZeroIfNegative(v); |
| |
| // Exponent indicates whether the number can be represented as int64_t. |
| const VU biased_exp = ShiftRight<52>(BitCast(du, non_neg_v)); |
| const VU out_of_range = |
| BitCast(du, VecFromMask(di, BitCast(di, biased_exp) > Set(di, 1086))); |
| |
| // If we were to cap the exponent at 51 and add 2^52, the number would be in |
| // [2^52, 2^53) and mantissa bits could be read out directly. We need to |
| // round-to-0 (truncate), but changing rounding mode in MXCSR hits a |
| // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead |
| // manually shift the mantissa into place (we already have many of the |
| // inputs anyway). |
| |
| // Use 16-bit saturated unsigned subtraction to compute shift_mnt and |
| // shift_int since biased_exp[i] is a non-negative integer that is less than |
| // or equal to 2047. |
| |
| // 16-bit saturated unsigned subtraction is also more efficient than a |
| // 64-bit subtraction followed by a 64-bit signed Max operation on |
| // WASM. |
| |
| // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be |
| // zero as the upper 48 bits of both k1075 and biased_exp are zero. |
| |
| const VU shift_mnt = BitCast( |
| du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp))); |
| const VU shift_int = BitCast( |
| du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075))); |
| const VU mantissa = BitCast(du, non_neg_v) & Set(du, (1ULL << 52) - 1); |
| // Include implicit 1-bit. |
| VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt; |
  // WASM does not guarantee a zero result for shift counts above 63, so zero
  // those lanes explicitly.
| const MI tiny = BitCast(di, shift_mnt) > Set(di, 63); |
| int53 = IfThenZeroElse(RebindMask(du, tiny), int53); |
| |
| // For inputs larger than 2^53 - 1, insert zeros at the bottom. |
| |
| // For inputs less than 2^64, the implicit 1-bit is guaranteed not to be |
| // shifted out of the left shift result below as shift_int[i] <= 11 is true |
| // for any inputs that are less than 2^64. |
| |
| const VU shifted = int53 << shift_int; |
| return (shifted | out_of_range); |
| } |
| |
| // ------------------------------ NearestInt (Round) |
| template <typename T, size_t N, HWY_IF_FLOAT3264(T)> |
| HWY_API Vec128<MakeSigned<T>, N> NearestInt(const Vec128<T, N> v) { |
| return ConvertTo(RebindToSigned<DFromV<decltype(v)>>(), Round(v)); |
| } |
| |
| // ------------------------------ DemoteToNearestInt (Round) |
| template <class DI32, HWY_IF_I32_D(DI32)> |
| HWY_API VFromD<DI32> DemoteToNearestInt(DI32 di32, |
| VFromD<Rebind<double, DI32>> v) { |
| // No single instruction, round then demote. |
| return DemoteTo(di32, Round(v)); |
| } |
| |
| // ================================================== MISC |
| |
| // ------------------------------ SumsOf8 (ShiftRight, Add) |
| template <size_t N> |
| HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) { |
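  // Sum adjacent byte pairs into 16-bit lanes, then fold the 16-bit sums
  // pairwise twice more. The hex digits in each variable name are the byte
  // indices accumulated in that lane (x = don't care, z = zero).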
| const DFromV<decltype(v)> du8; |
| const RepartitionToWide<decltype(du8)> du16; |
| const RepartitionToWide<decltype(du16)> du32; |
| const RepartitionToWide<decltype(du32)> du64; |
| using VU16 = VFromD<decltype(du16)>; |
| |
| const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v)); |
| const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF)); |
| const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420); |
| |
| const VU16 szz_FE_zz_BA_zz_76_zz_32 = |
| BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10))); |
| const VU16 sxx_FC_xx_B8_xx_74_xx_30 = |
| Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32); |
| const VU16 szz_zz_xx_FC_zz_zz_xx_74 = |
| BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30))); |
| const VU16 sxx_xx_xx_F8_xx_xx_xx_70 = |
| Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74); |
| return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF)); |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<int64_t, N / 8> SumsOf8(const Vec128<int8_t, N> v) { |
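  // Same folding scheme as the unsigned version above, but sign-extends the
  // bytes and shifts partial sums upward so that the final arithmetic
  // ShiftRight<48> extracts each 8-byte sum with sign extension.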
| const DFromV<decltype(v)> di8; |
| const RepartitionToWide<decltype(di8)> di16; |
| const RepartitionToWide<decltype(di16)> di32; |
| const RepartitionToWide<decltype(di32)> di64; |
| const RebindToUnsigned<decltype(di32)> du32; |
| const RebindToUnsigned<decltype(di64)> du64; |
| using VI16 = VFromD<decltype(di16)>; |
| |
| const VI16 vFDB97531 = ShiftRight<8>(BitCast(di16, v)); |
| const VI16 vECA86420 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, v))); |
| const VI16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420); |
| |
| const VI16 sDC_zz_98_zz_54_zz_10_zz = |
| BitCast(di16, ShiftLeft<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10))); |
| const VI16 sFC_xx_B8_xx_74_xx_30_xx = |
| Add(sFE_DC_BA_98_76_54_32_10, sDC_zz_98_zz_54_zz_10_zz); |
| const VI16 sB8_xx_zz_zz_30_xx_zz_zz = |
| BitCast(di16, ShiftLeft<32>(BitCast(du64, sFC_xx_B8_xx_74_xx_30_xx))); |
| const VI16 sF8_xx_xx_xx_70_xx_xx_xx = |
| Add(sFC_xx_B8_xx_74_xx_30_xx, sB8_xx_zz_zz_30_xx_zz_zz); |
| return ShiftRight<48>(BitCast(di64, sF8_xx_xx_xx_70_xx_xx_xx)); |
| } |
| |
| // ------------------------------ LoadMaskBits (TestBit) |
| |
| namespace detail { |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 1)> |
| HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) { |
| const RebindToUnsigned<decltype(d)> du; |
  // Easier than Set(), which would require a type wider than 8 bits and thus
  // would not compile for T=uint8_t, N=1.
| const VFromD<D> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))}; |
| |
| // Replicate bytes 8x such that each byte contains the bit that governs it. |
| alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, |
| 1, 1, 1, 1, 1, 1, 1, 1}; |
| const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8)); |
| |
| alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, |
| 1, 2, 4, 8, 16, 32, 64, 128}; |
| return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 2)> |
| HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) { |
| const RebindToUnsigned<decltype(d)> du; |
| alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; |
| return RebindMask( |
| d, TestBit(Set(du, static_cast<uint16_t>(bits)), Load(du, kBit))); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 4)> |
| HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) { |
| const RebindToUnsigned<decltype(d)> du; |
| alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8}; |
| return RebindMask( |
| d, TestBit(Set(du, static_cast<uint32_t>(bits)), Load(du, kBit))); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 8)> |
| HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) { |
| const RebindToUnsigned<decltype(d)> du; |
| alignas(16) static constexpr uint64_t kBit[8] = {1, 2}; |
| return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit))); |
| } |
| |
| } // namespace detail |
| |
// `bits` points to at least 8 readable bytes, not all of which need be valid.
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { |
| uint64_t mask_bits = 0; |
| CopyBytes<(MaxLanes(d) + 7) / 8>(bits, &mask_bits); |
| return detail::LoadMaskBits(d, mask_bits); |
| } |
| |
| // ------------------------------ Dup128MaskFromMaskBits |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) { |
| constexpr size_t kN = MaxLanes(d); |
| if (kN < 8) mask_bits &= (1u << kN) - 1; |
| return detail::LoadMaskBits(d, mask_bits); |
| } |
| |
| // ------------------------------ Mask |
| |
| namespace detail { |
| |
// Returns only the lowest MaxLanes(d) bits of the BitsFromMask result.
| template <class D> |
| constexpr uint64_t OnlyActive(D d, uint64_t bits) { |
| return (d.MaxBytes() == 16) ? bits : bits & ((1ull << d.MaxLanes()) - 1); |
| } |
| |
| } // namespace detail |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 16)> |
| HWY_API uint64_t BitsFromMask(D /*d*/, const MFromD<D> mask) { |
| alignas(16) uint64_t lanes[2]; |
| wasm_v128_store(lanes, mask.raw); |
| |
| constexpr uint64_t kMagic = 0x103070F1F3F80ULL; |
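  // Each byte of lanes[] is 0x00 or 0xFF; multiplying by kMagic makes byte i
  // contribute bit i of the top byte, so the shifts extract one mask bit per
  // byte of each 64-bit half.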
| const uint64_t lo = ((lanes[0] * kMagic) >> 56); |
| const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00; |
| return hi + lo; // exactly 16 bits, no OnlyActive required |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 8)> |
| HWY_API uint64_t BitsFromMask(D /*d*/, const MFromD<D> mask) { |
| constexpr uint64_t kMagic = 0x103070F1F3F80ULL; |
| const uint64_t bytes = |
| static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0)); |
| return (bytes * kMagic) >> 56; // exactly 8 bits, no OnlyActive required |
| } |
| |
| // 32-bit or less: need masking |
| template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 4)> |
| HWY_API uint64_t BitsFromMask(D d, const MFromD<D> mask) { |
| uint64_t bytes = static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0)); |
| // Clear potentially undefined bytes. |
| bytes &= (1ULL << (Lanes(d) * 8)) - 1; |
| constexpr uint64_t kMagic = 0x103070F1F3F80ULL; |
| return detail::OnlyActive(d, (bytes * kMagic) >> 56); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API uint64_t BitsFromMask(D /*d*/, const MFromD<D> mask) { |
| // Remove useless lower half of each u16 while preserving the sign bit. |
| const Rebind<uint8_t, D> d8; |
| using M8 = MFromD<decltype(d8)>; |
| const __i16x8 zero = wasm_i16x8_splat(0); |
| const M8 mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)}; |
| return detail::OnlyActive(d8, BitsFromMask(d8, mask8)); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API uint64_t BitsFromMask(D d, const MFromD<D> mask) { |
| const __i32x4 mask_i = static_cast<__i32x4>(mask.raw); |
| const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8); |
| const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice); |
| alignas(16) uint32_t lanes[4]; |
| wasm_v128_store(lanes, sliced_mask); |
| return detail::OnlyActive(d, lanes[0] | lanes[1] | lanes[2] | lanes[3]); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_API uint64_t BitsFromMask(D d, const MFromD<D> mask) { |
| const __i64x2 mask_i = static_cast<__i64x2>(mask.raw); |
| const __i64x2 slice = wasm_i64x2_make(1, 2); |
| const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice); |
| alignas(16) uint64_t lanes[2]; |
| wasm_v128_store(lanes, sliced_mask); |
| return detail::OnlyActive(d, lanes[0] | lanes[1]); |
| } |
| |
| namespace detail { |
| |
| // Returns 0xFF for bytes with index >= N, otherwise 0. |
| template <size_t N> |
| constexpr __i8x16 BytesAbove() { |
| return /**/ |
| (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1) |
| : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1) |
| : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1) |
| : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1) |
| : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0) |
| : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1) |
| : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1) |
| : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1) |
| : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1) |
| : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
| -1, -1, -1, -1, -1) |
| : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
| -1, -1, -1, -1) |
| : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, |
| -1, -1, -1, -1) |
| : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, |
| -1, -1, -1) |
| : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, |
| -1, -1, -1) |
| : (N == 11) |
| ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1) |
| : (N == 13) |
| ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1) |
| : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1); |
| } |
| |
| } // namespace detail |
| |
// `bits` points to at least 8 writable bytes.
| template <class D> |
| HWY_API size_t StoreMaskBits(D d, const MFromD<D> mask, uint8_t* bits) { |
| const uint64_t mask_bits = BitsFromMask(d, mask); |
| const size_t kNumBytes = (d.MaxLanes() + 7) / 8; |
| CopyBytes<kNumBytes>(&mask_bits, bits); |
| return kNumBytes; |
| } |
| |
| template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 16)> |
| HWY_API size_t CountTrue(D d, const MFromD<D> m) { |
| return PopCount(BitsFromMask(d, m)); |
| } |
| template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 16)> |
| HWY_API size_t CountTrue(D d, const MFromD<D> m) { |
| return PopCount(BitsFromMask(d, m)); |
| } |
| template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 16)> |
| HWY_API size_t CountTrue(D /*d*/, const MFromD<D> m) { |
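  // Each 32-bit mask lane is all-ones or zero: AND each lane with a distinct
  // bit, OR the two halves and count the set bits.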
  const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
  const __i32x4 sliced_mask = wasm_v128_and(m.raw, slice);
  alignas(16) uint64_t lanes[2];
  wasm_v128_store(lanes, sliced_mask);
  return PopCount(lanes[0] | lanes[1]);
| } |
| template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 16)> |
| HWY_API size_t CountTrue(D /*d*/, const MFromD<D> m) { |
| alignas(16) int64_t lanes[2]; |
| wasm_v128_store(lanes, m.raw); |
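  // Each true lane is -1 (all bits set), so the negated sum is the count.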
| return static_cast<size_t>(-(lanes[0] + lanes[1])); |
| } |
| |
| // Partial |
| template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API size_t CountTrue(D d, MFromD<D> m) { |
| // Ensure all undefined bytes are 0. |
| const MFromD<D> mask{detail::BytesAbove<d.MaxBytes()>()}; |
| const Full128<T> dfull; |
| return CountTrue(dfull, Mask128<T>{AndNot(mask, m).raw}); |
| } |
| |
| // Full vector |
| template <class D, HWY_IF_V_SIZE_D(D, 16)> |
| HWY_API bool AllFalse(D d, const MFromD<D> m) { |
| const auto v8 = BitCast(Full128<int8_t>(), VecFromMask(d, m)); |
| return !wasm_v128_any_true(v8.raw); |
| } |
| |
| // Full vector |
| namespace detail { |
| template <typename T> |
| HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) { |
| return wasm_i8x16_all_true(m.raw); |
| } |
| template <typename T> |
| HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) { |
| return wasm_i16x8_all_true(m.raw); |
| } |
| template <typename T> |
| HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) { |
| return wasm_i32x4_all_true(m.raw); |
| } |
| template <typename T> |
| HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) { |
| return wasm_i64x2_all_true(m.raw); |
| } |
| |
| } // namespace detail |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API bool AllTrue(D /* tag */, const Mask128<T> m) { |
| return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m); |
| } |
| |
| // Partial vectors |
| |
| template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API bool AllFalse(D d, const MFromD<D> m) { |
| // Ensure all undefined bytes are 0. |
| const MFromD<D> mask{detail::BytesAbove<d.MaxBytes()>()}; |
| return AllFalse(Full128<T>(), Mask128<T>{AndNot(mask, m).raw}); |
| } |
| |
| template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)> |
| HWY_API bool AllTrue(D d, const MFromD<D> m) { |
| // Ensure all undefined bytes are FF. |
| const MFromD<D> mask{detail::BytesAbove<d.MaxBytes()>()}; |
| return AllTrue(Full128<T>(), Mask128<T>{Or(mask, m).raw}); |
| } |
| |
| template <class D> |
| HWY_API size_t FindKnownFirstTrue(D d, const MFromD<D> mask) { |
| const uint32_t bits = static_cast<uint32_t>(BitsFromMask(d, mask)); |
| return Num0BitsBelowLS1Bit_Nonzero32(bits); |
| } |
| |
| template <class D> |
| HWY_API intptr_t FindFirstTrue(D d, const MFromD<D> mask) { |
| const uint32_t bits = static_cast<uint32_t>(BitsFromMask(d, mask)); |
| return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero32(bits)) : -1; |
| } |
| |
| template <class D> |
| HWY_API size_t FindKnownLastTrue(D d, const MFromD<D> mask) { |
| const uint32_t bits = static_cast<uint32_t>(BitsFromMask(d, mask)); |
| return 31 - Num0BitsAboveMS1Bit_Nonzero32(bits); |
| } |
| |
| template <class D> |
| HWY_API intptr_t FindLastTrue(D d, const MFromD<D> mask) { |
| const uint32_t bits = static_cast<uint32_t>(BitsFromMask(d, mask)); |
| return bits |
| ? (31 - static_cast<intptr_t>(Num0BitsAboveMS1Bit_Nonzero32(bits))) |
| : -1; |
| } |
| |
| // ------------------------------ Compress |
| |
| namespace detail { |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> |
| HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) { |
| HWY_DASSERT(mask_bits < 256); |
| const Simd<T, N, 0> d; |
| const Rebind<uint8_t, decltype(d)> d8; |
| const Simd<uint16_t, N, 0> du; |
| |
| // We need byte indices for TableLookupBytes (one vector's worth for each of |
| // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We |
| // can instead store lane indices and convert to byte indices (2*lane + 0..1), |
| // with the doubling baked into the table. Unpacking nibbles is likely more |
| // costly than the higher cache footprint from storing bytes. |
| alignas(16) static constexpr uint8_t table[256 * 8] = { |
| // PrintCompress16x8Tables |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // |
| 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // |
| 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // |
| 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // |
| 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // |
| 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // |
| 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // |
| 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // |
| 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // |
| 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // |
| 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // |
| 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // |
| 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // |
| 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // |
| 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // |
| 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // |
| 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // |
| 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // |
| 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // |
| 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // |
| 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // |
| 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // |
| 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // |
| 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // |
| 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // |
| 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // |
| 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // |
| 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // |
| 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // |
| 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // |
| 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // |
| 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // |
| 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // |
| 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // |
| 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // |
| 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // |
| 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // |
| 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // |
| 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // |
| 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // |
| 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // |
| 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // |
| 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // |
| 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // |
| 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // |
| 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // |
| 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // |
| 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // |
| 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // |
| 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // |
| 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // |
| 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // |
| 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // |
| 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // |
| 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // |
| 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // |
| 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // |
| 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // |
| 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // |
| 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // |
| 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // |
| 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // |
| 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // |
| 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // |
| 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // |
| 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // |
| 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // |
| 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // |
| 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // |
| 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // |
| 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // |
| 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // |
| 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // |
| 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // |
| 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // |
| 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // |
| 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // |
| 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // |
| 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // |
| 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // |
| 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // |
| 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // |
| 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // |
| 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // |
| 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // |
| 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // |
| 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // |
| 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // |
| 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // |
| 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // |
| 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // |
| 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // |
| 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // |
| 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // |
| 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // |
| 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // |
| 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // |
| 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // |
| 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // |
| 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // |
| 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // |
| 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // |
| 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // |
| 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // |
| 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // |
| 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // |
| 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // |
| 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // |
| 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // |
| 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // |
| 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // |
| 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // |
| 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // |
| 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // |
| 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // |
| 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // |
| 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // |
| 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // |
| 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // |
| 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // |
| 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // |
| 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // |
| 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // |
| 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // |
| 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // |
| 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // |
| 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // |
| 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; |
| |
| const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw}; |
| const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx); |
| return BitCast(d, pairs + Set(du, 0x0100)); |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> |
| HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) { |
| HWY_DASSERT(mask_bits < 256); |
| const Simd<T, N, 0> d; |
| const Rebind<uint8_t, decltype(d)> d8; |
| const Simd<uint16_t, N, 0> du; |
| |
| // We need byte indices for TableLookupBytes (one vector's worth for each of |
| // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We |
| // can instead store lane indices and convert to byte indices (2*lane + 0..1), |
| // with the doubling baked into the table. Unpacking nibbles is likely more |
| // costly than the higher cache footprint from storing bytes. |
| alignas(16) static constexpr uint8_t table[256 * 8] = { |
| // PrintCompressNot16x8Tables |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // |
| 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // |
| 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // |
| 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // |
| 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // |
| 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // |
| 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // |
| 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // |
| 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // |
| 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // |
| 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // |
| 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // |
| 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // |
| 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // |
| 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // |
| 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // |
| 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // |
| 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // |
| 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // |
| 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // |
| 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // |
| 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // |
| 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // |
| 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // |
| 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // |
| 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // |
| 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // |
| 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // |
| 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // |
| 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // |
| 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // |
| 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // |
| 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // |
| 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // |
| 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // |
| 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // |
| 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // |
| 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // |
| 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // |
| 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // |
| 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // |
| 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // |
| 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // |
| 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // |
| 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // |
| 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // |
| 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // |
| 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // |
| 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // |
| 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // |
| 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // |
| 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // |
| 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // |
| 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // |
| 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // |
| 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // |
| 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // |
| 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // |
| 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // |
| 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // |
| 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // |
| 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // |
| 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // |
| 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // |
| 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // |
| 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // |
| 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // |
| 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // |
| 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // |
| 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // |
| 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // |
| 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // |
| 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // |
| 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // |
| 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // |
| 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // |
| 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // |
| 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // |
| 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // |
| 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // |
| 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // |
| 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // |
| 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // |
| 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // |
| 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // |
| 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // |
| 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // |
| 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // |
| 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // |
| 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // |
| 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // |
| 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // |
| 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // |
| 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // |
| 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // |
| 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // |
| 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // |
| 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // |
| 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // |
| 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // |
| 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // |
| 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // |
| 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // |
| 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // |
| 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // |
| 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // |
| 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // |
| 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // |
| 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // |
| 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // |
| 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // |
| 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // |
| 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // |
| 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // |
| 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // |
| 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // |
| 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // |
| 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // |
| 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // |
| 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // |
| 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // |
| 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; |
| |
| const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw}; |
| const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx); |
| return BitCast(d, pairs + Set(du, 0x0100)); |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> |
| HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) { |
| HWY_DASSERT(mask_bits < 16); |
| |
| // There are only 4 lanes, so we can afford to load the index vector directly. |
| alignas(16) static constexpr uint8_t u8_indices[16 * 16] = { |
| // PrintCompress32x4Tables |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // |
| 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // |
| 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // |
| 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // |
| 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // |
| 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // |
| 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // |
| 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // |
| 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // |
| 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // |
| 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // |
| 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; |
| const Simd<T, N, 0> d; |
| const Repartition<uint8_t, decltype(d)> d8; |
| return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> |
| HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) { |
| HWY_DASSERT(mask_bits < 16); |
| |
| // There are only 4 lanes, so we can afford to load the index vector directly. |
| alignas(16) static constexpr uint8_t u8_indices[16 * 16] = { |
| // PrintCompressNot32x4Tables |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, |
| 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, |
| 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, |
| 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, |
| 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, |
| 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, |
| 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, |
| 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
| 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, |
| 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, |
| 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, |
| 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, |
| 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, |
| 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, |
| 12, 13, 14, 15}; |
| const Simd<T, N, 0> d; |
| const Repartition<uint8_t, decltype(d)> d8; |
| return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> |
| HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) { |
| HWY_DASSERT(mask_bits < 4); |
| |
| // There are only 2 lanes, so we can afford to load the index vector directly. |
| alignas(16) static constexpr uint8_t u8_indices[4 * 16] = { |
| // PrintCompress64x2Tables |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
| 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; |
| |
| const Simd<T, N, 0> d; |
| const Repartition<uint8_t, decltype(d)> d8; |
| return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); |
| } |
| |
| template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> |
| HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) { |
| HWY_DASSERT(mask_bits < 4); |
| |
| // There are only 2 lanes, so we can afford to load the index vector directly. |
| alignas(16) static constexpr uint8_t u8_indices[4 * 16] = { |
| // PrintCompressNot64x2Tables |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
| 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; |
| |
| const Simd<T, N, 0> d; |
| const Repartition<uint8_t, decltype(d)> d8; |
| return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); |
| } |
| |
| // Helper functions called by both Compress and CompressStore - avoids a |
| // redundant BitsFromMask in the latter. |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) { |
| const auto idx = detail::IdxFromBits<T, N>(mask_bits); |
| const DFromV<decltype(v)> d; |
| const RebindToSigned<decltype(d)> di; |
| return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx))); |
| } |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) { |
| const auto idx = detail::IdxFromNotBits<T, N>(mask_bits); |
| const DFromV<decltype(v)> d; |
| const RebindToSigned<decltype(d)> di; |
| return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx))); |
| } |
| |
| } // namespace detail |
| |
| template <typename T> |
| struct CompressIsPartition { |
| #if HWY_TARGET == HWY_WASM_EMU256 |
| enum { value = 0 }; |
| #else |
| enum { value = (sizeof(T) != 1) }; |
| #endif |
| }; |
| |
| // Single lane: no-op |
| template <typename T> |
| HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { |
| return v; |
| } |
| |
| // Two lanes: conditional swap |
| template <typename T, HWY_IF_T_SIZE(T, 8)> |
| HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) { |
| // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. |
| const Full128<T> d; |
| const Vec128<T> m = VecFromMask(d, mask); |
| const Vec128<T> maskL = DupEven(m); |
| const Vec128<T> maskH = DupOdd(m); |
| const Vec128<T> swap = AndNot(maskL, maskH); |
| return IfVecThenElse(swap, Shuffle01(v), v); |
| } |
| |
| // General case, 2 or 4 byte lanes |
| template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 2))> |
| HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) { |
| const DFromV<decltype(v)> d; |
| return detail::Compress(v, BitsFromMask(d, mask)); |
| } |
| |
| // Single lane: no-op |
| template <typename T> |
| HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { |
| return v; |
| } |
| |
| // Two lanes: conditional swap |
| template <typename T, HWY_IF_T_SIZE(T, 8)> |
| HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) { |
| // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. |
| const Full128<T> d; |
| const Vec128<T> m = VecFromMask(d, mask); |
| const Vec128<T> maskL = DupEven(m); |
| const Vec128<T> maskH = DupOdd(m); |
| const Vec128<T> swap = AndNot(maskH, maskL); |
| return IfVecThenElse(swap, Shuffle01(v), v); |
| } |
| |
| // General case, 2 or 4 byte lanes |
| template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))> |
| HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) { |
| const DFromV<decltype(v)> d; |
| // For partial vectors, we cannot pull the Not() into the table because |
| // BitsFromMask clears the upper bits. |
| if (N < 16 / sizeof(T)) { |
| return detail::Compress(v, BitsFromMask(d, Not(mask))); |
| } |
| return detail::CompressNot(v, BitsFromMask(d, mask)); |
| } |
| |
| // ------------------------------ CompressBlocksNot |
| HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v, |
| Mask128<uint64_t> /* m */) { |
| return v; |
| } |
| |
| // ------------------------------ CompressBits |
| template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> |
| HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, |
| const uint8_t* HWY_RESTRICT bits) { |
| uint64_t mask_bits = 0; |
| constexpr size_t kNumBytes = (N + 7) / 8; |
| CopyBytes<kNumBytes>(bits, &mask_bits); |
| if (N < 8) { |
| mask_bits &= (1ull << N) - 1; |
| } |
| |
| return detail::Compress(v, mask_bits); |
| } |
| |
| // ------------------------------ CompressStore |
| template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> |
| HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d, |
| TFromD<D>* HWY_RESTRICT unaligned) { |
| const uint64_t mask_bits = BitsFromMask(d, mask); |
| const auto c = detail::Compress(v, mask_bits); |
| StoreU(c, d, unaligned); |
| return PopCount(mask_bits); |
| } |
| |
| // ------------------------------ CompressBlendedStore |
| template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> |
| HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d, |
| TFromD<D>* HWY_RESTRICT unaligned) { |
| const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16 |
| const uint64_t mask_bits = BitsFromMask(d, m); |
| const size_t count = PopCount(mask_bits); |
| const VFromD<decltype(du)> compressed = |
| detail::Compress(BitCast(du, v), mask_bits); |
| const MFromD<D> store_mask = RebindMask(d, FirstN(du, count)); |
| BlendedStore(BitCast(d, compressed), store_mask, d, unaligned); |
| return count; |
| } |
| |
| // ------------------------------ CompressBitsStore |
| |
| template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> |
| HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits, |
| D d, TFromD<D>* HWY_RESTRICT unaligned) { |
| uint64_t mask_bits = 0; |
| constexpr size_t kN = MaxLanes(d); |
| CopyBytes<(kN + 7) / 8>(bits, &mask_bits); |
| if (kN < 8) { |
| mask_bits &= (1ull << kN) - 1; |
| } |
| |
| const auto c = detail::Compress(v, mask_bits); |
| StoreU(c, d, unaligned); |
| return PopCount(mask_bits); |
| } |
| |
| // ------------------------------ StoreInterleaved2/3/4 |
| |
| // HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in |
| // generic_ops-inl.h. |
| |
| // ------------------------------ Additional mask logical operations |
| template <class T> |
| HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) { |
| return mask; |
| } |
| template <class T> |
| HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) { |
| const FixedTag<T, 2> d; |
| const auto vmask = VecFromMask(d, mask); |
| return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask))); |
| } |
| template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)> |
| HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) { |
| const Simd<T, N, 0> d; |
| const auto vmask = VecFromMask(d, mask); |
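  // x | Neg(x) over the 64-bit reinterpretation sets every bit at or above
  // the lowest set bit, i.e. marks every lane at or after the first true lane.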
| const auto neg_vmask = |
| ResizeBitCast(d, Neg(ResizeBitCast(Full64<int64_t>(), vmask))); |
| return MaskFromVec(Or(vmask, neg_vmask)); |
| } |
| template <class T, HWY_IF_NOT_T_SIZE(T, 8)> |
| HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) { |
| const Full128<T> d; |
| const Repartition<int64_t, decltype(d)> di64; |
| |
| auto vmask = BitCast(di64, VecFromMask(d, mask)); |
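  // Within each 64-bit half, Or(x, Neg(x)) sets all bits at or above the
  // lowest set bit.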
| vmask = Or(vmask, Neg(vmask)); |
| |
| // Copy the sign bit of the first int64_t lane to the second int64_t lane |
| const auto vmask2 = BroadcastSignBit(InterleaveLower(Zero(di64), vmask)); |
| return MaskFromVec(BitCast(d, Or(vmask, vmask2))); |
| } |
| |
| template <class T, size_t N> |
| HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) { |
| return Not(SetAtOrAfterFirst(mask)); |
| } |
| |
| template <class T> |
| HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) { |
| return mask; |
| } |
| template <class T> |
| HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) { |
| const FixedTag<T, 2> d; |
| const RebindToSigned<decltype(d)> di; |
| |
| const auto vmask = BitCast(di, VecFromMask(d, mask)); |
| const auto zero = Zero(di); |
| const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero); |
| return MaskFromVec(BitCast(d, And(vmask, vmask2))); |
| } |
| template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)> |
| HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) { |
| const Simd<T, N, 0> d; |
| const RebindToSigned<decltype(d)> di; |
| |
| const auto vmask = ResizeBitCast(Full64<int64_t>(), VecFromMask(d, mask)); |
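  // And(x, Neg(x)) on the 64-bit view isolates the lowest set bit, which is
  // bit 0 of the first true lane; the per-lane Neg then expands that single
  // bit to all-ones within that lane only.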
| const auto only_first_vmask = |
| BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask))))); |
| return MaskFromVec(only_first_vmask); |
| } |
| template <class T, HWY_IF_NOT_T_SIZE(T, 8)> |
| HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) { |
| const Full128<T> d; |
| const RebindToSigned<decltype(d)> di; |
| const Repartition<int64_t, decltype(d)> di64; |
| |
| const auto zero = Zero(di64); |
| const auto vmask = BitCast(di64, VecFromMask(d, mask)); |
| const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero); |
| const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask)))); |
| return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2)))); |
| } |
| |
| template <class T> |
| HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) { |
| const FixedTag<T, 1> d; |
| const RebindToSigned<decltype(d)> di; |
| using TI = MakeSigned<T>; |
| |
| return RebindMask(d, MaskFromVec(Set(di, TI(-1)))); |
| } |
| template <class T, size_t N, HWY_IF_LANES_GT(N, 1)> |
| HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) { |
| const Simd<T, N, 0> d; |
| return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask)))); |
| } |
| |
| // ------------------------------ MulEven/Odd (Load) |
| |
| template <class T, HWY_IF_UI64(T)> |
| HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) { |
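  // Mul128 returns the lower 64 bits of the 128-bit product and writes the
  // upper 64 bits via the pointer, hence mul[] = {lo, hi}.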
| alignas(16) T mul[2]; |
| mul[0] = Mul128(static_cast<T>(wasm_i64x2_extract_lane(a.raw, 0)), |
| static_cast<T>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]); |
| return Load(Full128<T>(), mul); |
| } |
| |
| template <class T, HWY_IF_UI64(T)> |
| HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) { |
| alignas(16) T mul[2]; |
| mul[0] = Mul128(static_cast<T>(wasm_i64x2_extract_lane(a.raw, 1)), |
| static_cast<T>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]); |
| return Load(Full128<T>(), mul); |
| } |
| |
| // ------------------------------ I64/U64 MulHigh (GetLane) |
| template <class T, HWY_IF_UI64(T)> |
| HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) { |
| T hi; |
| Mul128(GetLane(a), GetLane(b), &hi); |
| return Set(Full64<T>(), hi); |
| } |
| |
| template <class T, HWY_IF_UI64(T)> |
| HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) { |
| T hi_0; |
| T hi_1; |
| Mul128(GetLane(a), GetLane(b), &hi_0); |
| Mul128(detail::ExtractLane<1>(a), detail::ExtractLane<1>(b), &hi_1); |
| return Dup128VecFromValues(Full128<T>(), hi_0, hi_1); |
| } |
| |
| // ------------------------------ WidenMulPairwiseAdd (MulAdd, PromoteEvenTo) |
| |
| // Generic for all vector lengths. |
| template <class DF, HWY_IF_F32_D(DF), |
| class VBF = VFromD<Repartition<bfloat16_t, DF>>> |
| HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) { |
| return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), |
| Mul(PromoteOddTo(df, a), PromoteOddTo(df, b))); |
| } |
| |
| // Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is |
| // safe. |
| template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16), |
| class V16 = VFromD<RepartitionToNarrow<D32>>> |
| HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) { |
| return VFromD<D32>{wasm_i32x4_dot_i16x8(a.raw, b.raw)}; |
| } |
| |
| template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16), |
| class VU16 = VFromD<RepartitionToNarrow<DU32>>> |
| HWY_API VFromD<DU32> WidenMulPairwiseAdd(DU32 du32, VU16 a, VU16 b) { |
| return MulAdd(PromoteEvenTo(du32, a), PromoteEvenTo(du32, b), |
| Mul(PromoteOddTo(du32, a), PromoteOddTo(du32, b))); |
| } |
| |
| // ------------------------------ ReorderWidenMulAccumulate |
| |
| template <class D32, HWY_IF_UI32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16), |
| class V16 = VFromD<RepartitionToNarrow<D32>>> |
| HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d32, V16 a, V16 b, |
| const VFromD<D32> sum0, |
| VFromD<D32>& /*sum1*/) { |
| return sum0 + WidenMulPairwiseAdd(d32, a, b); |
| } |
| |
| // ------------------------------ RearrangeToOddPlusEven |
| template <size_t N> |
| HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven( |
| const Vec128<int32_t, N> sum0, const Vec128<int32_t, N> /*sum1*/) { |
| return sum0; // invariant already holds |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<uint32_t, N> RearrangeToOddPlusEven( |
| const Vec128<uint32_t, N> sum0, const Vec128<uint32_t, N> /*sum1*/) { |
| return sum0; // invariant already holds |
| } |
| |
| template <size_t N> |
| HWY_API Vec128<float, N> RearrangeToOddPlusEven(const Vec128<float, N> sum0, |
| const Vec128<float, N> sum1) { |
| return Add(sum0, sum1); |
| } |
| |
| // ------------------------------ Reductions |
| |
| // Nothing native, generic_ops-inl defines SumOfLanes and ReduceSum. |
| |
| // ------------------------------ Lt128 |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> |
| HWY_INLINE MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) { |
| // Truth table of Eq and Lt for Hi and Lo u64. |
| // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) |
| // =H =L cH cL | out = cH | (=H & cL) |
| // 0 0 0 0 | 0 |
| // 0 0 0 1 | 0 |
| // 0 0 1 0 | 1 |
| // 0 0 1 1 | 1 |
| // 0 1 0 0 | 0 |
| // 0 1 0 1 | 0 |
| // 0 1 1 0 | 1 |
| // 1 0 0 0 | 0 |
| // 1 0 0 1 | 1 |
| // 1 1 0 0 | 0 |
| const MFromD<D> eqHL = Eq(a, b); |
| const VFromD<D> ltHL = VecFromMask(d, Lt(a, b)); |
| // We need to bring cL to the upper lane/bit corresponding to cH. Comparing |
| // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the |
| // comparison result leftwards requires only 4. IfThenElse compiles to the |
| // same code as OrAnd(). |
| const VFromD<D> ltLx = DupEven(ltHL); |
| const VFromD<D> outHx = IfThenElse(eqHL, ltLx, ltHL); |
| return MaskFromVec(DupOdd(outHx)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_INLINE MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) { |
| const VFromD<D> ltHL = VecFromMask(d, Lt(a, b)); |
| return MaskFromVec(InterleaveUpper(d, ltHL, ltHL)); |
| } |
| |
| // ------------------------------ Eq128 |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> |
| HWY_INLINE MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) { |
| const VFromD<D> eqHL = VecFromMask(d, Eq(a, b)); |
| return MaskFromVec(And(Reverse2(d, eqHL), eqHL)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_INLINE MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) { |
| const VFromD<D> eqHL = VecFromMask(d, Eq(a, b)); |
| return MaskFromVec(InterleaveUpper(d, eqHL, eqHL)); |
| } |
| |
| // ------------------------------ Ne128 |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> |
| HWY_INLINE MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) { |
| const VFromD<D> neHL = VecFromMask(d, Ne(a, b)); |
| return MaskFromVec(Or(Reverse2(d, neHL), neHL)); |
| } |
| |
| template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> |
| HWY_INLINE MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) { |
| const VFromD<D> neHL = VecFromMask(d, Ne(a, b)); |
| return MaskFromVec(InterleaveUpper(d, neHL, neHL)); |
| } |
| |
| // ------------------------------ Min128, Max128 (Lt128) |
| |
| // Without a native OddEven, it seems infeasible to go faster than Lt128. |
| template <class D> |
| HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) { |
| return IfThenElse(Lt128(d, a, b), a, b); |
| } |
| |
| template <class D> |
| HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) { |
| return IfThenElse(Lt128(d, b, a), a, b); |
| } |
| |
| template <class D> |
| HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) { |
| return IfThenElse(Lt128Upper(d, a, b), a, b); |
| } |
| |
| template <class D> |
| HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) { |
| return IfThenElse(Lt128Upper(d, b, a), a, b); |
| } |
| |
| // NOLINTNEXTLINE(google-readability-namespace-comments) |
| } // namespace HWY_NAMESPACE |
| } // namespace hwy |
| HWY_AFTER_NAMESPACE(); |