| // Copyright 2019 Google LLC |
| // SPDX-License-Identifier: Apache-2.0 |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // Single-element vectors and operations. |
| // External include guard in highway.h - see comment there. |
| |
| #include <stdint.h> |
| #ifndef HWY_NO_LIBCXX |
| #include <math.h> // sqrtf |
| #endif |
| |
| #include "third_party/highway/hwy/ops/shared-inl.h" |
| |
| HWY_BEFORE_NAMESPACE(); |
| namespace hwy { |
| namespace HWY_NAMESPACE { |
| |
| // Single instruction, single data. |
| template <typename T> |
| using Sisd = Simd<T, 1, 0>; |
| |
| // (Wrapper class required for overloading comparison operators.) |
| template <typename T> |
| struct Vec1 { |
| using PrivateT = T; // only for DFromV |
| static constexpr size_t kPrivateN = 1; // only for DFromV |
| |
| HWY_INLINE Vec1() = default; |
| Vec1(const Vec1&) = default; |
| Vec1& operator=(const Vec1&) = default; |
| HWY_INLINE explicit Vec1(const T t) : raw(t) {} |
| |
| HWY_INLINE Vec1& operator*=(const Vec1 other) { |
| return *this = (*this * other); |
| } |
| HWY_INLINE Vec1& operator/=(const Vec1 other) { |
| return *this = (*this / other); |
| } |
| HWY_INLINE Vec1& operator+=(const Vec1 other) { |
| return *this = (*this + other); |
| } |
| HWY_INLINE Vec1& operator-=(const Vec1 other) { |
| return *this = (*this - other); |
| } |
| HWY_INLINE Vec1& operator%=(const Vec1 other) { |
| return *this = (*this % other); |
| } |
| HWY_INLINE Vec1& operator&=(const Vec1 other) { |
| return *this = (*this & other); |
| } |
| HWY_INLINE Vec1& operator|=(const Vec1 other) { |
| return *this = (*this | other); |
| } |
| HWY_INLINE Vec1& operator^=(const Vec1 other) { |
| return *this = (*this ^ other); |
| } |
| |
| T raw; |
| }; |
| |
| // 0 or FF..FF, same size as Vec1. |
| template <typename T> |
| struct Mask1 { |
| using Raw = hwy::MakeUnsigned<T>; |
| |
| using PrivateT = T; // only for DFromM |
| static constexpr size_t kPrivateN = 1; // only for DFromM |
| |
| static HWY_INLINE Mask1<T> FromBool(bool b) { |
| Mask1<T> mask; |
| mask.bits = b ? static_cast<Raw>(~Raw{0}) : 0; |
| return mask; |
| } |
| |
| Raw bits; |
| }; |
| |
| template <class V> |
| using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>; |
| |
| template <class M> |
| using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>; |
| |
| template <class V> |
| using TFromV = typename V::PrivateT; |
| |
| // ------------------------------ BitCast |
| |
| template <class DTo, typename TTo = TFromD<DTo>, typename TFrom> |
| HWY_API Vec1<TTo> BitCast(DTo /* tag */, Vec1<TFrom> v) { |
| static_assert(sizeof(TTo) <= sizeof(TFrom), "Promoting is undefined"); |
| TTo to; |
| CopyBytes<sizeof(TTo)>(&v.raw, &to); // not same size - ok to shrink |
| return Vec1<TTo>(to); |
| } |
| |
| // ------------------------------ Zero |
| |
| template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> |
| HWY_API Vec1<T> Zero(D /* tag */) { |
| return Vec1<T>(ConvertScalarTo<T>(0)); |
| } |
| |
| template <class D> |
| using VFromD = decltype(Zero(D())); |
| |
| // ------------------------------ Set |
| template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2> |
| HWY_API Vec1<T> Set(D /* tag */, const T2 t) { |
| return Vec1<T>(static_cast<T>(t)); |
| } |
| |
| // ------------------------------ Undefined |
| template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> |
| HWY_API Vec1<T> Undefined(D d) { |
| return Zero(d); |
| } |
| |
| // ------------------------------ Iota |
| template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2> |
| HWY_API Vec1<T> Iota(const D /* tag */, const T2 first) { |
| return Vec1<T>(static_cast<T>(first)); |
| } |
| |
| // ------------------------------ ResizeBitCast |
| |
| template <class D, typename FromV> |
| HWY_API VFromD<D> ResizeBitCast(D /* tag */, FromV v) { |
| using TFrom = TFromV<FromV>; |
| using TTo = TFromD<D>; |
| constexpr size_t kCopyLen = HWY_MIN(sizeof(TFrom), sizeof(TTo)); |
| TTo to{}; |
| CopyBytes<kCopyLen>(&v.raw, &to); |
| return VFromD<D>(to); |
| } |
| |
| namespace detail { |
| |
| // ResizeBitCast on the HWY_SCALAR target has zero-extending semantics if |
| // sizeof(TFromD<DTo>) is greater than sizeof(TFromV<FromV>) |
| template <class FromSizeTag, class ToSizeTag, class DTo, class DFrom> |
| HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(FromSizeTag /* from_size_tag */, |
| ToSizeTag /* to_size_tag */, |
| DTo d_to, DFrom /*d_from*/, |
| VFromD<DFrom> v) { |
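| // ResizeBitCast value-initializes its destination lane (TTo to{}), so any |
| // bytes beyond sizeof(TFromV<FromV>) are already zero. |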
| return ResizeBitCast(d_to, v); |
| } |
| |
| } // namespace detail |
| |
| // ------------------------------ Dup128VecFromValues |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 1)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/, |
| TFromD<D> /*t2*/, TFromD<D> /*t3*/, |
| TFromD<D> /*t4*/, TFromD<D> /*t5*/, |
| TFromD<D> /*t6*/, TFromD<D> /*t7*/, |
| TFromD<D> /*t8*/, TFromD<D> /*t9*/, |
| TFromD<D> /*t10*/, TFromD<D> /*t11*/, |
| TFromD<D> /*t12*/, TFromD<D> /*t13*/, |
| TFromD<D> /*t14*/, TFromD<D> /*t15*/) { |
| return VFromD<D>(t0); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 2)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/, |
| TFromD<D> /*t2*/, TFromD<D> /*t3*/, |
| TFromD<D> /*t4*/, TFromD<D> /*t5*/, |
| TFromD<D> /*t6*/, TFromD<D> /*t7*/) { |
| return VFromD<D>(t0); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 4)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/, |
| TFromD<D> /*t2*/, TFromD<D> /*t3*/) { |
| return VFromD<D>(t0); |
| } |
| |
| template <class D, HWY_IF_T_SIZE_D(D, 8)> |
| HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/) { |
| return VFromD<D>(t0); |
| } |
| |
| // ================================================== LOGICAL |
| |
| // ------------------------------ Not |
| |
| template <typename T> |
| HWY_API Vec1<T> Not(const Vec1<T> v) { |
| using TU = MakeUnsigned<T>; |
| const Sisd<TU> du; |
| return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, v).raw))); |
| } |
| |
| // ------------------------------ And |
| |
| template <typename T> |
| HWY_API Vec1<T> And(const Vec1<T> a, const Vec1<T> b) { |
| using TU = MakeUnsigned<T>; |
| const Sisd<TU> du; |
| return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw & BitCast(du, b).raw)); |
| } |
| template <typename T> |
| HWY_API Vec1<T> operator&(const Vec1<T> a, const Vec1<T> b) { |
| return And(a, b); |
| } |
| |
| // ------------------------------ AndNot |
| |
| template <typename T> |
| HWY_API Vec1<T> AndNot(const Vec1<T> a, const Vec1<T> b) { |
| using TU = MakeUnsigned<T>; |
| const Sisd<TU> du; |
| return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, a).raw & |
| BitCast(du, b).raw))); |
| } |
| |
| // ------------------------------ Or |
| |
| template <typename T> |
| HWY_API Vec1<T> Or(const Vec1<T> a, const Vec1<T> b) { |
| using TU = MakeUnsigned<T>; |
| const Sisd<TU> du; |
| return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw | BitCast(du, b).raw)); |
| } |
| template <typename T> |
| HWY_API Vec1<T> operator|(const Vec1<T> a, const Vec1<T> b) { |
| return Or(a, b); |
| } |
| |
| // ------------------------------ Xor |
| |
| template <typename T> |
| HWY_API Vec1<T> Xor(const Vec1<T> a, const Vec1<T> b) { |
| using TU = MakeUnsigned<T>; |
| const Sisd<TU> du; |
| return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw ^ BitCast(du, b).raw)); |
| } |
| template <typename T> |
| HWY_API Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) { |
| return Xor(a, b); |
| } |
| |
| // ------------------------------ Xor3 |
| |
| template <typename T> |
| HWY_API Vec1<T> Xor3(Vec1<T> x1, Vec1<T> x2, Vec1<T> x3) { |
| return Xor(x1, Xor(x2, x3)); |
| } |
| |
| // ------------------------------ Or3 |
| |
| template <typename T> |
| HWY_API Vec1<T> Or3(Vec1<T> o1, Vec1<T> o2, Vec1<T> o3) { |
| return Or(o1, Or(o2, o3)); |
| } |
| |
| // ------------------------------ OrAnd |
| |
| template <typename T> |
| HWY_API Vec1<T> OrAnd(const Vec1<T> o, const Vec1<T> a1, const Vec1<T> a2) { |
| return Or(o, And(a1, a2)); |
| } |
| |
| // ------------------------------ Mask |
| |
| template <class DTo, typename TTo = TFromD<DTo>, typename TFrom> |
| HWY_API Mask1<TTo> RebindMask(DTo /*tag*/, Mask1<TFrom> m) { |
| static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); |
| return Mask1<TTo>{m.bits}; |
| } |
| |
| // v must be 0 or FF..FF. |
| template <typename T> |
| HWY_API Mask1<T> MaskFromVec(const Vec1<T> v) { |
| Mask1<T> mask; |
| CopySameSize(&v, &mask); |
| return mask; |
| } |
| |
| template <class D> |
| using MFromD = decltype(MaskFromVec(VFromD<D>())); |
| |
| template <class D, typename T = TFromD<D>> |
| Vec1<T> VecFromMask(D /* tag */, const Mask1<T> mask) { |
| Vec1<T> v; |
| CopySameSize(&mask, &v); |
| return v; |
| } |
| |
| template <class D> |
| uint64_t BitsFromMask(D, MFromD<D> mask) { |
| return mask.bits ? 1 : 0; |
| } |
| |
| template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> |
| HWY_API Mask1<T> FirstN(D /*tag*/, size_t n) { |
| return Mask1<T>::FromBool(n != 0); |
| } |
| |
| // ------------------------------ IfVecThenElse |
| template <typename T> |
| HWY_API Vec1<T> IfVecThenElse(Vec1<T> mask, Vec1<T> yes, Vec1<T> no) { |
| return IfThenElse(MaskFromVec(mask), yes, no); |
| } |
| |
| // ------------------------------ CopySign |
| template <typename T> |
| HWY_API Vec1<T> CopySign(const Vec1<T> magn, const Vec1<T> sign) { |
| static_assert(IsFloat<T>(), "Only makes sense for floating-point"); |
| const DFromV<decltype(magn)> d; |
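| // Take the sign bit from `sign` and all remaining bits from `magn`. |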
| return BitwiseIfThenElse(SignBit(d), sign, magn); |
| } |
| |
| // ------------------------------ CopySignToAbs |
| template <typename T> |
| HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) { |
| static_assert(IsFloat<T>(), "Only makes sense for floating-point"); |
| const Sisd<T> d; |
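| // `abs` is required to have a cleared sign bit, so OR-ing in the sign bit of |
| // `sign` is sufficient. |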
| return OrAnd(abs, SignBit(d), sign); |
| } |
| |
| // ------------------------------ BroadcastSignBit |
| template <typename T> |
| HWY_API Vec1<T> BroadcastSignBit(const Vec1<T> v) { |
| return Vec1<T>(ScalarShr(v.raw, sizeof(T) * 8 - 1)); |
| } |
| |
| // ------------------------------ PopulationCount |
| |
| #ifdef HWY_NATIVE_POPCNT |
| #undef HWY_NATIVE_POPCNT |
| #else |
| #define HWY_NATIVE_POPCNT |
| #endif |
| |
| template <typename T> |
| HWY_API Vec1<T> PopulationCount(Vec1<T> v) { |
| return Vec1<T>(static_cast<T>(PopCount(v.raw))); |
| } |
| |
| // ------------------------------ IfThenElse |
| |
| // Returns mask ? yes : no. |
| template <typename T> |
| HWY_API Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes, |
| const Vec1<T> no) { |
| return mask.bits ? yes : no; |
| } |
| |
| template <typename T> |
| HWY_API Vec1<T> IfThenElseZero(const Mask1<T> mask, const Vec1<T> yes) { |
| return mask.bits ? yes : Vec1<T>(ConvertScalarTo<T>(0)); |
| } |
| |
| template <typename T> |
| HWY_API Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) { |
| return mask.bits ? Vec1<T>(ConvertScalarTo<T>(0)) : no; |
| } |
| |
| template <typename T> |
| HWY_API Vec1<T> IfNegativeThenElse(Vec1<T> v, Vec1<T> yes, Vec1<T> no) { |
| const DFromV<decltype(v)> d; |
| const RebindToSigned<decltype(d)> di; |
| const auto vi = BitCast(di, v); |
| |
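| // Reinterpreting as signed turns the comparison into a sign-bit test, which |
| // also covers floating-point inputs (e.g. -0.0f selects `yes`). |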
| return vi.raw < 0 ? yes : no; |
| } |
| |
| // ------------------------------ Mask logical |
| |
| template <typename T> |
| HWY_API Mask1<T> Not(const Mask1<T> m) { |
| return MaskFromVec(Not(VecFromMask(Sisd<T>(), m))); |
| } |
| |
| template <typename T> |
| HWY_API Mask1<T> And(const Mask1<T> a, Mask1<T> b) { |
| const Sisd<T> d; |
| return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); |
| } |
| |
| template <typename T> |
| HWY_API Mask1<T> AndNot(const Mask1<T> a, Mask1<T> b) { |
| const Sisd<T> d; |
| return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); |
| } |
| |
| template <typename T> |
| HWY_API Mask1<T> Or(const Mask1<T> a, Mask1<T> b) { |
| const Sisd<T> d; |
| return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); |
| } |
| |
| template <typename T> |
| HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) { |
| const Sisd<T> d; |
| return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); |
| } |
| |
| template <typename T> |
| HWY_API Mask1<T> ExclusiveNeither(const Mask1<T> a, Mask1<T> b) { |
| const Sisd<T> d; |
| return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); |
| } |
| |
| template <class T> |
| HWY_API Mask1<T> SetAtOrAfterFirst(Mask1<T> mask) { |
| return mask; |
| } |
| |
| template <class T> |
| HWY_API Mask1<T> SetBeforeFirst(Mask1<T> mask) { |
| return Not(mask); |
| } |
| |
| template <class T> |
| HWY_API Mask1<T> SetOnlyFirst(Mask1<T> mask) { |
| return mask; |
| } |
| |
| template <class T> |
| HWY_API Mask1<T> SetAtOrBeforeFirst(Mask1<T> /*mask*/) { |
| return Mask1<T>::FromBool(true); |
| } |
| |
| // ------------------------------ LowerHalfOfMask |
| |
| #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK |
| #undef HWY_NATIVE_LOWER_HALF_OF_MASK |
| #else |
| #define HWY_NATIVE_LOWER_HALF_OF_MASK |
| #endif |
| |
| template <class D> |
| HWY_API MFromD<D> LowerHalfOfMask(D /*d*/, MFromD<D> m) { |
| return m; |
| } |
| |
| // ================================================== SHIFTS |
| |
| // ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit) |
| |
| template <int kBits, typename T> |
| HWY_API Vec1<T> ShiftLeft(const Vec1<T> v) { |
| static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); |
| return Vec1<T>( |
| static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits)); |
| } |
| |
| template <int kBits, typename T> |
| HWY_API Vec1<T> ShiftRight(const Vec1<T> v) { |
| static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); |
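| // ScalarShr is an arithmetic (sign-extending) shift for signed T and a |
| // logical shift for unsigned T. |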
| return Vec1<T>(ScalarShr(v.raw, kBits)); |
| } |
| |
| // ------------------------------ RotateRight (ShiftRight) |
| template <int kBits, typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> |
| HWY_API Vec1<T> RotateRight(const Vec1<T> v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| |
| constexpr size_t kSizeInBits = sizeof(T) * 8; |
| static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); |
| if (kBits == 0) return v; |
| |
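| // HWY_MIN keeps the ShiftLeft count valid even when this template is |
| // instantiated with kBits == 0 (that case already returned above). |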
| return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))), |
| ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v)); |
| } |
| |
| // ------------------------------ ShiftLeftSame (BroadcastSignBit) |
| |
| template <typename T> |
| HWY_API Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) { |
| return Vec1<T>( |
| static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits)); |
| } |
| |
| template <typename T> |
| HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) { |
| return Vec1<T>(ScalarShr(v.raw, bits)); |
| } |
| |
| // ------------------------------ Shl |
| |
| // Single-lane => same as ShiftLeftSame except for the argument type. |
| template <typename T> |
| HWY_API Vec1<T> operator<<(const Vec1<T> v, const Vec1<T> bits) { |
| return ShiftLeftSame(v, static_cast<int>(bits.raw)); |
| } |
| |
| template <typename T> |
| HWY_API Vec1<T> operator>>(const Vec1<T> v, const Vec1<T> bits) { |
| return ShiftRightSame(v, static_cast<int>(bits.raw)); |
| } |
| |
| // ================================================== ARITHMETIC |
| |
| template <typename T> |
| HWY_API Vec1<T> operator+(Vec1<T> a, Vec1<T> b) { |
| const uint64_t a64 = static_cast<uint64_t>(a.raw); |
| const uint64_t b64 = static_cast<uint64_t>(b.raw); |
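| // Perform the addition in uint64_t to avoid signed-overflow UB; the result |
| // is then reduced to the width of T (wraparound semantics). |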
| return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0)))); |
| } |
| HWY_API Vec1<float> operator+(const Vec1<float> a, const Vec1<float> b) { |
| return Vec1<float>(a.raw + b.raw); |
| } |
| HWY_API Vec1<double> operator+(const Vec1<double> a, const Vec1<double> b) { |
| return Vec1<double>(a.raw + b.raw); |
| } |
| |
| template <typename T> |
| HWY_API Vec1<T> operator-(Vec1<T> a, Vec1<T> b) { |
| const uint64_t a64 = static_cast<uint64_t>(a.raw); |
| const uint64_t b64 = static_cast<uint64_t>(b.raw); |
| return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0)))); |
| } |
| HWY_API Vec1<float> operator-(const Vec1<float> a, const Vec1<float> b) { |
| return Vec1<float>(a.raw - b.raw); |
| } |
| HWY_API Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) { |
| return Vec1<double>(a.raw - b.raw); |
| } |
| |
| // ------------------------------ SumsOf8 |
| |
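| // With a single lane there is no group of 8; the "sum" is simply the lone |
| // lane widened to 64 bits. |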
| HWY_API Vec1<int64_t> SumsOf8(const Vec1<int8_t> v) { |
| return Vec1<int64_t>(v.raw); |
| } |
| HWY_API Vec1<uint64_t> SumsOf8(const Vec1<uint8_t> v) { |
| return Vec1<uint64_t>(v.raw); |
| } |
| |
| // ------------------------------ SumsOf2 |
| |
| template <class T> |
| HWY_API Vec1<MakeWide<T>> SumsOf2(const Vec1<T> v) { |
| const DFromV<decltype(v)> d; |
| const Rebind<MakeWide<T>, decltype(d)> dw; |
| return PromoteTo(dw, v); |
| } |
| |
| // ------------------------------ SaturatedAdd |
| |
| // Returns a + b clamped to the destination range. |
| |
| // Unsigned |
| HWY_API Vec1<uint8_t> SaturatedAdd(const Vec1<uint8_t> a, |
| const Vec1<uint8_t> b) { |
| return Vec1<uint8_t>( |
| static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255))); |
| } |
| HWY_API Vec1<uint16_t> SaturatedAdd(const Vec1<uint16_t> a, |
| const Vec1<uint16_t> b) { |
| return Vec1<uint16_t>(static_cast<uint16_t>( |
| HWY_MIN(HWY_MAX(0, static_cast<int32_t>(a.raw) + b.raw), 65535))); |
| } |
| |
| // Signed |
| HWY_API Vec1<int8_t> SaturatedAdd(const Vec1<int8_t> a, const Vec1<int8_t> b) { |
| return Vec1<int8_t>( |
| static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127))); |
| } |
| HWY_API Vec1<int16_t> SaturatedAdd(const Vec1<int16_t> a, |
| const Vec1<int16_t> b) { |
| return Vec1<int16_t>(static_cast<int16_t>( |
| HWY_MIN(HWY_MAX(-32768, static_cast<int32_t>(a.raw) + b.raw), 32767))); |
| } |
| |
| // ------------------------------ Saturating subtraction |
| |
| // Returns a - b clamped to the destination range. |
| |
| // Unsigned |
| HWY_API Vec1<uint8_t> SaturatedSub(const Vec1<uint8_t> a, |
| const Vec1<uint8_t> b) { |
| return Vec1<uint8_t>( |
| static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255))); |
| } |
| HWY_API Vec1<uint16_t> SaturatedSub(const Vec1<uint16_t> a, |
| const Vec1<uint16_t> b) { |
| return Vec1<uint16_t>(static_cast<uint16_t>( |
| HWY_MIN(HWY_MAX(0, static_cast<int32_t>(a.raw) - b.raw), 65535))); |
| } |
| |
| // Signed |
| HWY_API Vec1<int8_t> SaturatedSub(const Vec1<int8_t> a, const Vec1<int8_t> b) { |
| return Vec1<int8_t>( |
| static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127))); |
| } |
| HWY_API Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a, |
| const Vec1<int16_t> b) { |
| return Vec1<int16_t>(static_cast<int16_t>( |
| HWY_MIN(HWY_MAX(-32768, static_cast<int32_t>(a.raw) - b.raw), 32767))); |
| } |
| |
| // ------------------------------ Average |
| |
| // Returns (a + b + 1) / 2 |
| |
| #ifdef HWY_NATIVE_AVERAGE_ROUND_UI32 |
| #undef HWY_NATIVE_AVERAGE_ROUND_UI32 |
| #else |
| #define HWY_NATIVE_AVERAGE_ROUND_UI32 |
| #endif |
| |
| #ifdef HWY_NATIVE_AVERAGE_ROUND_UI64 |
| #undef HWY_NATIVE_AVERAGE_ROUND_UI64 |
| #else |
| #define HWY_NATIVE_AVERAGE_ROUND_UI64 |
| #endif |
| |
| template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> |
| HWY_API Vec1<T> AverageRound(const Vec1<T> a, const Vec1<T> b) { |
| const T a_val = a.raw; |
| const T b_val = b.raw; |
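| // Identity: (a | b) - ((a ^ b) >> 1) == (a + b + 1) / 2, evaluated without |
| // overflowing an intermediate a + b. |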
| return Vec1<T>(static_cast<T>((a_val | b_val) - ScalarShr(a_val ^ b_val, 1))); |
| } |
| |
| // ------------------------------ Absolute value |
| |
| template <typename T> |
| HWY_API Vec1<T> Abs(const Vec1<T> a) { |
| return Vec1<T>(ScalarAbs(a.raw)); |
| } |
| |
| // ------------------------------ Min/Max |
| |
| // <cmath> may be unavailable, so implement our own. |
| |
| template <typename T, HWY_IF_NOT_FLOAT(T)> |
| HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) { |
| return Vec1<T>(HWY_MIN(a.raw, b.raw)); |
| } |
| |
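| // For the floating-point Min/Max overloads: if exactly one input is NaN, the |
| // other (numeric) operand is returned. |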
| template <typename T, HWY_IF_FLOAT(T)> |
| HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) { |
| if (ScalarIsNaN(a.raw)) return b; |
| if (ScalarIsNaN(b.raw)) return a; |
| return Vec1<T>(HWY_MIN(a.raw, b.raw)); |
| } |
| |
| template <typename T, HWY_IF_NOT_FLOAT(T)> |
| HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) { |
| return Vec1<T>(HWY_MAX(a.raw, b.raw)); |
| } |
| |
| template <typename T, HWY_IF_FLOAT(T)> |
| HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) { |
| if (ScalarIsNaN(a.raw)) return b; |
| if (ScalarIsNaN(b.raw)) return a; |
| return Vec1<T>(HWY_MAX(a.raw, b.raw)); |
| } |
| |
| // ------------------------------ Floating-point negate |
| |
| template <typename T, HWY_IF_FLOAT_OR_SPECIAL(T)> |
| HWY_API Vec1<T> Neg(const Vec1<T> v) { |
| return Xor(v, SignBit(Sisd<T>())); |
| } |
| |
| template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> |
| HWY_API Vec1<T> Neg(const Vec1<T> v) { |
| return Zero(Sisd<T>()) - v; |
| } |
| |
| // ------------------------------ mul/div |
| |
| // Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*. |
| #ifdef HWY_NATIVE_MUL_8 |
| #undef HWY_NATIVE_MUL_8 |
| #else |
| #define HWY_NATIVE_MUL_8 |
| #endif |
| #ifdef HWY_NATIVE_MUL_64 |
| #undef HWY_NATIVE_MUL_64 |
| #else |
| #define HWY_NATIVE_MUL_64 |
| #endif |
| |
| template <typename T, HWY_IF_FLOAT(T)> |
| HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) { |
| return Vec1<T>(static_cast<T>(double{a.raw} * b.raw)); |
| } |
| |
| template <typename T, HWY_IF_NOT_FLOAT(T)> |
| HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) { |
| return Vec1<T>(static_cast<T>(static_cast<uint64_t>(a.raw) * |
| static_cast<uint64_t>(b.raw))); |
| } |
| |
| template <typename T, HWY_IF_FLOAT(T)> |
| HWY_API Vec1<T> operator/(const Vec1<T> a, const Vec1<T> b) { |
| return Vec1<T>(a.raw / b.raw); |
| } |
| |
| // Returns the upper sizeof(T)*8 bits of a * b in each lane. |
| template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)), |
| HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> |
| HWY_API Vec1<T> MulHigh(const Vec1<T> a, const Vec1<T> b) { |
| using TW = MakeWide<T>; |
| return Vec1<T>(static_cast<T>( |
| (static_cast<TW>(a.raw) * static_cast<TW>(b.raw)) >> (sizeof(T) * 8))); |
| } |
| template <class T, HWY_IF_UI64(T)> |
| HWY_API Vec1<T> MulHigh(const Vec1<T> a, const Vec1<T> b) { |
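| // Mul128 computes the full 128-bit product; the upper 64 bits are written to |
| // `hi`, which is the MulHigh result. |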
| T hi; |
| Mul128(a.raw, b.raw, &hi); |
| return Vec1<T>(hi); |
| } |
| |
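| // Rounded Q15 multiply: (a * b + 2^14) >> 15. |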
| HWY_API Vec1<int16_t> MulFixedPoint15(Vec1<int16_t> a, Vec1<int16_t> b) { |
| return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw + 16384) >> 15)); |
| } |
| |
| // Multiplies even lanes (0, 2 ..) and returns the double-wide result. |
| template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)), |
| HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> |
| HWY_API Vec1<MakeWide<T>> MulEven(const Vec1<T> a, const Vec1<T> b) { |
| using TW = MakeWide<T>; |
| const TW a_wide = a.raw; |
| return Vec1<TW>(static_cast<TW>(a_wide * b.raw)); |
| } |
| |
| template <class T> |
| HWY_API Vec1<MakeWide<T>> MulOdd(const Vec1<T>, const Vec1<T>) { |
| static_assert(sizeof(T) == 0, "There are no odd lanes"); |
| } |
| |
| // Approximate reciprocal |
| HWY_API Vec1<float> ApproximateReciprocal(const Vec1<float> v) { |
| // Zero inputs are allowed, but callers are responsible for replacing the |
| // return value with something else (typically using IfThenElse). This check |
| // avoids a ubsan error. The return value is arbitrary. |
| if (v.raw == 0.0f) return Vec1<float>(0.0f); |
| return Vec1<float>(1.0f / v.raw); |
| } |
| |
| // generic_ops takes care of integer T. |
| template <typename T, HWY_IF_FLOAT(T)> |
| HWY_API Vec1<T> AbsDiff(const Vec1<T> a, const Vec1<T> b) { |
| return Abs(a - b); |
| } |
| |
| // ------------------------------ Floating-point multiply-add variants |
| |
| template <typename T, HWY_IF_FLOAT(T)> |
| HWY_API Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> add) { |
| return mul * x + add; |
| } |
| |
| template <typename T, HWY_IF_FLOAT(T)> |
| HWY_API Vec1<T> NegMulAdd(const Vec1<T> mul, const Vec1<T> x, |
| const Vec1<T> add) { |
| return add - mul * x; |
| } |
| |
| template <typename T, HWY_IF_FLOAT(T)> |
| HWY_API Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> sub) { |
| return mul * x - sub; |
| } |
| |
| template <typename T, HWY_IF_FLOAT(T)> |
| HWY_API Vec1<T> NegMulSub(const Vec1<T> mul, const Vec1<T> x, |
| const Vec1<T> sub) { |
| return Neg(mul) * x - sub; |
| } |
| |
| // ------------------------------ Floating-point square root |
| |
| // Approximate reciprocal square root |
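| // Classic bit-trick initial guess (magic constant 0x5F3759DF) refined by one |
| // Newton-Raphson step; relative error is on the order of 0.2%. |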
| HWY_API Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) { |
| float f = v.raw; |
| const float half = f * 0.5f; |
| uint32_t bits; |
| CopySameSize(&f, &bits); |
| // Initial guess based on log2(f) |
| bits = 0x5F3759DF - (bits >> 1); |
| CopySameSize(&bits, &f); |
| // One Newton-Raphson iteration |
| return Vec1<float>(f * (1.5f - (half * f * f))); |
| } |
| |
| // Square root |
| HWY_API Vec1<float> Sqrt(Vec1<float> v) { |
| #if defined(HWY_NO_LIBCXX) |
| #if HWY_COMPILER_GCC_ACTUAL |
| return Vec1<float>(__builtin_sqrt(v.raw)); |
| #else |
| uint32_t bits; |
| CopyBytes<sizeof(bits)>(&v, &bits); |
| // Coarse approximation, letting the exponent LSB leak into the mantissa |
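| // bits >> 1 halves the biased exponent; (1 << 29) - (1 << 22) equals |
| // (127 << 23) / 2, which restores half of the exponent bias. |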
| bits = (1 << 29) + (bits >> 1) - (1 << 22); |
| CopyBytes<sizeof(bits)>(&bits, &v); |
| return v; |
| #endif // !HWY_COMPILER_GCC_ACTUAL |
| #else |
| return Vec1<float>(sqrtf(v.raw)); |
| #endif // !HWY_NO_LIBCXX |
| } |
| HWY_API Vec1<double> Sqrt(Vec1<double> v) { |
| #if defined(HWY_NO_LIBCXX) |
| #if HWY_COMPILER_GCC_ACTUAL |
| return Vec1<double>(__builtin_sqrt(v.raw)); |
| #else |
| uint64_t bits; |
| CopyBytes<sizeof(bits)>(&v, &bits); |
| // Coarse approximation, letting the exponent LSB leak into the mantissa |
| bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51); |
| CopyBytes<sizeof(bits)>(&bits, &v); |
| return v; |
| #endif // !HWY_COMPILER_GCC_ACTUAL |
| #else |
| return Vec1<double>(sqrt(v.raw)); |
| #endif // HWY_NO_LIBCXX |
| } |
| |
| // ------------------------------ Floating-point rounding |
| |
| template <typename T> |
| HWY_API Vec1<T> Round(const Vec1<T> v) { |
| using TI = MakeSigned<T>; |
| if (!(Abs(v).raw < MantissaEnd<T>())) { // Huge or NaN |
| return v; |
| } |
| const T k0 = ConvertScalarTo<T>(0); |
| const T bias = ConvertScalarTo<T>(v.raw < k0 ? -0.5 : 0.5); |
| const TI rounded = ConvertScalarTo<TI>(v.raw + bias); |
| if (rounded == 0) return CopySignToAbs(Vec1<T>(k0), v); |
| TI offset = 0; |
| // Round to even |
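| // (e.g. v = 2.5: bias gives rounded = 3, which is odd and exactly 0.5 away |
| // from v, so offset = 1 and the result is 2) |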
| if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) == |
| ConvertScalarTo<T>(0.5)) { |
| offset = v.raw < k0 ? -1 : 1; |
| } |
| return Vec1<T>(ConvertScalarTo<T>(rounded - offset)); |
| } |
| |
| // Round-to-nearest even. |
| template <class T, HWY_IF_FLOAT3264(T)> |
| HWY_API Vec1<MakeSigned<T>> NearestInt(const Vec1<T> v) { |
| using TI = MakeSigned<T>; |
| |
| const T abs = Abs(v).raw; |
| const bool is_sign = ScalarSignBit(v.raw); |
| |
| if (!(abs < MantissaEnd<T>())) { // Huge or NaN |
| // Check if too large to cast or NaN |
| if (!(abs <= ConvertScalarTo<T>(LimitsMax<TI>()))) { |
| return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>()); |
| } |
| return Vec1<TI>(ConvertScalarTo<TI>(v.raw)); |
| } |
| const T bias = |
| ConvertScalarTo<T>(v.raw < ConvertScalarTo<T>(0.0) ? -0.5 : 0.5); |
| const TI rounded = ConvertScalarTo<TI>(v.raw + bias); |
| if (rounded == 0) return Vec1<TI>(0); |
| TI offset = 0; |
| // Round to even |
| if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) == |
| ConvertScalarTo<T>(0.5)) { |
| offset = is_sign ? -1 : 1; |
| } |
| return Vec1<TI>(rounded - offset); |
| } |
| |
| // Round-to-nearest even. |
| template <class DI32, HWY_IF_I32_D(DI32)> |
| HWY_API VFromD<DI32> DemoteToNearestInt(DI32 /*di32*/, const Vec1<double> v) { |
| using T = double; |
| using TI = int32_t; |
| |
| const T abs = Abs(v).raw; |
| const bool is_sign = ScalarSignBit(v.raw); |
| |
| // Check if too large to cast or NaN |
| if (!(abs <= ConvertScalarTo<T>(LimitsMax<TI>()))) { |
| return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>()); |
| } |
| |
| const T bias = |
| ConvertScalarTo<T>(v.raw < ConvertScalarTo<T>(0.0) ? -0.5 : 0.5); |
| const TI rounded = ConvertScalarTo<TI>(v.raw + bias); |
| if (rounded == 0) return Vec1<TI>(0); |
| TI offset = 0; |
| // Round to even |
| if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) == |
| ConvertScalarTo<T>(0.5)) { |
| offset = is_sign ? -1 : 1; |
| } |
| return Vec1<TI>(rounded - offset); |
| } |
| |
| template <typename T> |
| HWY_API Vec1<T> Trunc(const Vec1<T> v) { |
| using TI = MakeSigned<T>; |
| if (!(Abs(v).raw <= MantissaEnd<T>())) { // Huge or NaN |
| return v; |
| } |
| const TI truncated = ConvertScalarTo<TI>(v.raw); |
| if (truncated == 0) return CopySignToAbs(Vec1<T>(0), v); |
| return Vec1<T>(ConvertScalarTo<T>(truncated)); |
| } |
| |
| template <typename Float, typename Bits, int kMantissaBits, int kExponentBits, |
| class V> |
| V Ceiling(const V v) { |
| const Bits kExponentMask = (1ull << kExponentBits) - 1; |
| const Bits kMantissaMask = (1ull << kMantissaBits) - 1; |
| const Bits kBias = kExponentMask / 2; |
| |
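| // Work on the IEEE-754 representation: values whose unbiased exponent is at |
| // least kMantissaBits are already integers; otherwise the low |
| // (kMantissaBits - exponent) mantissa bits hold the fractional part. |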
| Float f = v.raw; |
| const bool positive = f > Float(0.0); |
| |
| Bits bits; |
| CopySameSize(&v, &bits); |
| |
| const int exponent = |
| static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias); |
| // Already an integer. |
| if (exponent >= kMantissaBits) return v; |
| // |v| < 1 => 0 or 1. |
| if (exponent < 0) return positive ? V(1) : V(-0.0); |
| |
| const Bits mantissa_mask = kMantissaMask >> exponent; |
| // Already an integer |
| if ((bits & mantissa_mask) == 0) return v; |
| |
| // Clear fractional bits and round up |
| if (positive) bits += (kMantissaMask + 1) >> exponent; |
| bits &= ~mantissa_mask; |
| |
| CopySameSize(&bits, &f); |
| return V(f); |
| } |
| |
| template <typename Float, typename Bits, int kMantissaBits, int kExponentBits, |
| class V> |
| V Floor(const V v) { |
| const Bits kExponentMask = (1ull << kExponentBits) - 1; |
| const Bits kMantissaMask = (1ull << kMantissaBits) - 1; |
| const Bits kBias = kExponentMask / 2; |
| |
| Float f = v.raw; |
| const bool negative = f < Float(0.0); |
| |
| Bits bits; |
| CopySameSize(&v, &bits); |
| |
| const int exponent = |
| static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias); |
| // Already an integer. |
| if (exponent >= kMantissaBits) return v; |
| // |v| < 1 => -1 or 0. |
| if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0)); |
| |
| const Bits mantissa_mask = kMantissaMask >> exponent; |
| // Already an integer |
| if ((bits & mantissa_mask) == 0) return v; |
| |
| // Clear fractional bits and round down |
| if (negative) bits += (kMantissaMask + 1) >> exponent; |
| bits &= ~mantissa_mask; |
| |
| CopySameSize(&bits, &f); |
| return V(f); |
| } |
| |
| // Toward +infinity, aka ceiling |
| HWY_API Vec1<float> Ceil(const Vec1<float> v) { |
| return Ceiling<float, uint32_t, 23, 8>(v); |
| } |
| HWY_API Vec1<double> Ceil(const Vec1<double> v) { |
| return Ceiling<double, uint64_t, 52, 11>(v); |
| } |
| |
| // Toward -infinity, aka floor |
| HWY_API Vec1<float> Floor(const Vec1<float> v) { |
| return Floor<float, uint32_t, 23, 8>(v); |
| } |
| HWY_API Vec1<double> Floor(const Vec1<double> v) { |
| return Floor<double, uint64_t, 52, 11>(v); |
| } |
| |
| // ================================================== COMPARE |
| |
| template <typename T> |
| HWY_API Mask1<T> operator==(const Vec1<T> a, const Vec1<T> b) { |
| return Mask1<T>::FromBool(a.raw == b.raw); |
| } |
| |
| template <typename T> |
| HWY_API Mask1<T> operator!=(const Vec1<T> a, const Vec1<T> b) { |
| return Mask1<T>::FromBool(a.raw != b.raw); |
| } |
| |
| template <typename T> |
| HWY_API Mask1<T> TestBit(const Vec1<T> v, const Vec1<T> bit) { |
| static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); |
| return (v & bit) == bit; |
| } |
| |
| template <typename T> |
| HWY_API Mask1<T> operator<(const Vec1<T> a, const Vec1<T> b) { |
| return Mask1<T>::FromBool(a.raw < b.raw); |
| } |
| template <typename T> |
| HWY_API Mask1<T> operator>(const Vec1<T> a, const Vec1<T> b) { |
| return Mask1<T>::FromBool(a.raw > b.raw); |
| } |
| |
| template <typename T> |
| HWY_API Mask1<T> operator<=(const Vec1<T> a, const Vec1<T> b) { |
| return Mask1<T>::FromBool(a.raw <= b.raw); |
| } |
| template <typename T> |
| HWY_API Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) { |
| return Mask1<T>::FromBool(a.raw >= b.raw); |
| } |
| |
| // ------------------------------ Floating-point classification (==) |
| |
| template <typename T> |
| HWY_API Mask1<T> IsNaN(const Vec1<T> v) { |
| // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY. |
| return Mask1<T>::FromBool(ScalarIsNaN(v.raw)); |
| } |
| |
| // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite. |
| #ifdef HWY_NATIVE_ISINF |
| #undef HWY_NATIVE_ISINF |
| #else |
| #define HWY_NATIVE_ISINF |
| #endif |
| |
| HWY_API Mask1<float> IsInf(const Vec1<float> v) { |
| const Sisd<float> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const Vec1<uint32_t> vu = BitCast(du, v); |
| // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. |
| return RebindMask(d, (vu + vu) == Set(du, 0xFF000000u)); |
| } |
| HWY_API Mask1<double> IsInf(const Vec1<double> v) { |
| const Sisd<double> d; |
| const RebindToUnsigned<decltype(d)> du; |
| const Vec1<uint64_t> vu = BitCast(du, v); |
| // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. |
| return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull)); |
| } |
| |
| HWY_API Mask1<float> IsFinite(const Vec1<float> v) { |
| const Vec1<uint32_t> vu = BitCast(Sisd<uint32_t>(), v); |
| // Shift left to clear the sign bit, check whether exponent != max value. |
| return Mask1<float>::FromBool((vu.raw << 1) < 0xFF000000u); |
| } |
| HWY_API Mask1<double> IsFinite(const Vec1<double> v) { |
| const Vec1<uint64_t> vu = BitCast(Sisd<uint64_t>(), v); |
| // Shift left to clear the sign bit, check whether exponent != max value. |
| return Mask1<double>::FromBool((vu.raw << 1) < 0xFFE0000000000000ull); |
| } |
| |
| // ================================================== MEMORY |
| |
| // ------------------------------ Load |
| |
| template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> |
| HWY_API Vec1<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) { |
| T t; |
| CopySameSize(aligned, &t); |
| return Vec1<T>(t); |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec1<T> MaskedLoad(Mask1<T> m, D d, const T* HWY_RESTRICT aligned) { |
| return IfThenElseZero(m, Load(d, aligned)); |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec1<T> MaskedLoadOr(Vec1<T> v, Mask1<T> m, D d, |
| const T* HWY_RESTRICT aligned) { |
| return IfThenElse(m, Load(d, aligned), v); |
| } |
| |
| template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> |
| HWY_API Vec1<T> LoadU(D d, const T* HWY_RESTRICT p) { |
| return Load(d, p); |
| } |
| |
| // In some use cases, "load single lane" is sufficient; otherwise avoid this. |
| template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> |
| HWY_API Vec1<T> LoadDup128(D d, const T* HWY_RESTRICT aligned) { |
| return Load(d, aligned); |
| } |
| |
| #ifdef HWY_NATIVE_LOAD_N |
| #undef HWY_NATIVE_LOAD_N |
| #else |
| #define HWY_NATIVE_LOAD_N |
| #endif |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API VFromD<D> LoadN(D d, const T* HWY_RESTRICT p, |
| size_t max_lanes_to_load) { |
| return (max_lanes_to_load > 0) ? Load(d, p) : Zero(d); |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p, |
| size_t max_lanes_to_load) { |
| return (max_lanes_to_load > 0) ? Load(d, p) : no; |
| } |
| |
| // ------------------------------ Store |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API void Store(const Vec1<T> v, D /* tag */, T* HWY_RESTRICT aligned) { |
| CopySameSize(&v.raw, aligned); |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API void StoreU(const Vec1<T> v, D d, T* HWY_RESTRICT p) { |
| return Store(v, d, p); |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API void BlendedStore(const Vec1<T> v, Mask1<T> m, D d, T* HWY_RESTRICT p) { |
| if (!m.bits) return; |
| StoreU(v, d, p); |
| } |
| |
| #ifdef HWY_NATIVE_STORE_N |
| #undef HWY_NATIVE_STORE_N |
| #else |
| #define HWY_NATIVE_STORE_N |
| #endif |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p, |
| size_t max_lanes_to_store) { |
| if (max_lanes_to_store > 0) { |
| Store(v, d, p); |
| } |
| } |
| |
| // ------------------------------ Tuples |
| #include "third_party/highway/hwy/ops/inside-inl.h" |
| |
| // ------------------------------ LoadInterleaved2/3/4 |
| |
| // Per-target flag to prevent generic_ops-inl.h from defining the |
| // LoadInterleaved2/3/4 and StoreInterleaved2/3/4 fallbacks. |
| #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED |
| #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED |
| #else |
| #define HWY_NATIVE_LOAD_STORE_INTERLEAVED |
| #endif |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0, |
| Vec1<T>& v1) { |
| v0 = LoadU(d, unaligned + 0); |
| v1 = LoadU(d, unaligned + 1); |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0, |
| Vec1<T>& v1, Vec1<T>& v2) { |
| v0 = LoadU(d, unaligned + 0); |
| v1 = LoadU(d, unaligned + 1); |
| v2 = LoadU(d, unaligned + 2); |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0, |
| Vec1<T>& v1, Vec1<T>& v2, Vec1<T>& v3) { |
| v0 = LoadU(d, unaligned + 0); |
| v1 = LoadU(d, unaligned + 1); |
| v2 = LoadU(d, unaligned + 2); |
| v3 = LoadU(d, unaligned + 3); |
| } |
| |
| // ------------------------------ StoreInterleaved2/3/4 |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API void StoreInterleaved2(const Vec1<T> v0, const Vec1<T> v1, D d, |
| T* HWY_RESTRICT unaligned) { |
| StoreU(v0, d, unaligned + 0); |
| StoreU(v1, d, unaligned + 1); |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API void StoreInterleaved3(const Vec1<T> v0, const Vec1<T> v1, |
| const Vec1<T> v2, D d, |
| T* HWY_RESTRICT unaligned) { |
| StoreU(v0, d, unaligned + 0); |
| StoreU(v1, d, unaligned + 1); |
| StoreU(v2, d, unaligned + 2); |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API void StoreInterleaved4(const Vec1<T> v0, const Vec1<T> v1, |
| const Vec1<T> v2, const Vec1<T> v3, D d, |
| T* HWY_RESTRICT unaligned) { |
| StoreU(v0, d, unaligned + 0); |
| StoreU(v1, d, unaligned + 1); |
| StoreU(v2, d, unaligned + 2); |
| StoreU(v3, d, unaligned + 3); |
| } |
| |
| // ------------------------------ Stream |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API void Stream(const Vec1<T> v, D d, T* HWY_RESTRICT aligned) { |
| return Store(v, d, aligned); |
| } |
| |
| // ------------------------------ Scatter |
| |
| #ifdef HWY_NATIVE_SCATTER |
| #undef HWY_NATIVE_SCATTER |
| #else |
| #define HWY_NATIVE_SCATTER |
| #endif |
| |
| template <class D, typename T = TFromD<D>, typename TI> |
| HWY_API void ScatterOffset(Vec1<T> v, D d, T* base, Vec1<TI> offset) { |
| static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); |
| const intptr_t addr = |
| reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw); |
| Store(v, d, reinterpret_cast<T*>(addr)); |
| } |
| |
| template <class D, typename T = TFromD<D>, typename TI> |
| HWY_API void ScatterIndex(Vec1<T> v, D d, T* HWY_RESTRICT base, |
| Vec1<TI> index) { |
| static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); |
| Store(v, d, base + index.raw); |
| } |
| |
| template <class D, typename T = TFromD<D>, typename TI> |
| HWY_API void MaskedScatterIndex(Vec1<T> v, Mask1<T> m, D d, |
| T* HWY_RESTRICT base, Vec1<TI> index) { |
| static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); |
| if (m.bits) Store(v, d, base + index.raw); |
| } |
| |
| // ------------------------------ Gather |
| |
| #ifdef HWY_NATIVE_GATHER |
| #undef HWY_NATIVE_GATHER |
| #else |
| #define HWY_NATIVE_GATHER |
| #endif |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec1<T> GatherOffset(D d, const T* base, Vec1<MakeSigned<T>> offset) { |
| HWY_DASSERT(offset.raw >= 0); |
| const intptr_t addr = |
| reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw); |
| return Load(d, reinterpret_cast<const T*>(addr)); |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec1<T> GatherIndex(D d, const T* HWY_RESTRICT base, |
| Vec1<MakeSigned<T>> index) { |
| HWY_DASSERT(index.raw >= 0); |
| return Load(d, base + index.raw); |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec1<T> MaskedGatherIndex(Mask1<T> m, D d, const T* HWY_RESTRICT base, |
| Vec1<MakeSigned<T>> index) { |
| HWY_DASSERT(index.raw >= 0); |
| return MaskedLoad(m, d, base + index.raw); |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec1<T> MaskedGatherIndexOr(Vec1<T> no, Mask1<T> m, D d, |
| const T* HWY_RESTRICT base, |
| Vec1<MakeSigned<T>> index) { |
| HWY_DASSERT(index.raw >= 0); |
| return MaskedLoadOr(no, m, d, base + index.raw); |
| } |
| |
| // ================================================== CONVERT |
| |
| // ConvertTo and DemoteTo with floating-point input and integer output truncate |
| // (rounding toward zero). |
| |
| namespace detail { |
| |
| template <class ToT, class FromT> |
| HWY_INLINE ToT CastValueForF2IConv(FromT val) { |
| // Prevent ubsan errors when converting float to narrower integer |
| |
| using FromTU = MakeUnsigned<FromT>; |
| using ToTU = MakeUnsigned<ToT>; |
| |
| constexpr unsigned kMaxExpField = |
| static_cast<unsigned>(MaxExponentField<FromT>()); |
| constexpr unsigned kExpBias = kMaxExpField >> 1; |
| constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN( |
| kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()), |
| kMaxExpField)); |
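| // Example: FromT = float, ToT = int32_t gives kExpBias = 127 and |
| // kMinOutOfRangeExpField = 127 + 31 = 158, i.e. any |val| >= 2^31 saturates. |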
| |
| // If ToT is signed, compare only the exponent bits of val against |
| // kMinOutOfRangeExpField. |
| // |
| // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of |
| // val against kMinOutOfRangeExpField as a negative value is outside of the |
| // range of an unsigned integer type. |
| const FromT val_to_compare = |
| static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val); |
| |
| // val is within the range of ToT if |
| // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less |
| // than kMinOutOfRangeExpField |
| // |
| // Otherwise, val is either outside of the range of ToT or equal to |
| // LimitsMin<ToT>() if |
| // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater |
| // than or equal to kMinOutOfRangeExpField. |
| |
| return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >> |
| MantissaBits<FromT>()) < kMinOutOfRangeExpField) |
| ? static_cast<ToT>(val) |
| : static_cast<ToT>(static_cast<ToTU>(LimitsMax<ToT>()) + |
| static_cast<ToTU>(ScalarSignBit(val))); |
| } |
| |
| template <class ToT, class ToTypeTag, class FromT> |
| HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) { |
| return ConvertScalarTo<ToT>(val); |
| } |
| |
| template <class ToT> |
| HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag /*to_type_tag*/, |
| float val) { |
| return CastValueForF2IConv<ToT>(val); |
| } |
| |
| template <class ToT> |
| HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/, |
| float val) { |
| return CastValueForF2IConv<ToT>(val); |
| } |
| |
| // If val is within the range of ToT, CastValueForInRangeF2IConv<ToT>(val) |
| // returns static_cast<ToT>(val) |
| // |
| // Otherwise (val is not within the range of ToT), the result of |
| // CastValueForInRangeF2IConv<ToT>(val) is implementation-defined. |
| template <class ToT, class FromT> |
| HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val) { |
| // Prevent ubsan errors when converting float to narrower integer |
| |
| using FromTU = MakeUnsigned<FromT>; |
| |
| constexpr unsigned kMaxExpField = |
| static_cast<unsigned>(MaxExponentField<FromT>()); |
| constexpr unsigned kExpBias = kMaxExpField >> 1; |
| constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN( |
| kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()), |
| kMaxExpField)); |
| |
| // If ToT is signed, compare only the exponent bits of val against |
| // kMinOutOfRangeExpField. |
| // |
| // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of |
| // val against kMinOutOfRangeExpField as a negative value is outside of the |
| // range of an unsigned integer type. |
| const FromT val_to_compare = |
| static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val); |
| |
| // val is within the range of ToT if |
| // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less |
| // than kMinOutOfRangeExpField |
| // |
| // Otherwise, val is either outside of the range of ToT or equal to |
| // LimitsMin<ToT>() if |
| // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater |
| // than or equal to kMinOutOfRangeExpField. |
| |
| return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >> |
| MantissaBits<FromT>()) < kMinOutOfRangeExpField) |
| ? static_cast<ToT>(val) |
| : static_cast<ToT>(LimitsMin<ToT>()); |
| } |
| |
| } // namespace detail |
| |
| #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64 |
| #undef HWY_NATIVE_PROMOTE_F16_TO_F64 |
| #else |
| #define HWY_NATIVE_PROMOTE_F16_TO_F64 |
| #endif |
| |
| template <class DTo, typename TTo = TFromD<DTo>, typename TFrom> |
| HWY_API Vec1<TTo> PromoteTo(DTo /* tag */, Vec1<TFrom> from) { |
| static_assert(sizeof(TTo) > sizeof(TFrom), "Not promoting"); |
| // For bits Y > X, floatX->floatY and intX->intY are always representable. |
| return Vec1<TTo>( |
| detail::CastValueForPromoteTo<TTo>(hwy::TypeTag<TTo>(), from.raw)); |
| } |
| |
| #ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO |
| #undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO |
| #else |
| #define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO |
| #endif |
| |
| template <class DTo, HWY_IF_UI64_D(DTo)> |
| HWY_API VFromD<DTo> PromoteInRangeTo(DTo /* tag */, Vec1<float> from) { |
| using TTo = TFromD<DTo>; |
| return Vec1<TTo>(detail::CastValueForInRangeF2IConv<TTo>(from.raw)); |
| } |
| |
| // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here, |
| // so we overload for TFrom=double and TTo={float,int32_t}. |
| template <class D, HWY_IF_F32_D(D)> |
| HWY_API Vec1<float> DemoteTo(D /* tag */, Vec1<double> from) { |
| // Prevent ubsan errors when converting double to the narrower float |
| if (IsInf(from).bits || |
| Abs(from).raw > static_cast<double>(HighestValue<float>())) { |
| return Vec1<float>(ScalarSignBit(from.raw) ? LowestValue<float>() |
| : HighestValue<float>()); |
| } |
| return Vec1<float>(static_cast<float>(from.raw)); |
| } |
| template <class D, HWY_IF_UI32_D(D)> |
| HWY_API VFromD<D> DemoteTo(D /* tag */, Vec1<double> from) { |
| // Prevent ubsan errors when converting double to int32_t/uint32_t |
| return Vec1<TFromD<D>>(detail::CastValueForF2IConv<TFromD<D>>(from.raw)); |
| } |
| |
| template <class DTo, typename TTo = TFromD<DTo>, typename TFrom, |
| HWY_IF_SIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DTo>)> |
| HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) { |
| static_assert(!IsFloat<TFrom>(), "TFrom=double is handled above"); |
| static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting"); |
| |
| // Int to int: choose closest value in TTo to `from` (avoids UB) |
| from.raw = HWY_MIN(HWY_MAX(LimitsMin<TTo>(), from.raw), LimitsMax<TTo>()); |
| return Vec1<TTo>(static_cast<TTo>(from.raw)); |
| } |
| |
| // Disable the default unsigned-to-signed DemoteTo implementation in |
| // generic_ops-inl.h on SCALAR: this target provides its own unsigned-to-signed |
| // DemoteTo below, and ReorderDemote2To is not supported on SCALAR. |
| |
| // NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of |
| // hwy::EnableIf<false>* = nullptr to avoid compiler errors. Although |
| // !hwy::IsSame<V, V>() is always false, it depends on the template argument V, |
| // so it triggers SFINAE instead of a hard error. |
| #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V |
| #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \ |
| hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr |
| |
| template <class DTo, typename TTo = TFromD<DTo>, typename TFrom, |
| HWY_IF_UNSIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DTo)> |
| HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) { |
| static_assert(!IsFloat<TFrom>(), "TFrom=double is handled above"); |
| static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting"); |
| |
| const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>()); |
| |
| // Int to int: choose closest value in TTo to `from` (avoids UB) |
| return Vec1<TTo>(static_cast<TTo>(HWY_MIN(from.raw, max))); |
| } |
| |
| template <class DTo, typename TTo = TFromD<DTo>, typename TFrom, |
| HWY_IF_UI64(TFrom), HWY_IF_F32_D(DTo)> |
| HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) { |
| // int64_t/uint64_t to float: simply cast to TTo |
| return Vec1<TTo>(static_cast<TTo>(from.raw)); |
| } |
| |
| #ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO |
| #undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO |
| #else |
| #define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO |
| #endif |
| |
| template <class D32, HWY_IF_UI32_D(D32)> |
| HWY_API VFromD<D32> DemoteInRangeTo(D32 /*d32*/, |
| VFromD<Rebind<double, D32>> v) { |
| using TTo = TFromD<D32>; |
| return Vec1<TTo>(detail::CastValueForInRangeF2IConv<TTo>(v.raw)); |
| } |
| |
| // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions; |
| // use this scalar version to verify the vector implementation. |
| #ifdef HWY_NATIVE_F16C |
| #undef HWY_NATIVE_F16C |
| #else |
| #define HWY_NATIVE_F16C |
| #endif |
| |
| template <class D, HWY_IF_F32_D(D)> |
| HWY_API Vec1<float> PromoteTo(D /* tag */, const Vec1<float16_t> v) { |
| return Vec1<float>(F32FromF16(v.raw)); |
| } |
| |
| template <class D, HWY_IF_F32_D(D)> |
| HWY_API Vec1<float> PromoteTo(D d, const Vec1<bfloat16_t> v) { |
| return Set(d, F32FromBF16(v.raw)); |
| } |
| |
| template <class DTo, typename TFrom> |
| HWY_API VFromD<DTo> PromoteEvenTo(DTo d_to, Vec1<TFrom> v) { |
| return PromoteTo(d_to, v); |
| } |
| |
| template <class D, HWY_IF_F16_D(D)> |
| HWY_API Vec1<float16_t> DemoteTo(D /* tag */, const Vec1<float> v) { |
| return Vec1<float16_t>(F16FromF32(v.raw)); |
| } |
| |
| #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16 |
| #undef HWY_NATIVE_DEMOTE_F32_TO_BF16 |
| #else |
| #define HWY_NATIVE_DEMOTE_F32_TO_BF16 |
| #endif |
| |
| template <class D, HWY_IF_BF16_D(D)> |
| HWY_API Vec1<bfloat16_t> DemoteTo(D d, const Vec1<float> v) { |
| return Set(d, BF16FromF32(v.raw)); |
| } |
| |
| template <class DTo, typename TTo = TFromD<DTo>, typename TFrom, |
| HWY_IF_FLOAT(TFrom)> |
| HWY_API Vec1<TTo> ConvertTo(DTo /* tag */, Vec1<TFrom> from) { |
| static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size"); |
| // float## -> int##: truncate toward zero; out-of-range values saturate to |
| // the closest representable value. |
| return Vec1<TTo>(detail::CastValueForF2IConv<TTo>(from.raw)); |
| } |
| |
| template <class DTo, typename TTo = TFromD<DTo>, typename TFrom, |
| HWY_IF_NOT_FLOAT(TFrom)> |
| HWY_API Vec1<TTo> ConvertTo(DTo /* tag */, Vec1<TFrom> from) { |
| static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size"); |
| // int## -> float##: no check needed |
| return Vec1<TTo>(static_cast<TTo>(from.raw)); |
| } |
| |
| #ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO |
| #undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO |
| #else |
| #define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO |
| #endif |
| |
| template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI), |
| HWY_IF_T_SIZE_ONE_OF_D(DI, (1 << 4) | (1 << 8))> |
| HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) { |
| using TTo = TFromD<DI>; |
| return VFromD<DI>(detail::CastValueForInRangeF2IConv<TTo>(v.raw)); |
| } |
| |
| HWY_API Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) { |
| return DemoteTo(Sisd<uint8_t>(), v); |
| } |
| |
| // ------------------------------ TruncateTo |
| |
| template <class D, HWY_IF_U8_D(D)> |
| HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) { |
| return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)}; |
| } |
| |
| template <class D, HWY_IF_U16_D(D)> |
| HWY_API Vec1<uint16_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) { |
| return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)}; |
| } |
| |
| template <class D, HWY_IF_U32_D(D)> |
| HWY_API Vec1<uint32_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) { |
| return Vec1<uint32_t>{static_cast<uint32_t>(v.raw & 0xFFFFFFFFu)}; |
| } |
| |
| template <class D, HWY_IF_U8_D(D)> |
| HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint32_t> v) { |
| return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)}; |
| } |
| |
| template <class D, HWY_IF_U16_D(D)> |
| HWY_API Vec1<uint16_t> TruncateTo(D /* tag */, Vec1<uint32_t> v) { |
| return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)}; |
| } |
| |
| template <class D, HWY_IF_U8_D(D)> |
| HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint16_t> v) { |
| return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)}; |
| } |
| |
| // ================================================== COMBINE |
| // UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported. |
| |
| template <typename T> |
| HWY_API Vec1<T> LowerHalf(Vec1<T> v) { |
| return v; |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec1<T> LowerHalf(D /* tag */, Vec1<T> v) { |
| return v; |
| } |
| |
| // ================================================== SWIZZLE |
| |
| template <typename T> |
| HWY_API T GetLane(const Vec1<T> v) { |
| return v.raw; |
| } |
| |
| template <typename T> |
| HWY_API T ExtractLane(const Vec1<T> v, size_t i) { |
| HWY_DASSERT(i == 0); |
| (void)i; |
| return v.raw; |
| } |
| |
| template <typename T> |
| HWY_API Vec1<T> InsertLane(Vec1<T> v, size_t i, T t) { |
| HWY_DASSERT(i == 0); |
| (void)i; |
| v.raw = t; |
| return v; |
| } |
| |
| template <typename T> |
| HWY_API Vec1<T> DupEven(Vec1<T> v) { |
| return v; |
| } |
| // DupOdd is unsupported. |
| |
| template <typename T> |
| HWY_API Vec1<T> OddEven(Vec1<T> /* odd */, Vec1<T> even) { |
| return even; |
| } |
| |
| template <typename T> |
| HWY_API Vec1<T> OddEvenBlocks(Vec1<T> /* odd */, Vec1<T> even) { |
| return even; |
| } |
| |
| // ------------------------------ SwapAdjacentBlocks |
| template <typename T> |
| HWY_API Vec1<T> SwapAdjacentBlocks(Vec1<T> v) { |
| return v; |
| } |
| |
| // ------------------------------ InterleaveEvenBlocks |
| template <class D, class V = VFromD<D>> |
| HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) { |
| return a; |
| } |
| // ------------------------------ InterleaveOddBlocks |
| template <class D, class V = VFromD<D>> |
| HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) { |
| return a; |
| } |
| |
| // ------------------------------ TableLookupLanes |
| |
| // Returned by SetTableIndices for use by TableLookupLanes. |
| template <typename T> |
| struct Indices1 { |
| MakeSigned<T> raw; |
| }; |
| |
| template <class D, typename T = TFromD<D>, typename TI> |
| HWY_API Indices1<T> IndicesFromVec(D, Vec1<TI> vec) { |
| static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size"); |
| HWY_DASSERT(vec.raw <= 1); |
| return Indices1<T>{static_cast<MakeSigned<T>>(vec.raw)}; |
| } |
| |
| template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename TI> |
| HWY_API Indices1<T> SetTableIndices(D d, const TI* idx) { |
| return IndicesFromVec(d, LoadU(Sisd<TI>(), idx)); |
| } |
| |
| template <typename T> |
| HWY_API Vec1<T> TableLookupLanes(const Vec1<T> v, const Indices1<T> /* idx */) { |
| return v; |
| } |
| |
| template <typename T> |
| HWY_API Vec1<T> TwoTablesLookupLanes(const Vec1<T> a, const Vec1<T> b, |
| const Indices1<T> idx) { |
| return (idx.raw == 0) ? a : b; |
| } |
| |
| // ------------------------------ ReverseBlocks |
| |
| // Single block: no change |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec1<T> ReverseBlocks(D /* tag */, const Vec1<T> v) { |
| return v; |
| } |
| |
| // ------------------------------ Reverse |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec1<T> Reverse(D /* tag */, const Vec1<T> v) { |
| return v; |
| } |
| |
| // Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. |
| #ifdef HWY_NATIVE_REVERSE2_8 |
| #undef HWY_NATIVE_REVERSE2_8 |
| #else |
| #define HWY_NATIVE_REVERSE2_8 |
| #endif |
| |
// Must not be called: Reverse2/4/8 require at least 2/4/8 lanes, but Vec1 has
// only one.
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec1<T> Reverse2(D /* tag */, const Vec1<T> v) { |
| return v; |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec1<T> Reverse4(D /* tag */, const Vec1<T> v) { |
| return v; |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API Vec1<T> Reverse8(D /* tag */, const Vec1<T> v) { |
| return v; |
| } |
| |
| // ------------------------------ ReverseLaneBytes |
| |
| #ifdef HWY_NATIVE_REVERSE_LANE_BYTES |
| #undef HWY_NATIVE_REVERSE_LANE_BYTES |
| #else |
| #define HWY_NATIVE_REVERSE_LANE_BYTES |
| #endif |
| |
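// Reverses the byte order within the lane, e.g. 0x1234 -> 0x3412 for uint16_t.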
| HWY_API Vec1<uint16_t> ReverseLaneBytes(Vec1<uint16_t> v) { |
| const uint32_t val{v.raw}; |
| return Vec1<uint16_t>( |
| static_cast<uint16_t>(((val << 8) & 0xFF00u) | ((val >> 8) & 0x00FFu))); |
| } |
| |
| HWY_API Vec1<uint32_t> ReverseLaneBytes(Vec1<uint32_t> v) { |
| const uint32_t val = v.raw; |
| return Vec1<uint32_t>(static_cast<uint32_t>( |
| ((val << 24) & 0xFF000000u) | ((val << 8) & 0x00FF0000u) | |
| ((val >> 8) & 0x0000FF00u) | ((val >> 24) & 0x000000FFu))); |
| } |
| |
| HWY_API Vec1<uint64_t> ReverseLaneBytes(Vec1<uint64_t> v) { |
| const uint64_t val = v.raw; |
| return Vec1<uint64_t>(static_cast<uint64_t>( |
| ((val << 56) & 0xFF00000000000000u) | |
| ((val << 40) & 0x00FF000000000000u) | |
| ((val << 24) & 0x0000FF0000000000u) | ((val << 8) & 0x000000FF00000000u) | |
| ((val >> 8) & 0x00000000FF000000u) | ((val >> 24) & 0x0000000000FF0000u) | |
| ((val >> 40) & 0x000000000000FF00u) | |
| ((val >> 56) & 0x00000000000000FFu))); |
| } |
| |
| template <class V, HWY_IF_SIGNED_V(V), |
| HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))> |
| HWY_API V ReverseLaneBytes(V v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| return BitCast(d, ReverseLaneBytes(BitCast(du, v))); |
| } |
| |
| // ------------------------------ ReverseBits |
| #ifdef HWY_NATIVE_REVERSE_BITS_UI8 |
| #undef HWY_NATIVE_REVERSE_BITS_UI8 |
| #else |
| #define HWY_NATIVE_REVERSE_BITS_UI8 |
| #endif |
| |
| #ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64 |
| #undef HWY_NATIVE_REVERSE_BITS_UI16_32_64 |
| #else |
| #define HWY_NATIVE_REVERSE_BITS_UI16_32_64 |
| #endif |
| |
| namespace detail { |
| |
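// Reverses the bits within each byte of `val` by swapping adjacent bits, then
// adjacent 2-bit groups, then the two nibbles. For example, the byte
// 0b10110001 becomes 0b01110010, then 0b11011000, then 0b10001101, which is
// the bit reversal of the original byte.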
| template <class T> |
| HWY_INLINE T ReverseBitsOfEachByte(T val) { |
| using TU = MakeUnsigned<T>; |
| constexpr TU kMaxUnsignedVal{LimitsMax<TU>()}; |
| constexpr TU kShrMask1 = |
| static_cast<TU>(0x5555555555555555u & kMaxUnsignedVal); |
| constexpr TU kShrMask2 = |
| static_cast<TU>(0x3333333333333333u & kMaxUnsignedVal); |
| constexpr TU kShrMask3 = |
| static_cast<TU>(0x0F0F0F0F0F0F0F0Fu & kMaxUnsignedVal); |
| |
| constexpr TU kShlMask1 = static_cast<TU>(~kShrMask1); |
| constexpr TU kShlMask2 = static_cast<TU>(~kShrMask2); |
| constexpr TU kShlMask3 = static_cast<TU>(~kShrMask3); |
| |
| TU result = static_cast<TU>(val); |
| result = static_cast<TU>(((result << 1) & kShlMask1) | |
| ((result >> 1) & kShrMask1)); |
| result = static_cast<TU>(((result << 2) & kShlMask2) | |
| ((result >> 2) & kShrMask2)); |
| result = static_cast<TU>(((result << 4) & kShlMask3) | |
| ((result >> 4) & kShrMask3)); |
| return static_cast<T>(result); |
| } |
| |
| } // namespace detail |
| |
| template <class V, HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 1)> |
| HWY_API V ReverseBits(V v) { |
| return V(detail::ReverseBitsOfEachByte(v.raw)); |
| } |
| |
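// For lanes wider than one byte, reversing all bits of the lane is equivalent
// to reversing the bits within each byte and then reversing the byte order.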
| template <class V, HWY_IF_UNSIGNED_V(V), |
| HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))> |
| HWY_API V ReverseBits(V v) { |
| return ReverseLaneBytes(V(detail::ReverseBitsOfEachByte(v.raw))); |
| } |
| |
| template <class V, HWY_IF_SIGNED_V(V)> |
| HWY_API V ReverseBits(V v) { |
| const DFromV<decltype(v)> d; |
| const RebindToUnsigned<decltype(d)> du; |
| return BitCast(d, ReverseBits(BitCast(du, v))); |
| } |
| |
| // ------------------------------ SlideUpLanes |
| |
| template <typename D> |
| HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) { |
| return v; |
| } |
| |
| // ------------------------------ SlideDownLanes |
| |
| template <typename D> |
| HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) { |
| return v; |
| } |
| |
| // ================================================== BLOCKWISE |
| // Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported. |
| |
| // ------------------------------ Broadcast/splat any lane |
| |
| template <int kLane, typename T> |
| HWY_API Vec1<T> Broadcast(const Vec1<T> v) { |
| static_assert(kLane == 0, "Scalar only has one lane"); |
| return v; |
| } |
| |
| // ------------------------------ TableLookupBytes, TableLookupBytesOr0 |
| |
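// Gathers bytes from `in`: byte i of the result is the byte of `in` selected
// by byte i of `indices`. TableLookupBytesOr0 additionally returns 0 for any
// index byte whose sign bit (0x80) is set. Both overloads assume T and TI
// have the same size.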
| template <typename T, typename TI> |
| HWY_API Vec1<TI> TableLookupBytes(const Vec1<T> in, const Vec1<TI> indices) { |
| uint8_t in_bytes[sizeof(T)]; |
| uint8_t idx_bytes[sizeof(T)]; |
| uint8_t out_bytes[sizeof(T)]; |
| CopyBytes<sizeof(T)>(&in, &in_bytes); // copy to bytes |
| CopyBytes<sizeof(T)>(&indices, &idx_bytes); |
| for (size_t i = 0; i < sizeof(T); ++i) { |
| out_bytes[i] = in_bytes[idx_bytes[i]]; |
| } |
| TI out; |
| CopyBytes<sizeof(TI)>(&out_bytes, &out); |
| return Vec1<TI>{out}; |
| } |
| |
| template <typename T, typename TI> |
| HWY_API Vec1<TI> TableLookupBytesOr0(const Vec1<T> in, const Vec1<TI> indices) { |
| uint8_t in_bytes[sizeof(T)]; |
| uint8_t idx_bytes[sizeof(T)]; |
| uint8_t out_bytes[sizeof(T)]; |
| CopyBytes<sizeof(T)>(&in, &in_bytes); // copy to bytes |
| CopyBytes<sizeof(T)>(&indices, &idx_bytes); |
| for (size_t i = 0; i < sizeof(T); ++i) { |
| out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]]; |
| } |
| TI out; |
| CopyBytes<sizeof(TI)>(&out_bytes, &out); |
| return Vec1<TI>{out}; |
| } |
| |
| // ------------------------------ ZipLower |
| |
| HWY_API Vec1<uint16_t> ZipLower(Vec1<uint8_t> a, Vec1<uint8_t> b) { |
| return Vec1<uint16_t>(static_cast<uint16_t>((uint32_t{b.raw} << 8) + a.raw)); |
| } |
| HWY_API Vec1<uint32_t> ZipLower(Vec1<uint16_t> a, Vec1<uint16_t> b) { |
| return Vec1<uint32_t>((uint32_t{b.raw} << 16) + a.raw); |
| } |
| HWY_API Vec1<uint64_t> ZipLower(Vec1<uint32_t> a, Vec1<uint32_t> b) { |
| return Vec1<uint64_t>((uint64_t{b.raw} << 32) + a.raw); |
| } |
| HWY_API Vec1<int16_t> ZipLower(Vec1<int8_t> a, Vec1<int8_t> b) { |
| return Vec1<int16_t>(static_cast<int16_t>((int32_t{b.raw} << 8) + a.raw)); |
| } |
| HWY_API Vec1<int32_t> ZipLower(Vec1<int16_t> a, Vec1<int16_t> b) { |
| return Vec1<int32_t>((int32_t{b.raw} << 16) + a.raw); |
| } |
| HWY_API Vec1<int64_t> ZipLower(Vec1<int32_t> a, Vec1<int32_t> b) { |
| return Vec1<int64_t>((int64_t{b.raw} << 32) + a.raw); |
| } |
| |
| template <class DW, typename TW = TFromD<DW>, typename TN = MakeNarrow<TW>> |
| HWY_API Vec1<TW> ZipLower(DW /* tag */, Vec1<TN> a, Vec1<TN> b) { |
| return Vec1<TW>(static_cast<TW>((TW{b.raw} << (sizeof(TN) * 8)) + a.raw)); |
| } |
| |
| // ================================================== MASK |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API bool AllFalse(D /* tag */, const Mask1<T> mask) { |
| return mask.bits == 0; |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API bool AllTrue(D /* tag */, const Mask1<T> mask) { |
| return mask.bits != 0; |
| } |
| |
// `bits` points to at least 8 readable bytes, not all of which need be valid.
| template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> |
| HWY_API Mask1<T> LoadMaskBits(D /* tag */, const uint8_t* HWY_RESTRICT bits) { |
| return Mask1<T>::FromBool((bits[0] & 1) != 0); |
| } |
| |
| template <class D, HWY_IF_LANES_D(D, 1)> |
| HWY_API MFromD<D> Dup128MaskFromMaskBits(D /*d*/, unsigned mask_bits) { |
| return MFromD<D>::FromBool((mask_bits & 1) != 0); |
| } |
| |
// `bits` points to at least 8 writable bytes.
| template <class D, typename T = TFromD<D>> |
| HWY_API size_t StoreMaskBits(D d, const Mask1<T> mask, uint8_t* bits) { |
| *bits = AllTrue(d, mask); |
| return 1; |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API size_t CountTrue(D /* tag */, const Mask1<T> mask) { |
| return mask.bits == 0 ? 0 : 1; |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API intptr_t FindFirstTrue(D /* tag */, const Mask1<T> mask) { |
| return mask.bits == 0 ? -1 : 0; |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API size_t FindKnownFirstTrue(D /* tag */, const Mask1<T> /* m */) { |
| return 0; // There is only one lane and we know it is true. |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API intptr_t FindLastTrue(D /* tag */, const Mask1<T> mask) { |
| return mask.bits == 0 ? -1 : 0; |
| } |
| |
| template <class D, typename T = TFromD<D>> |
| HWY_API size_t FindKnownLastTrue(D /* tag */, const Mask1<T> /* m */) { |
| return 0; // There is only one lane and we know it is true. |
| } |
| |
| // ------------------------------ Compress, CompressBits |
| |
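// Highway convention: a nonzero `value` advertises that Compress places all
// selected lanes first, followed by the remaining lanes, rather than leaving
// the non-selected lanes unspecified. With a single lane this holds trivially.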
| template <typename T> |
| struct CompressIsPartition { |
| enum { value = 1 }; |
| }; |
| |
| template <typename T> |
| HWY_API Vec1<T> Compress(Vec1<T> v, const Mask1<T> /* mask */) { |
| // A single lane is already partitioned by definition. |
| return v; |
| } |
| |
| template <typename T> |
| HWY_API Vec1<T> CompressNot(Vec1<T> v, const Mask1<T> /* mask */) { |
| // A single lane is already partitioned by definition. |
| return v; |
| } |
| |
| // ------------------------------ CompressStore |
| template <class D, typename T = TFromD<D>> |
| HWY_API size_t CompressStore(Vec1<T> v, const Mask1<T> mask, D d, |
| T* HWY_RESTRICT unaligned) { |
| StoreU(Compress(v, mask), d, unaligned); |
| return CountTrue(d, mask); |
| } |
| |
| // ------------------------------ CompressBlendedStore |
| template <class D, typename T = TFromD<D>> |
| HWY_API size_t CompressBlendedStore(Vec1<T> v, const Mask1<T> mask, D d, |
| T* HWY_RESTRICT unaligned) { |
| if (!mask.bits) return 0; |
| StoreU(v, d, unaligned); |
| return 1; |
| } |
| |
| // ------------------------------ CompressBits |
| template <typename T> |
| HWY_API Vec1<T> CompressBits(Vec1<T> v, const uint8_t* HWY_RESTRICT /*bits*/) { |
| return v; |
| } |
| |
| // ------------------------------ CompressBitsStore |
| template <class D, typename T = TFromD<D>> |
| HWY_API size_t CompressBitsStore(Vec1<T> v, const uint8_t* HWY_RESTRICT bits, |
| D d, T* HWY_RESTRICT unaligned) { |
| const Mask1<T> mask = LoadMaskBits(d, bits); |
| StoreU(Compress(v, mask), d, unaligned); |
| return CountTrue(d, mask); |
| } |
| |
| // ------------------------------ Expand |
| |
| // generic_ops-inl.h requires Vec64/128, so implement [Load]Expand here. |
| #ifdef HWY_NATIVE_EXPAND |
| #undef HWY_NATIVE_EXPAND |
| #else |
| #define HWY_NATIVE_EXPAND |
| #endif |
| |
| template <typename T> |
| HWY_API Vec1<T> Expand(Vec1<T> v, const Mask1<T> mask) { |
| return IfThenElseZero(mask, v); |
| } |
| |
| // ------------------------------ LoadExpand |
| template <class D> |
| HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d, |
| const TFromD<D>* HWY_RESTRICT unaligned) { |
| return MaskedLoad(mask, d, unaligned); |
| } |
| |
| // ------------------------------ WidenMulPairwiseAdd |
| |
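// With a single lane per input there is no adjacent pair to add, so the result
// is simply the widened product of the two lanes.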
| template <class D32, HWY_IF_F32_D(D32)> |
| HWY_API Vec1<float> WidenMulPairwiseAdd(D32 /* tag */, Vec1<bfloat16_t> a, |
| Vec1<bfloat16_t> b) { |
| return Vec1<float>(F32FromBF16(a.raw)) * Vec1<float>(F32FromBF16(b.raw)); |
| } |
| |
| template <class D32, HWY_IF_I32_D(D32)> |
| HWY_API Vec1<int32_t> WidenMulPairwiseAdd(D32 /* tag */, Vec1<int16_t> a, |
| Vec1<int16_t> b) { |
| return Vec1<int32_t>(a.raw * b.raw); |
| } |
| |
| // ------------------------------ SatWidenMulAccumFixedPoint |
| #ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT |
| #undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT |
| #else |
| #define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT |
| #endif |
| |
| template <class DI32, HWY_IF_I32_D(DI32)> |
| HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32, |
| VFromD<Rebind<int16_t, DI32>> a, |
| VFromD<Rebind<int16_t, DI32>> b, |
| VFromD<DI32> sum) { |
// Multiplying static_cast<int32_t>(a.raw) by static_cast<int32_t>(b.raw) and
// then adding the product to itself is okay: a.raw * b.raw * 2 lies in
// [-2147418112, 2147483648] and can only exceed the int32_t range when both
// a.raw and b.raw are equal to -32768, in which case product2 wraps to
// LimitsMin<int32_t>() and is detected by the mul_overflow mask below.
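// Worked extremes (illustrative): a.raw = -32768, b.raw = 32767 gives
// product = -1073709056 and product2 = -2147418112 (no wrap), whereas
// a.raw = b.raw = -32768 gives product = 1073741824 and product2 wraps to
// -2147483648, which the mask detects.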
| |
| const VFromD<DI32> product(static_cast<int32_t>(a.raw) * |
| static_cast<int32_t>(b.raw)); |
| const VFromD<DI32> product2 = Add(product, product); |
| |
| const auto mul_overflow = |
| VecFromMask(di32, Eq(product2, Set(di32, LimitsMin<int32_t>()))); |
| |
| return SaturatedAdd(Sub(sum, And(BroadcastSignBit(sum), mul_overflow)), |
| Add(product2, mul_overflow)); |
| } |
| |
| // ------------------------------ SatWidenMulPairwiseAdd |
| |
| #ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD |
| #undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD |
| #else |
| #define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD |
| #endif |
| |
| template <class DI16, HWY_IF_I16_D(DI16)> |
| HWY_API Vec1<int16_t> SatWidenMulPairwiseAdd(DI16 /* tag */, Vec1<uint8_t> a, |
| Vec1<int8_t> b) { |
// Saturating a.raw * b.raw is not needed on the HWY_SCALAR target: the input
// vectors have only one lane, and a.raw * b.raw is between -32640 and 32385,
// which already fits in an int16_t.

// On other targets, the addition a[0]*b[0] + a[1]*b[1] must be saturated
// because it can overflow an int16_t when a[0], a[1], b[0], and b[1] are all
// non-zero and b[0] and b[1] have the same sign.
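// (Extremes of a.raw * b.raw here: 255 * 127 = 32385 and 255 * -128 = -32640.)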
| |
| return Vec1<int16_t>(static_cast<int16_t>(a.raw) * |
| static_cast<int16_t>(b.raw)); |
| } |
| |
| // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) |
| |
| #ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16 |
| #undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16 |
| #else |
| #define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16 |
| #endif |
| |
| template <class D32, HWY_IF_F32_D(D32)> |
| HWY_API Vec1<float> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<bfloat16_t> a, |
| Vec1<bfloat16_t> b, |
| const Vec1<float> sum0, |
| Vec1<float>& /* sum1 */) { |
| return MulAdd(Vec1<float>(F32FromBF16(a.raw)), |
| Vec1<float>(F32FromBF16(b.raw)), sum0); |
| } |
| |
| template <class D32, HWY_IF_I32_D(D32)> |
| HWY_API Vec1<int32_t> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<int16_t> a, |
| Vec1<int16_t> b, |
| const Vec1<int32_t> sum0, |
| Vec1<int32_t>& /* sum1 */) { |
| return Vec1<int32_t>(a.raw * b.raw + sum0.raw); |
| } |
| |
| template <class DU32, HWY_IF_U32_D(DU32)> |
| HWY_API Vec1<uint32_t> ReorderWidenMulAccumulate(DU32 /* tag */, |
| Vec1<uint16_t> a, |
| Vec1<uint16_t> b, |
| const Vec1<uint32_t> sum0, |
| Vec1<uint32_t>& /* sum1 */) { |
| return Vec1<uint32_t>(static_cast<uint32_t>(a.raw) * b.raw + sum0.raw); |
| } |
| |
| // ------------------------------ RearrangeToOddPlusEven |
| template <typename TW> |
| HWY_API Vec1<TW> RearrangeToOddPlusEven(Vec1<TW> sum0, Vec1<TW> /* sum1 */) { |
| return sum0; // invariant already holds |
| } |
| |
| // ================================================== REDUCTIONS |
| |
// Nothing native; generic_ops-inl.h defines SumOfLanes and ReduceSum.
| |
| // NOLINTNEXTLINE(google-readability-namespace-comments) |
| } // namespace HWY_NAMESPACE |
| } // namespace hwy |
| HWY_AFTER_NAMESPACE(); |