| // Copyright 2023 Google LLC | 
 | // SPDX-License-Identifier: Apache-2.0 | 
 | // | 
 | // Licensed under the Apache License, Version 2.0 (the "License"); | 
 | // you may not use this file except in compliance with the License. | 
 | // You may obtain a copy of the License at | 
 | // | 
 | //      http://www.apache.org/licenses/LICENSE-2.0 | 
 | // | 
 | // Unless required by applicable law or agreed to in writing, software | 
 | // distributed under the License is distributed on an "AS IS" BASIS, | 
 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
 | // See the License for the specific language governing permissions and | 
 | // limitations under the License. | 
 |  | 
 | // Must be included inside an existing include guard, with the following ops | 
 | // already defined: BitCast, And, Set, ShiftLeft, ShiftRight, PromoteLowerTo, | 
 | // ConcatEven, ConcatOdd, plus the optional detail::PromoteEvenTo and | 
 | // detail::PromoteOddTo (if implemented in the target-specific header). | 
 |  | 
 | // This is normally set by set_macros-inl.h before this header is included; | 
 | // if not, we are viewing this header standalone. Reduce IDE errors by: | 
 | #if !defined(HWY_NAMESPACE) | 
 | // 1) Defining HWY_IDE so we get syntax highlighting rather than all-gray text. | 
 | #include "third_party/highway/hwy/ops/shared-inl.h" | 
 | // 2) Entering the HWY_NAMESPACE to make definitions from shared-inl.h visible. | 
 | HWY_BEFORE_NAMESPACE(); | 
 | namespace hwy { | 
 | namespace HWY_NAMESPACE { | 
 | #define HWY_INSIDE_END_NAMESPACE | 
// 3) Providing a dummy VFromD (usually done by the target-specific header).
template <class D>
using VFromD = int;
// Dummy TFromV/DFromV so the uses below parse in the IDE; the real
// definitions come from the target-specific header.
template <class D>
using TFromV = int;
template <class D>
struct DFromV {};
 | #endif | 
 |  | 
 | // ------------------------------ Vec/Create/Get/Set2..4 | 
 |  | 
 | // On SVE and RVV, Vec2..4 are aliases to built-in types. Also exclude the | 
 | // fixed-size SVE targets. | 
 | #if HWY_IDE || (!HWY_HAVE_SCALABLE && !HWY_TARGET_IS_SVE) | 
 |  | 
 | // NOTE: these are used inside arm_neon-inl.h, hence they cannot be defined in | 
 | // generic_ops-inl.h, which is included after that. | 
// Tuple of two vectors with lane type/count determined by D.
template <class D>
struct Vec2 {
  VFromD<D> v0;  // element 0, accessed via Get2<0>/Set2<0>
  VFromD<D> v1;  // element 1
};
 |  | 
// Tuple of three vectors with lane type/count determined by D.
template <class D>
struct Vec3 {
  VFromD<D> v0;  // element 0, accessed via Get3<0>/Set3<0>
  VFromD<D> v1;  // element 1
  VFromD<D> v2;  // element 2
};
 |  | 
// Tuple of four vectors with lane type/count determined by D.
template <class D>
struct Vec4 {
  VFromD<D> v0;  // element 0, accessed via Get4<0>/Set4<0>
  VFromD<D> v1;  // element 1
  VFromD<D> v2;  // element 2
  VFromD<D> v3;  // element 3
};
 |  | 
 | // D arg is unused but allows deducing D. | 
 | template <class D> | 
 | HWY_API Vec2<D> Create2(D /* tag */, VFromD<D> v0, VFromD<D> v1) { | 
 |   return Vec2<D>{v0, v1}; | 
 | } | 
 |  | 
 | template <class D> | 
 | HWY_API Vec3<D> Create3(D /* tag */, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2) { | 
 |   return Vec3<D>{v0, v1, v2}; | 
 | } | 
 |  | 
 | template <class D> | 
 | HWY_API Vec4<D> Create4(D /* tag */, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, | 
 |                         VFromD<D> v3) { | 
 |   return Vec4<D>{v0, v1, v2, v3}; | 
 | } | 
 |  | 
 | template <size_t kIndex, class D> | 
 | HWY_API VFromD<D> Get2(Vec2<D> tuple) { | 
 |   static_assert(kIndex < 2, "Tuple index out of bounds"); | 
 |   return kIndex == 0 ? tuple.v0 : tuple.v1; | 
 | } | 
 |  | 
 | template <size_t kIndex, class D> | 
 | HWY_API VFromD<D> Get3(Vec3<D> tuple) { | 
 |   static_assert(kIndex < 3, "Tuple index out of bounds"); | 
 |   return kIndex == 0 ? tuple.v0 : kIndex == 1 ? tuple.v1 : tuple.v2; | 
 | } | 
 |  | 
 | template <size_t kIndex, class D> | 
 | HWY_API VFromD<D> Get4(Vec4<D> tuple) { | 
 |   static_assert(kIndex < 4, "Tuple index out of bounds"); | 
 |   return kIndex == 0   ? tuple.v0 | 
 |          : kIndex == 1 ? tuple.v1 | 
 |          : kIndex == 2 ? tuple.v2 | 
 |                        : tuple.v3; | 
 | } | 
 |  | 
 | template <size_t kIndex, class D> | 
 | HWY_API Vec2<D> Set2(Vec2<D> tuple, VFromD<D> val) { | 
 |   static_assert(kIndex < 2, "Tuple index out of bounds"); | 
 |   if (kIndex == 0) { | 
 |     tuple.v0 = val; | 
 |   } else { | 
 |     tuple.v1 = val; | 
 |   } | 
 |   return tuple; | 
 | } | 
 |  | 
 | template <size_t kIndex, class D> | 
 | HWY_API Vec3<D> Set3(Vec3<D> tuple, VFromD<D> val) { | 
 |   static_assert(kIndex < 3, "Tuple index out of bounds"); | 
 |   if (kIndex == 0) { | 
 |     tuple.v0 = val; | 
 |   } else if (kIndex == 1) { | 
 |     tuple.v1 = val; | 
 |   } else { | 
 |     tuple.v2 = val; | 
 |   } | 
 |   return tuple; | 
 | } | 
 |  | 
 | template <size_t kIndex, class D> | 
 | HWY_API Vec4<D> Set4(Vec4<D> tuple, VFromD<D> val) { | 
 |   static_assert(kIndex < 4, "Tuple index out of bounds"); | 
 |   if (kIndex == 0) { | 
 |     tuple.v0 = val; | 
 |   } else if (kIndex == 1) { | 
 |     tuple.v1 = val; | 
 |   } else if (kIndex == 2) { | 
 |     tuple.v2 = val; | 
 |   } else { | 
 |     tuple.v3 = val; | 
 |   } | 
 |   return tuple; | 
 | } | 
 |  | 
#endif  // HWY_IDE || (!HWY_HAVE_SCALABLE && !HWY_TARGET_IS_SVE)
 |  | 
 | // ------------------------------ Rol/Ror (And, Or, Neg, Shl, Shr) | 
 | #if (defined(HWY_NATIVE_ROL_ROR_8) == defined(HWY_TARGET_TOGGLE)) | 
 | #ifdef HWY_NATIVE_ROL_ROR_8 | 
 | #undef HWY_NATIVE_ROL_ROR_8 | 
 | #else | 
 | #define HWY_NATIVE_ROL_ROR_8 | 
 | #endif | 
 |  | 
 | template <class V, HWY_IF_UI8(TFromV<V>)> | 
 | HWY_API V Rol(V a, V b) { | 
 |   const DFromV<decltype(a)> d; | 
 |   const RebindToSigned<decltype(d)> di; | 
 |   const RebindToUnsigned<decltype(d)> du; | 
 |  | 
 |   const auto shift_amt_mask = Set(du, uint8_t{7}); | 
 |   const auto shl_amt = And(BitCast(du, b), shift_amt_mask); | 
 |   const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask); | 
 |  | 
 |   const auto vu = BitCast(du, a); | 
 |   return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt))); | 
 | } | 
 |  | 
 | template <class V, HWY_IF_UI8(TFromV<V>)> | 
 | HWY_API V Ror(V a, V b) { | 
 |   const DFromV<decltype(a)> d; | 
 |   const RebindToSigned<decltype(d)> di; | 
 |   const RebindToUnsigned<decltype(d)> du; | 
 |  | 
 |   const auto shift_amt_mask = Set(du, uint8_t{7}); | 
 |   const auto shr_amt = And(BitCast(du, b), shift_amt_mask); | 
 |   const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask); | 
 |  | 
 |   const auto vu = BitCast(du, a); | 
 |   return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt))); | 
 | } | 
 |  | 
 | #endif  // HWY_NATIVE_ROL_ROR_8 | 
 |  | 
 | #if (defined(HWY_NATIVE_ROL_ROR_16) == defined(HWY_TARGET_TOGGLE)) | 
 | #ifdef HWY_NATIVE_ROL_ROR_16 | 
 | #undef HWY_NATIVE_ROL_ROR_16 | 
 | #else | 
 | #define HWY_NATIVE_ROL_ROR_16 | 
 | #endif | 
 |  | 
 | template <class V, HWY_IF_UI16(TFromV<V>)> | 
 | HWY_API V Rol(V a, V b) { | 
 |   const DFromV<decltype(a)> d; | 
 |   const RebindToSigned<decltype(d)> di; | 
 |   const RebindToUnsigned<decltype(d)> du; | 
 |  | 
 |   const auto shift_amt_mask = Set(du, uint16_t{15}); | 
 |   const auto shl_amt = And(BitCast(du, b), shift_amt_mask); | 
 |   const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask); | 
 |  | 
 |   const auto vu = BitCast(du, a); | 
 |   return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt))); | 
 | } | 
 |  | 
 | template <class V, HWY_IF_UI16(TFromV<V>)> | 
 | HWY_API V Ror(V a, V b) { | 
 |   const DFromV<decltype(a)> d; | 
 |   const RebindToSigned<decltype(d)> di; | 
 |   const RebindToUnsigned<decltype(d)> du; | 
 |  | 
 |   const auto shift_amt_mask = Set(du, uint16_t{15}); | 
 |   const auto shr_amt = And(BitCast(du, b), shift_amt_mask); | 
 |   const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask); | 
 |  | 
 |   const auto vu = BitCast(du, a); | 
 |   return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt))); | 
 | } | 
 |  | 
 | #endif  // HWY_NATIVE_ROL_ROR_16 | 
 |  | 
 | #if (defined(HWY_NATIVE_ROL_ROR_32_64) == defined(HWY_TARGET_TOGGLE)) | 
 | #ifdef HWY_NATIVE_ROL_ROR_32_64 | 
 | #undef HWY_NATIVE_ROL_ROR_32_64 | 
 | #else | 
 | #define HWY_NATIVE_ROL_ROR_32_64 | 
 | #endif | 
 |  | 
 | template <class V, HWY_IF_UI32(TFromV<V>)> | 
 | HWY_API V Rol(V a, V b) { | 
 |   const DFromV<decltype(a)> d; | 
 |   const RebindToSigned<decltype(d)> di; | 
 |   const RebindToUnsigned<decltype(d)> du; | 
 |  | 
 |   const auto shift_amt_mask = Set(du, uint32_t{31}); | 
 |   const auto shl_amt = And(BitCast(du, b), shift_amt_mask); | 
 |   const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask); | 
 |  | 
 |   const auto vu = BitCast(du, a); | 
 |   return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt))); | 
 | } | 
 |  | 
 | template <class V, HWY_IF_UI32(TFromV<V>)> | 
 | HWY_API V Ror(V a, V b) { | 
 |   const DFromV<decltype(a)> d; | 
 |   const RebindToSigned<decltype(d)> di; | 
 |   const RebindToUnsigned<decltype(d)> du; | 
 |  | 
 |   const auto shift_amt_mask = Set(du, uint32_t{31}); | 
 |   const auto shr_amt = And(BitCast(du, b), shift_amt_mask); | 
 |   const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask); | 
 |  | 
 |   const auto vu = BitCast(du, a); | 
 |   return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt))); | 
 | } | 
 |  | 
 | #if HWY_HAVE_INTEGER64 | 
 | template <class V, HWY_IF_UI64(TFromV<V>)> | 
 | HWY_API V Rol(V a, V b) { | 
 |   const DFromV<decltype(a)> d; | 
 |   const RebindToSigned<decltype(d)> di; | 
 |   const RebindToUnsigned<decltype(d)> du; | 
 |  | 
 |   const auto shift_amt_mask = Set(du, uint64_t{63}); | 
 |   const auto shl_amt = And(BitCast(du, b), shift_amt_mask); | 
 |   const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask); | 
 |  | 
 |   const auto vu = BitCast(du, a); | 
 |   return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt))); | 
 | } | 
 |  | 
 | template <class V, HWY_IF_UI64(TFromV<V>)> | 
 | HWY_API V Ror(V a, V b) { | 
 |   const DFromV<decltype(a)> d; | 
 |   const RebindToSigned<decltype(d)> di; | 
 |   const RebindToUnsigned<decltype(d)> du; | 
 |  | 
 |   const auto shift_amt_mask = Set(du, uint64_t{63}); | 
 |   const auto shr_amt = And(BitCast(du, b), shift_amt_mask); | 
 |   const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask); | 
 |  | 
 |   const auto vu = BitCast(du, a); | 
 |   return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt))); | 
 | } | 
 | #endif  // HWY_HAVE_INTEGER64 | 
 |  | 
 | #endif  // HWY_NATIVE_ROL_ROR_32_64 | 
 |  | 
 | // ------------------------------ RotateLeftSame/RotateRightSame | 
 |  | 
 | #if (defined(HWY_NATIVE_ROL_ROR_SAME_8) == defined(HWY_TARGET_TOGGLE)) | 
 | #ifdef HWY_NATIVE_ROL_ROR_SAME_8 | 
 | #undef HWY_NATIVE_ROL_ROR_SAME_8 | 
 | #else | 
 | #define HWY_NATIVE_ROL_ROR_SAME_8 | 
 | #endif | 
 |  | 
 | template <class V, HWY_IF_UI8(TFromV<V>)> | 
 | HWY_API V RotateLeftSame(V v, int bits) { | 
 |   const DFromV<decltype(v)> d; | 
 |   const RebindToUnsigned<decltype(d)> du; | 
 |  | 
 |   const int shl_amt = bits & 7; | 
 |   const int shr_amt = static_cast<int>((0u - static_cast<unsigned>(bits)) & 7u); | 
 |  | 
 |   const auto vu = BitCast(du, v); | 
 |   return BitCast(d, | 
 |                  Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt))); | 
 | } | 
 |  | 
 | template <class V, HWY_IF_UI8(TFromV<V>)> | 
 | HWY_API V RotateRightSame(V v, int bits) { | 
 |   const DFromV<decltype(v)> d; | 
 |   const RebindToUnsigned<decltype(d)> du; | 
 |  | 
 |   const int shr_amt = bits & 7; | 
 |   const int shl_amt = static_cast<int>((0u - static_cast<unsigned>(bits)) & 7u); | 
 |  | 
 |   const auto vu = BitCast(du, v); | 
 |   return BitCast(d, | 
 |                  Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt))); | 
 | } | 
 |  | 
 | #endif  // HWY_NATIVE_ROL_ROR_SAME_8 | 
 |  | 
 | #if (defined(HWY_NATIVE_ROL_ROR_SAME_16) == defined(HWY_TARGET_TOGGLE)) | 
 | #ifdef HWY_NATIVE_ROL_ROR_SAME_16 | 
 | #undef HWY_NATIVE_ROL_ROR_SAME_16 | 
 | #else | 
 | #define HWY_NATIVE_ROL_ROR_SAME_16 | 
 | #endif | 
 |  | 
 | template <class V, HWY_IF_UI16(TFromV<V>)> | 
 | HWY_API V RotateLeftSame(V v, int bits) { | 
 |   const DFromV<decltype(v)> d; | 
 |   const RebindToUnsigned<decltype(d)> du; | 
 |  | 
 |   const int shl_amt = bits & 15; | 
 |   const int shr_amt = | 
 |       static_cast<int>((0u - static_cast<unsigned>(bits)) & 15u); | 
 |  | 
 |   const auto vu = BitCast(du, v); | 
 |   return BitCast(d, | 
 |                  Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt))); | 
 | } | 
 |  | 
 | template <class V, HWY_IF_UI16(TFromV<V>)> | 
 | HWY_API V RotateRightSame(V v, int bits) { | 
 |   const DFromV<decltype(v)> d; | 
 |   const RebindToUnsigned<decltype(d)> du; | 
 |  | 
 |   const int shr_amt = bits & 15; | 
 |   const int shl_amt = | 
 |       static_cast<int>((0u - static_cast<unsigned>(bits)) & 15u); | 
 |  | 
 |   const auto vu = BitCast(du, v); | 
 |   return BitCast(d, | 
 |                  Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt))); | 
 | } | 
 | #endif  // HWY_NATIVE_ROL_ROR_SAME_16 | 
 |  | 
 | #if (defined(HWY_NATIVE_ROL_ROR_SAME_32_64) == defined(HWY_TARGET_TOGGLE)) | 
 | #ifdef HWY_NATIVE_ROL_ROR_SAME_32_64 | 
 | #undef HWY_NATIVE_ROL_ROR_SAME_32_64 | 
 | #else | 
 | #define HWY_NATIVE_ROL_ROR_SAME_32_64 | 
 | #endif | 
 |  | 
 | template <class V, HWY_IF_UI32(TFromV<V>)> | 
 | HWY_API V RotateLeftSame(V v, int bits) { | 
 |   const DFromV<decltype(v)> d; | 
 |   const RebindToUnsigned<decltype(d)> du; | 
 |  | 
 |   const int shl_amt = bits & 31; | 
 |   const int shr_amt = | 
 |       static_cast<int>((0u - static_cast<unsigned>(bits)) & 31u); | 
 |  | 
 |   const auto vu = BitCast(du, v); | 
 |   return BitCast(d, | 
 |                  Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt))); | 
 | } | 
 |  | 
 | template <class V, HWY_IF_UI32(TFromV<V>)> | 
 | HWY_API V RotateRightSame(V v, int bits) { | 
 |   const DFromV<decltype(v)> d; | 
 |   const RebindToUnsigned<decltype(d)> du; | 
 |  | 
 |   const int shr_amt = bits & 31; | 
 |   const int shl_amt = | 
 |       static_cast<int>((0u - static_cast<unsigned>(bits)) & 31u); | 
 |  | 
 |   const auto vu = BitCast(du, v); | 
 |   return BitCast(d, | 
 |                  Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt))); | 
 | } | 
 |  | 
 | #if HWY_HAVE_INTEGER64 | 
 | template <class V, HWY_IF_UI64(TFromV<V>)> | 
 | HWY_API V RotateLeftSame(V v, int bits) { | 
 |   const DFromV<decltype(v)> d; | 
 |   const RebindToUnsigned<decltype(d)> du; | 
 |  | 
 |   const int shl_amt = bits & 63; | 
 |   const int shr_amt = | 
 |       static_cast<int>((0u - static_cast<unsigned>(bits)) & 63u); | 
 |  | 
 |   const auto vu = BitCast(du, v); | 
 |   return BitCast(d, | 
 |                  Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt))); | 
 | } | 
 |  | 
 | template <class V, HWY_IF_UI64(TFromV<V>)> | 
 | HWY_API V RotateRightSame(V v, int bits) { | 
 |   const DFromV<decltype(v)> d; | 
 |   const RebindToUnsigned<decltype(d)> du; | 
 |  | 
 |   const int shr_amt = bits & 63; | 
 |   const int shl_amt = | 
 |       static_cast<int>((0u - static_cast<unsigned>(bits)) & 63u); | 
 |  | 
 |   const auto vu = BitCast(du, v); | 
 |   return BitCast(d, | 
 |                  Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt))); | 
 | } | 
 | #endif  // HWY_HAVE_INTEGER64 | 
 |  | 
 | #endif  // HWY_NATIVE_ROL_ROR_SAME_32_64 | 
 |  | 
 | // ------------------------------ PromoteEvenTo/PromoteOddTo | 
 |  | 
 | // These are used by target-specific headers for ReorderWidenMulAccumulate etc. | 
 |  | 
 | #if HWY_TARGET != HWY_SCALAR || HWY_IDE | 
 | namespace detail { | 
 |  | 
 | // Tag dispatch is used in detail::PromoteEvenTo and detail::PromoteOddTo as | 
 | // there are target-specific specializations for some of the | 
 | // detail::PromoteEvenTo and detail::PromoteOddTo cases on | 
 | // SVE/PPC/SSE2/SSSE3/SSE4/AVX2. | 
 |  | 
 | // All targets except HWY_SCALAR use the implementations of | 
 | // detail::PromoteEvenTo and detail::PromoteOddTo in generic_ops-inl.h for at | 
 | // least some of the PromoteEvenTo and PromoteOddTo cases. | 
 |  | 
// Signed to signed PromoteEvenTo: widens the even-indexed lanes of v to the
// twice-as-wide signed lanes of d_to. kToLaneSize is the destination lane
// size in bytes, so kToLaneSize * 4 is the source lane width in bits.
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_TARGET_IS_SVE
  // The intrinsic expects the wide lane type.
  return NativePromoteEvenTo(BitCast(d_to, v));
#else
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, need to shift each lane of the bitcasted
  // vector left by kToLaneSize * 4 bits to get the bits of the even
  // source lanes into the upper kToLaneSize * 4 bits of even_in_hi.
  const auto even_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
#else
  // On big-endian targets, the bits of the even source lanes are already
  // in the upper kToLaneSize * 4 bits of the lanes of the bitcasted
  // vector.
  const auto even_in_hi = BitCast(d_to, v);
#endif

  // Right-shift even_in_hi by kToLaneSize * 4 bits; ShiftRight on signed
  // lanes sign-extends, which completes the signed promotion.
  return ShiftRight<kToLaneSize * 4>(even_in_hi);
#endif  // HWY_TARGET_IS_SVE
}
 |  | 
// Unsigned to unsigned PromoteEvenTo: widens (zero-extends) the even-indexed
// lanes of v to the twice-as-wide unsigned lanes of d_to.
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
    hwy::UnsignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_TARGET_IS_SVE
  // The intrinsic expects the wide lane type.
  return NativePromoteEvenTo(BitCast(d_to, v));
#else
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, the bits of the even source lanes are already
  // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.

  // Simply need to zero out the upper bits of each lane of the bitcasted
  // vector; LimitsMax of the narrow source type is the required mask.
  return And(BitCast(d_to, v),
             Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
#else
  // On big-endian targets, need to shift each lane of the bitcasted vector
  // right by kToLaneSize * 4 bits to get the bits of the even source lanes into
  // the lower kToLaneSize * 4 bits of the result.

  // The right shift below will zero out the upper kToLaneSize * 4 bits of the
  // result.
  return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
#endif
#endif  // HWY_TARGET_IS_SVE
}
 |  | 
// Signed to signed PromoteOddTo: widens (sign-extends) the odd-indexed lanes
// of v to the twice-as-wide signed lanes of d_to.
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, the bits of the odd source lanes are already in
  // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
  const auto odd_in_hi = BitCast(d_to, v);
#else
  // On big-endian targets, need to shift each lane of the bitcasted vector
  // left by kToLaneSize * 4 bits to get the bits of the odd source lanes into
  // the upper kToLaneSize * 4 bits of odd_in_hi.
  const auto odd_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
#endif

  // Right-shift odd_in_hi by kToLaneSize * 4 bits; ShiftRight on signed lanes
  // sign-extends, which completes the signed promotion.
  return ShiftRight<kToLaneSize * 4>(odd_in_hi);
}
 |  | 
// Unsigned to unsigned PromoteOddTo: widens (zero-extends) the odd-indexed
// lanes of v to the twice-as-wide unsigned lanes of d_to.
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    hwy::UnsignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, need to shift each lane of the bitcasted vector
  // right by kToLaneSize * 4 bits to get the bits of the odd source lanes into
  // the lower kToLaneSize * 4 bits of the result.

  // The right shift below will zero out the upper kToLaneSize * 4 bits of the
  // result.
  return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
#else
  // On big-endian targets, the bits of the odd source lanes are already
  // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.

  // Simply need to zero out the upper bits of each lane of the bitcasted
  // vector; LimitsMax of the narrow source type is the required mask.
  return And(BitCast(d_to, v),
             Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
#endif
}
 |  | 
 | // Unsigned to signed: Same as unsigned->unsigned PromoteEvenTo/PromoteOddTo | 
 | // followed by BitCast to signed | 
 | template <size_t kToLaneSize, class D, class V> | 
 | HWY_INLINE VFromD<D> PromoteEvenTo( | 
 |     hwy::SignedTag /*to_type_tag*/, | 
 |     hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/, | 
 |     hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) { | 
 |   const RebindToUnsigned<decltype(d_to)> du_to; | 
 |   return BitCast(d_to, | 
 |                  PromoteEvenTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(), | 
 |                                hwy::UnsignedTag(), du_to, v)); | 
 | } | 
 |  | 
 | template <size_t kToLaneSize, class D, class V> | 
 | HWY_INLINE VFromD<D> PromoteOddTo( | 
 |     hwy::SignedTag /*to_type_tag*/, | 
 |     hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/, | 
 |     hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) { | 
 |   const RebindToUnsigned<decltype(d_to)> du_to; | 
 |   return BitCast(d_to, | 
 |                  PromoteOddTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(), | 
 |                               hwy::UnsignedTag(), du_to, v)); | 
 | } | 
 |  | 
 | // BF16->F32 PromoteEvenTo | 
 |  | 
 | // NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag | 
 | // instead of hwy::FloatTag on targets that use scalable vectors. | 
 |  | 
 | // VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same | 
 | // type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>> | 
 |  | 
 | // The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered | 
 | // to be a bfloat16_t vector. | 
// BF16->F32: a bfloat16 value is the upper 16 bits of the corresponding f32
// bit pattern, so promotion only needs to position/mask 16-bit fields.
template <class FromTypeTag, class DF32, class VBF16,
          class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
          hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
HWY_INLINE VFromD<DF32> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
                                      hwy::SizeTag<4> /*to_lane_size_tag*/,
                                      FromTypeTag /*from_type_tag*/, DF32 d_to,
                                      VBF16 v) {
  const RebindToUnsigned<decltype(d_to)> du_to;
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian platforms, need to shift left each lane of the bitcasted
  // vector by 16 bits.
  return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
#else
  // On big-endian platforms, the even lanes of the source vector are already
  // in the upper 16 bits of the lanes of the bitcasted vector.

  // Need to simply zero out the lower 16 bits of each lane of the bitcasted
  // vector.
  return BitCast(d_to,
                 And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
#endif
}
 |  | 
 | // BF16->F32 PromoteOddTo | 
 |  | 
 | // NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag | 
 | // instead of hwy::FloatTag on targets that use scalable vectors. | 
 |  | 
 | // VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same | 
 | // type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>> | 
 |  | 
// The BF16->F32 PromoteOddTo overload is only enabled if VBF16 is considered
// to be a bfloat16_t vector.
// BF16->F32: a bfloat16 value is the upper 16 bits of the corresponding f32
// bit pattern, so promotion only needs to position/mask 16-bit fields.
template <class FromTypeTag, class DF32, class VBF16,
          class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
          hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
HWY_INLINE VFromD<DF32> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
                                     hwy::SizeTag<4> /*to_lane_size_tag*/,
                                     FromTypeTag /*from_type_tag*/, DF32 d_to,
                                     VBF16 v) {
  const RebindToUnsigned<decltype(d_to)> du_to;
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian platforms, the odd lanes of the source vector are already
  // in the upper 16 bits of the lanes of the bitcasted vector.

  // Need to simply zero out the lower 16 bits of each lane of the bitcasted
  // vector.
  return BitCast(d_to,
                 And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
#else
  // On big-endian platforms, need to shift left each lane of the bitcasted
  // vector by 16 bits.
  return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
#endif
}
 |  | 
// Default PromoteEvenTo/PromoteOddTo implementations
// Single-lane destination: lane 0 is the only even lane, so promoting the
// lower half suffices.
template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
          class V, HWY_IF_LANES_D(D, 1)>
HWY_INLINE VFromD<D> PromoteEvenTo(
    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    FromTypeTag /*from_type_tag*/, D d_to, V v) {
  return PromoteLowerTo(d_to, v);
}
 |  | 
 | template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D, | 
 |           class V, HWY_IF_LANES_GT_D(D, 1)> | 
 | HWY_INLINE VFromD<D> PromoteEvenTo( | 
 |     ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/, | 
 |     FromTypeTag /*from_type_tag*/, D d_to, V v) { | 
 |   const DFromV<decltype(v)> d; | 
 |   return PromoteLowerTo(d_to, ConcatEven(d, v, v)); | 
 | } | 
 |  | 
 | template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D, | 
 |           class V> | 
 | HWY_INLINE VFromD<D> PromoteOddTo( | 
 |     ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/, | 
 |     FromTypeTag /*from_type_tag*/, D d_to, V v) { | 
 |   const DFromV<decltype(v)> d; | 
 |   return PromoteLowerTo(d_to, ConcatOdd(d, v, v)); | 
 | } | 
 |  | 
 | }  // namespace detail | 
 |  | 
 | template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)), | 
 |           class V2 = VFromD<Repartition<TFromV<V>, D>>, | 
 |           HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))> | 
 | HWY_API VFromD<D> PromoteEvenTo(D d, V v) { | 
 |   return detail::PromoteEvenTo(hwy::TypeTag<TFromD<D>>(), | 
 |                                hwy::SizeTag<sizeof(TFromD<D>)>(), | 
 |                                hwy::TypeTag<TFromV<V>>(), d, v); | 
 | } | 
 |  | 
 | template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)), | 
 |           class V2 = VFromD<Repartition<TFromV<V>, D>>, | 
 |           HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))> | 
 | HWY_API VFromD<D> PromoteOddTo(D d, V v) { | 
 |   return detail::PromoteOddTo(hwy::TypeTag<TFromD<D>>(), | 
 |                               hwy::SizeTag<sizeof(TFromD<D>)>(), | 
 |                               hwy::TypeTag<TFromV<V>>(), d, v); | 
 | } | 
 | #endif  // HWY_TARGET != HWY_SCALAR | 
 |  | 
 | #ifdef HWY_INSIDE_END_NAMESPACE | 
 | #undef HWY_INSIDE_END_NAMESPACE | 
 | // NOLINTNEXTLINE(google-readability-namespace-comments) | 
 | }  // namespace HWY_NAMESPACE | 
 | }  // namespace hwy | 
 | HWY_AFTER_NAMESPACE(); | 
 | #endif |