third_party/highway/hwy/contrib/math/math-inl.h - aom - Git at Google

 // Copyright 2020 Google LLC
 // SPDX-License-Identifier: Apache-2.0
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Include guard (still compiled once per target)
 #if defined(HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_) == \
     defined(HWY_TARGET_TOGGLE)  // NOLINT
 #ifdef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
 #undef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
 #else
 #define HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
 #endif

 #include <stddef.h>
 #include <stdint.h>

 #include "third_party/highway/hwy/highway.h"

 HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {

 /**
  * Highway SIMD version of std::acos(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 2
  *      Valid Range: [-1, +1]
  * @return arc cosine of 'x'
  */
 template <class D, class V>
 HWY_INLINE V Acos(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallAcos(const D d, VecArg<V> x) {
   return Acos(d, x);
 }

 /**
  * Highway SIMD version of std::acosh(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 3
  *      Valid Range: float32[1, +FLT_MAX], float64[1, +DBL_MAX]
  * @return hyperbolic arc cosine of 'x'
  */
 template <class D, class V>
 HWY_INLINE V Acosh(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallAcosh(const D d, VecArg<V> x) {
   return Acosh(d, x);
 }

 /**
  * Highway SIMD version of std::asin(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 2
  *      Valid Range: [-1, +1]
  * @return arc sine of 'x'
  */
 template <class D, class V>
 HWY_INLINE V Asin(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallAsin(const D d, VecArg<V> x) {
   return Asin(d, x);
 }

 /**
  * Highway SIMD version of std::asinh(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 3
  *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
  * @return hyperbolic arc sine of 'x'
  */
 template <class D, class V>
 HWY_INLINE V Asinh(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallAsinh(const D d, VecArg<V> x) {
   return Asinh(d, x);
 }

 /**
  * Highway SIMD version of std::atan(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 3
  *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
  * @return arc tangent of 'x'
  */
 template <class D, class V>
 HWY_INLINE V Atan(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallAtan(const D d, VecArg<V> x) {
   return Atan(d, x);
 }

 /**
  * Highway SIMD version of std::atanh(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 3
  *      Valid Range: (-1, +1)
  * @return hyperbolic arc tangent of 'x'
  */
 template <class D, class V>
 HWY_INLINE V Atanh(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallAtanh(const D d, VecArg<V> x) {
   return Atanh(d, x);
 }

 // Atan2 was added later and some users may be implementing it themselves, so
 // notify them that this version of Highway defines it already.
 #ifndef HWY_HAVE_ATAN2
 #define HWY_HAVE_ATAN2 1
 #endif

 /**
  * Highway SIMD version of std::atan2(x).
  *
  * Valid Lane Types: float32, float64
  * Correctly handles negative zero, infinities, and NaN.
  * @return atan2 of 'y', 'x'
  */
 template <class D, class V = VFromD<D>, class M = MFromD<D>,
           typename T = TFromD<D>>
 HWY_INLINE V Atan2(const D d, V y, V x) {
   const V kHalf = Set(d, static_cast<T>(+0.5));
   const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264));
   const V kPi2 = Mul(kPi, kHalf);

   const V k0 = Zero(d);
   const M y_0 = Eq(y, k0);
   const M x_0 = Eq(x, k0);
   const M x_neg = Lt(x, k0);
   const M y_inf = IsInf(y);
   const M x_inf = IsInf(x);
   const M nan = Or(IsNaN(y), IsNaN(x));

   const V if_xneg_pi = IfThenElseZero(x_neg, kPi);
   // x= +inf: pi/4; -inf: 3*pi/4; else: pi/2
   const V if_yinf = Mul(kHalf, IfThenElse(x_inf, Add(kPi2, if_xneg_pi), kPi));

   V t = Atan(d, Div(y, x));
   // Disambiguate between quadrants 1/3 and 2/4 by adding (Q2: Pi; Q3: -Pi).
   t = Add(t, CopySignToAbs(if_xneg_pi, y));
   // Special cases for 0 and infinity:
   t = IfThenElse(x_inf, if_xneg_pi, t);
   t = IfThenElse(x_0, kPi2, t);
   t = IfThenElse(y_inf, if_yinf, t);
   t = IfThenElse(y_0, if_xneg_pi, t);
   // Any input NaN => NaN, otherwise fix sign.
   return IfThenElse(nan, NaN(d), CopySign(t, y));
 }
 template <class D, class V>
 HWY_NOINLINE V CallAtan2(const D d, VecArg<V> y, VecArg<V> x) {
   return Atan2(d, y, x);
 }

 /**
  * Highway SIMD version of std::cos(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 3
  *      Valid Range: [-39000, +39000]
  * @return cosine of 'x'
  */
 template <class D, class V>
 HWY_INLINE V Cos(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallCos(const D d, VecArg<V> x) {
   return Cos(d, x);
 }

 /**
  * Highway SIMD version of std::exp(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 1
  *      Valid Range: float32[-FLT_MAX, +104], float64[-DBL_MAX, +706]
  * @return e^x
  */
 template <class D, class V>
 HWY_INLINE V Exp(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallExp(const D d, VecArg<V> x) {
   return Exp(d, x);
 }

 /**
  * Highway SIMD version of std::exp2(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 2
  *      Valid Range: float32[-FLT_MAX, +128], float64[-DBL_MAX, +1024]
  * @return 2^x
  */
 template <class D, class V>
 HWY_INLINE V Exp2(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallExp2(const D d, VecArg<V> x) {
   return Exp2(d, x);
 }

 /**
  * Highway SIMD version of std::expm1(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 4
  *      Valid Range: float32[-FLT_MAX, +104], float64[-DBL_MAX, +706]
  * @return e^x - 1
  */
 template <class D, class V>
 HWY_INLINE V Expm1(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallExpm1(const D d, VecArg<V> x) {
   return Expm1(d, x);
 }

 /**
  * Highway SIMD version of std::log(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 4
  *      Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX]
  * @return natural logarithm of 'x'
  */
 template <class D, class V>
 HWY_INLINE V Log(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallLog(const D d, VecArg<V> x) {
   return Log(d, x);
 }

 /**
  * Highway SIMD version of std::log10(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 2
  *      Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX]
  * @return base 10 logarithm of 'x'
  */
 template <class D, class V>
 HWY_INLINE V Log10(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallLog10(const D d, VecArg<V> x) {
   return Log10(d, x);
 }

 /**
  * Highway SIMD version of std::log1p(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 2
  *      Valid Range: float32[0, +FLT_MAX], float64[0, +DBL_MAX]
  * @return log(1 + x)
  */
 template <class D, class V>
 HWY_INLINE V Log1p(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallLog1p(const D d, VecArg<V> x) {
   return Log1p(d, x);
 }

 /**
  * Highway SIMD version of std::log2(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 2
  *      Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX]
  * @return base 2 logarithm of 'x'
  */
 template <class D, class V>
 HWY_INLINE V Log2(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallLog2(const D d, VecArg<V> x) {
   return Log2(d, x);
 }

 /**
  * Highway SIMD version of std::sin(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 3
  *      Valid Range: [-39000, +39000]
  * @return sine of 'x'
  */
 template <class D, class V>
 HWY_INLINE V Sin(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallSin(const D d, VecArg<V> x) {
   return Sin(d, x);
 }

 /**
  * Highway SIMD version of std::sinh(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 4
  *      Valid Range: float32[-88.7228, +88.7228], float64[-709, +709]
  * @return hyperbolic sine of 'x'
  */
 template <class D, class V>
 HWY_INLINE V Sinh(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallSinh(const D d, VecArg<V> x) {
   return Sinh(d, x);
 }

 /**
  * Highway SIMD version of std::tanh(x).
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 4
  *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
  * @return hyperbolic tangent of 'x'
  */
 template <class D, class V>
 HWY_INLINE V Tanh(D d, V x);
 template <class D, class V>
 HWY_NOINLINE V CallTanh(const D d, VecArg<V> x) {
   return Tanh(d, x);
 }

 /**
  * Highway SIMD version of SinCos.
  * Compute the sine and cosine at the same time
  * The performance should be around the same as calling Sin.
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 1
  *      Valid Range: [-39000, +39000]
  * @return sine and cosine of 'x'
  */
 template <class D, class V>
 HWY_INLINE void SinCos(D d, V x, V& s, V& c);
 template <class D, class V>
 HWY_NOINLINE void CallSinCos(const D d, VecArg<V> x, V& s, V& c) {
   SinCos(d, x, s, c);
 }

 /**
  * Highway SIMD version of Hypot
  *
  * Valid Lane Types: float32, float64
  *        Max Error: ULP = 4
  *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
  * @return hypotenuse of a and b
  */
 template <class D, class V>
 HWY_INLINE V Hypot(D d, V a, V b);
 template <class D, class V>
 HWY_NOINLINE V CallHypot(const D d, VecArg<V> a, VecArg<V> b) {
   return Hypot(d, a, b);
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Implementation
 ////////////////////////////////////////////////////////////////////////////////
 namespace impl {

 // Estrin's Scheme is a faster method for evaluating large polynomials on
 // super scalar architectures. It works by factoring the Horner's Method
 // polynomial into power of two sub-trees that can be evaluated in parallel.
 // Wikipedia Link: https://en.wikipedia.org/wiki/Estrin%27s_scheme
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1) {
   return MulAdd(c1, x, c0);
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2) {
   T x2 = Mul(x, x);
   return MulAdd(x2, c2, MulAdd(c1, x, c0));
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3) {
   T x2 = Mul(x, x);
   return MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0));
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4) {
   T x2 = Mul(x, x);
   T x4 = Mul(x2, x2);
   return MulAdd(x4, c4, MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5) {
   T x2 = Mul(x, x);
   T x4 = Mul(x2, x2);
   return MulAdd(x4, MulAdd(c5, x, c4),
                 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                      T c6) {
   T x2 = Mul(x, x);
   T x4 = Mul(x2, x2);
   return MulAdd(x4, MulAdd(x2, c6, MulAdd(c5, x, c4)),
                 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                      T c6, T c7) {
   T x2 = Mul(x, x);
   T x4 = Mul(x2, x2);
   return MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
                 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                      T c6, T c7, T c8) {
   T x2 = Mul(x, x);
   T x4 = Mul(x2, x2);
   T x8 = Mul(x4, x4);
   return MulAdd(x8, c8,
                 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
                        MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                      T c6, T c7, T c8, T c9) {
   T x2 = Mul(x, x);
   T x4 = Mul(x2, x2);
   T x8 = Mul(x4, x4);
   return MulAdd(x8, MulAdd(c9, x, c8),
                 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
                        MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                      T c6, T c7, T c8, T c9, T c10) {
   T x2 = Mul(x, x);
   T x4 = Mul(x2, x2);
   T x8 = Mul(x4, x4);
   return MulAdd(x8, MulAdd(x2, c10, MulAdd(c9, x, c8)),
                 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
                        MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                      T c6, T c7, T c8, T c9, T c10, T c11) {
   T x2 = Mul(x, x);
   T x4 = Mul(x2, x2);
   T x8 = Mul(x4, x4);
   return MulAdd(x8, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8)),
                 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
                        MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                      T c6, T c7, T c8, T c9, T c10, T c11,
                                      T c12) {
   T x2 = Mul(x, x);
   T x4 = Mul(x2, x2);
   T x8 = Mul(x4, x4);
   return MulAdd(
       x8, MulAdd(x4, c12, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
       MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
              MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                      T c6, T c7, T c8, T c9, T c10, T c11,
                                      T c12, T c13) {
   T x2 = Mul(x, x);
   T x4 = Mul(x2, x2);
   T x8 = Mul(x4, x4);
   return MulAdd(x8,
                 MulAdd(x4, MulAdd(c13, x, c12),
                        MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
                 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
                        MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                      T c6, T c7, T c8, T c9, T c10, T c11,
                                      T c12, T c13, T c14) {
   T x2 = Mul(x, x);
   T x4 = Mul(x2, x2);
   T x8 = Mul(x4, x4);
   return MulAdd(x8,
                 MulAdd(x4, MulAdd(x2, c14, MulAdd(c13, x, c12)),
                        MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
                 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
                        MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                      T c6, T c7, T c8, T c9, T c10, T c11,
                                      T c12, T c13, T c14, T c15) {
   T x2 = Mul(x, x);
   T x4 = Mul(x2, x2);
   T x8 = Mul(x4, x4);
   return MulAdd(x8,
                 MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
                        MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
                 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
                        MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                      T c6, T c7, T c8, T c9, T c10, T c11,
                                      T c12, T c13, T c14, T c15, T c16) {
   T x2 = Mul(x, x);
   T x4 = Mul(x2, x2);
   T x8 = Mul(x4, x4);
   T x16 = Mul(x8, x8);
   return MulAdd(
       x16, c16,
       MulAdd(x8,
              MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
                     MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
              MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
                     MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                      T c6, T c7, T c8, T c9, T c10, T c11,
                                      T c12, T c13, T c14, T c15, T c16, T c17) {
   T x2 = Mul(x, x);
   T x4 = Mul(x2, x2);
   T x8 = Mul(x4, x4);
   T x16 = Mul(x8, x8);
   return MulAdd(
       x16, MulAdd(c17, x, c16),
       MulAdd(x8,
              MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
                     MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
              MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
                     MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
 }
 template <class T>
 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                      T c6, T c7, T c8, T c9, T c10, T c11,
                                      T c12, T c13, T c14, T c15, T c16, T c17,
                                      T c18) {
   T x2 = Mul(x, x);
   T x4 = Mul(x2, x2);
   T x8 = Mul(x4, x4);
   T x16 = Mul(x8, x8);
   return MulAdd(
       x16, MulAdd(x2, c18, MulAdd(c17, x, c16)),
       MulAdd(x8,
              MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
                     MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
              MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
                     MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
 }

 template <class FloatOrDouble>
 struct AsinImpl {};
 template <class FloatOrDouble>
 struct AtanImpl {};
 template <class FloatOrDouble>
 struct CosSinImpl {};
 template <class FloatOrDouble>
 struct ExpImpl {};
 template <class FloatOrDouble>
 struct LogImpl {};
 template <class FloatOrDouble>
 struct SinCosImpl {};

 template <>
 struct AsinImpl<float> {
   // Polynomial approximation for asin(x) over the range [0, 0.5).
   template <class D, class V>
   HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) {
     const auto k0 = Set(d, +0.1666677296f);
     const auto k1 = Set(d, +0.07495029271f);
     const auto k2 = Set(d, +0.04547423869f);
     const auto k3 = Set(d, +0.02424046025f);
     const auto k4 = Set(d, +0.04197454825f);

     return Estrin(x2, k0, k1, k2, k3, k4);
   }
 };

 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64

 template <>
 struct AsinImpl<double> {
   // Polynomial approximation for asin(x) over the range [0, 0.5).
   template <class D, class V>
   HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) {
     const auto k0 = Set(d, +0.1666666666666497543);
     const auto k1 = Set(d, +0.07500000000378581611);
     const auto k2 = Set(d, +0.04464285681377102438);
     const auto k3 = Set(d, +0.03038195928038132237);
     const auto k4 = Set(d, +0.02237176181932048341);
     const auto k5 = Set(d, +0.01735956991223614604);
     const auto k6 = Set(d, +0.01388715184501609218);
     const auto k7 = Set(d, +0.01215360525577377331);
     const auto k8 = Set(d, +0.006606077476277170610);
     const auto k9 = Set(d, +0.01929045477267910674);
     const auto k10 = Set(d, -0.01581918243329996643);
     const auto k11 = Set(d, +0.03161587650653934628);

     return Estrin(x2, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11);
   }
 };

 #endif

 template <>
 struct AtanImpl<float> {
   // Polynomial approximation for atan(x) over the range [0, 1.0).
   template <class D, class V>
   HWY_INLINE V AtanPoly(D d, V x) {
     const auto k0 = Set(d, -0.333331018686294555664062f);
     const auto k1 = Set(d, +0.199926957488059997558594f);
     const auto k2 = Set(d, -0.142027363181114196777344f);
     const auto k3 = Set(d, +0.106347933411598205566406f);
     const auto k4 = Set(d, -0.0748900920152664184570312f);
     const auto k5 = Set(d, +0.0425049886107444763183594f);
     const auto k6 = Set(d, -0.0159569028764963150024414f);
     const auto k7 = Set(d, +0.00282363896258175373077393f);

     const auto y = Mul(x, x);
     return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7), Mul(y, x), x);
   }
 };

 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64

 template <>
 struct AtanImpl<double> {
   // Polynomial approximation for atan(x) over the range [0, 1.0).
   template <class D, class V>
   HWY_INLINE V AtanPoly(D d, V x) {
     const auto k0 = Set(d, -0.333333333333311110369124);
     const auto k1 = Set(d, +0.199999999996591265594148);
     const auto k2 = Set(d, -0.14285714266771329383765);
     const auto k3 = Set(d, +0.111111105648261418443745);
     const auto k4 = Set(d, -0.090908995008245008229153);
     const auto k5 = Set(d, +0.0769219538311769618355029);
     const auto k6 = Set(d, -0.0666573579361080525984562);
     const auto k7 = Set(d, +0.0587666392926673580854313);
     const auto k8 = Set(d, -0.0523674852303482457616113);
     const auto k9 = Set(d, +0.0466667150077840625632675);
     const auto k10 = Set(d, -0.0407629191276836500001934);
     const auto k11 = Set(d, +0.0337852580001353069993897);
     const auto k12 = Set(d, -0.0254517624932312641616861);
     const auto k13 = Set(d, +0.016599329773529201970117);
     const auto k14 = Set(d, -0.00889896195887655491740809);
     const auto k15 = Set(d, +0.00370026744188713119232403);
     const auto k16 = Set(d, -0.00110611831486672482563471);
     const auto k17 = Set(d, +0.000209850076645816976906797);
     const auto k18 = Set(d, -1.88796008463073496563746e-5);

     const auto y = Mul(x, x);
     return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11,
                          k12, k13, k14, k15, k16, k17, k18),
                   Mul(y, x), x);
   }
 };

 #endif

 template <>
 struct CosSinImpl<float> {
   // Rounds float toward zero and returns as int32_t.
   template <class D, class V>
   HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
     return ConvertTo(Rebind<int32_t, D>(), x);
   }

   template <class D, class V>
   HWY_INLINE V Poly(D d, V x) {
     const auto k0 = Set(d, -1.66666597127914428710938e-1f);
     const auto k1 = Set(d, +8.33307858556509017944336e-3f);
     const auto k2 = Set(d, -1.981069071916863322258e-4f);
     const auto k3 = Set(d, +2.6083159809786593541503e-6f);

     const auto y = Mul(x, x);
     return MulAdd(Estrin(y, k0, k1, k2, k3), Mul(y, x), x);
   }

   template <class D, class V, class VI32>
   HWY_INLINE V CosReduce(D d, V x, VI32 q) {
     // kHalfPiPart0f + kHalfPiPart1f + kHalfPiPart2f + kHalfPiPart3f ~= -pi/2
     const V kHalfPiPart0f = Set(d, -0.5f * 3.140625f);
     const V kHalfPiPart1f = Set(d, -0.5f * 0.0009670257568359375f);
     const V kHalfPiPart2f = Set(d, -0.5f * 6.2771141529083251953e-7f);
     const V kHalfPiPart3f = Set(d, -0.5f * 1.2154201256553420762e-10f);

     // Extended precision modular arithmetic.
     const V qf = ConvertTo(d, q);
     x = MulAdd(qf, kHalfPiPart0f, x);
     x = MulAdd(qf, kHalfPiPart1f, x);
     x = MulAdd(qf, kHalfPiPart2f, x);
     x = MulAdd(qf, kHalfPiPart3f, x);
     return x;
   }

   template <class D, class V, class VI32>
   HWY_INLINE V SinReduce(D d, V x, VI32 q) {
     // kPiPart0f + kPiPart1f + kPiPart2f + kPiPart3f ~= -pi
     const V kPiPart0f = Set(d, -3.140625f);
     const V kPiPart1f = Set(d, -0.0009670257568359375f);
     const V kPiPart2f = Set(d, -6.2771141529083251953e-7f);
     const V kPiPart3f = Set(d, -1.2154201256553420762e-10f);

     // Extended precision modular arithmetic.
     const V qf = ConvertTo(d, q);
     x = MulAdd(qf, kPiPart0f, x);
     x = MulAdd(qf, kPiPart1f, x);
     x = MulAdd(qf, kPiPart2f, x);
     x = MulAdd(qf, kPiPart3f, x);
     return x;
   }

   // (q & 2) == 0 ? -0.0 : +0.0
   template <class D, class VI32>
   HWY_INLINE Vec<Rebind<float, D>> CosSignFromQuadrant(D d, VI32 q) {
     const VI32 kTwo = Set(Rebind<int32_t, D>(), 2);
     return BitCast(d, ShiftLeft<30>(AndNot(q, kTwo)));
   }

   // ((q & 1) ? -0.0 : +0.0)
   template <class D, class VI32>
   HWY_INLINE Vec<Rebind<float, D>> SinSignFromQuadrant(D d, VI32 q) {
     const VI32 kOne = Set(Rebind<int32_t, D>(), 1);
     return BitCast(d, ShiftLeft<31>(And(q, kOne)));
   }
 };

 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64

 template <>
 struct CosSinImpl<double> {
   // Rounds double toward zero and returns as int32_t.
   template <class D, class V>
   HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
     return DemoteTo(Rebind<int32_t, D>(), x);
   }

   template <class D, class V>
   HWY_INLINE V Poly(D d, V x) {
     const auto k0 = Set(d, -0.166666666666666657414808);
     const auto k1 = Set(d, +0.00833333333333332974823815);
     const auto k2 = Set(d, -0.000198412698412696162806809);
     const auto k3 = Set(d, +2.75573192239198747630416e-6);
     const auto k4 = Set(d, -2.50521083763502045810755e-8);
     const auto k5 = Set(d, +1.60590430605664501629054e-10);
     const auto k6 = Set(d, -7.64712219118158833288484e-13);
     const auto k7 = Set(d, +2.81009972710863200091251e-15);
     const auto k8 = Set(d, -7.97255955009037868891952e-18);

     const auto y = Mul(x, x);
     return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8), Mul(y, x), x);
   }

   template <class D, class V, class VI32>
   HWY_INLINE V CosReduce(D d, V x, VI32 q) {
     // kHalfPiPart0d + kHalfPiPart1d + kHalfPiPart2d + kHalfPiPart3d ~= -pi/2
     const V kHalfPiPart0d = Set(d, -0.5 * 3.1415926218032836914);
     const V kHalfPiPart1d = Set(d, -0.5 * 3.1786509424591713469e-8);
     const V kHalfPiPart2d = Set(d, -0.5 * 1.2246467864107188502e-16);
     const V kHalfPiPart3d = Set(d, -0.5 * 1.2736634327021899816e-24);

     // Extended precision modular arithmetic.
     const V qf = PromoteTo(d, q);
     x = MulAdd(qf, kHalfPiPart0d, x);
     x = MulAdd(qf, kHalfPiPart1d, x);
     x = MulAdd(qf, kHalfPiPart2d, x);
     x = MulAdd(qf, kHalfPiPart3d, x);
     return x;
   }

   template <class D, class V, class VI32>
   HWY_INLINE V SinReduce(D d, V x, VI32 q) {
     // kPiPart0d + kPiPart1d + kPiPart2d + kPiPart3d ~= -pi
     const V kPiPart0d = Set(d, -3.1415926218032836914);
     const V kPiPart1d = Set(d, -3.1786509424591713469e-8);
     const V kPiPart2d = Set(d, -1.2246467864107188502e-16);
     const V kPiPart3d = Set(d, -1.2736634327021899816e-24);

     // Extended precision modular arithmetic.
     const V qf = PromoteTo(d, q);
     x = MulAdd(qf, kPiPart0d, x);
     x = MulAdd(qf, kPiPart1d, x);
     x = MulAdd(qf, kPiPart2d, x);
     x = MulAdd(qf, kPiPart3d, x);
     return x;
   }

   // (q & 2) == 0 ? -0.0 : +0.0
   template <class D, class VI32>
   HWY_INLINE Vec<Rebind<double, D>> CosSignFromQuadrant(D d, VI32 q) {
     const VI32 kTwo = Set(Rebind<int32_t, D>(), 2);
     return BitCast(
         d, ShiftLeft<62>(PromoteTo(Rebind<int64_t, D>(), AndNot(q, kTwo))));
   }

   // ((q & 1) ? -0.0 : +0.0)
   template <class D, class VI32>
   HWY_INLINE Vec<Rebind<double, D>> SinSignFromQuadrant(D d, VI32 q) {
     const VI32 kOne = Set(Rebind<int32_t, D>(), 1);
     return BitCast(
         d, ShiftLeft<63>(PromoteTo(Rebind<int64_t, D>(), And(q, kOne))));
   }
 };

 #endif

 template <>
 struct ExpImpl<float> {
   // Rounds float toward zero and returns as int32_t.
   template <class D, class V>
   HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
     return ConvertTo(Rebind<int32_t, D>(), x);
   }

   // Rounds float to nearest int32_t
   template <class D, class V>
   HWY_INLINE Vec<Rebind<int32_t, D>> ToNearestInt32(D /*unused*/, V x) {
     return NearestInt(x);
   }

   template <class D, class V>
   HWY_INLINE V ExpPoly(D d, V x) {
     const auto k0 = Set(d, +0.5f);
     const auto k1 = Set(d, +0.166666671633720397949219f);
     const auto k2 = Set(d, +0.0416664853692054748535156f);
     const auto k3 = Set(d, +0.00833336077630519866943359f);
     const auto k4 = Set(d, +0.00139304355252534151077271f);
     const auto k5 = Set(d, +0.000198527617612853646278381f);

     return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5), Mul(x, x), x);
   }

   // Computes 2^x, where x is an integer.
   template <class D, class VI32>
   HWY_INLINE Vec<D> Pow2I(D d, VI32 x) {
     const Rebind<int32_t, D> di32;
     const VI32 kOffset = Set(di32, 0x7F);
     return BitCast(d, ShiftLeft<23>(Add(x, kOffset)));
   }

   // Sets the exponent of 'x' to 2^e.
   template <class D, class V, class VI32>
   HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
     const VI32 y = ShiftRight<1>(e);
     return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
   }

   template <class D, class V, class VI32>
   HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
     // kLn2Part0f + kLn2Part1f ~= -ln(2)
     const V kLn2Part0f = Set(d, -0.693145751953125f);
     const V kLn2Part1f = Set(d, -1.428606765330187045e-6f);

     // Extended precision modular arithmetic.
     const V qf = ConvertTo(d, q);
     x = MulAdd(qf, kLn2Part0f, x);
     x = MulAdd(qf, kLn2Part1f, x);
     return x;
   }

   template <class D, class V, class VI32>
   HWY_INLINE V Exp2Reduce(D d, V x, VI32 q) {
     const V x_frac = Sub(x, ConvertTo(d, q));
     return MulAdd(x_frac, Set(d, 0.193147182464599609375f),
                   Mul(x_frac, Set(d, 0.5f)));
   }
 };

 template <>
 struct LogImpl<float> {
   template <class D, class V>
   HWY_INLINE Vec<Rebind<int32_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
     const Rebind<int32_t, D> di32;
     const Rebind<uint32_t, D> du32;
     const auto kBias = Set(di32, 0x7F);
     return Sub(BitCast(di32, ShiftRight<23>(BitCast(du32, x))), kBias);
   }

   // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
   template <class D, class V>
   HWY_INLINE V LogPoly(D d, V x) {
     const V k0 = Set(d, 0.66666662693f);
     const V k1 = Set(d, 0.40000972152f);
     const V k2 = Set(d, 0.28498786688f);
     const V k3 = Set(d, 0.24279078841f);

     const V x2 = Mul(x, x);
     const V x4 = Mul(x2, x2);
     return MulAdd(MulAdd(k2, x4, k0), x2, Mul(MulAdd(k3, x4, k1), x4));
   }
 };

 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
 template <>
 struct ExpImpl<double> {
   // Rounds double toward zero and returns as int32_t.
   template <class D, class V>
   HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
     return DemoteTo(Rebind<int32_t, D>(), x);
   }

   // Rounds double to nearest int32_t
   template <class D, class V>
   HWY_INLINE Vec<Rebind<int32_t, D>> ToNearestInt32(D /*unused*/, V x) {
     return DemoteToNearestInt(Rebind<int32_t, D>(), x);
   }

   template <class D, class V>
   HWY_INLINE V ExpPoly(D d, V x) {
     const auto k0 = Set(d, +0.5);
     const auto k1 = Set(d, +0.166666666666666851703837);
     const auto k2 = Set(d, +0.0416666666666665047591422);
     const auto k3 = Set(d, +0.00833333333331652721664984);
     const auto k4 = Set(d, +0.00138888888889774492207962);
     const auto k5 = Set(d, +0.000198412698960509205564975);
     const auto k6 = Set(d, +2.4801587159235472998791e-5);
     const auto k7 = Set(d, +2.75572362911928827629423e-6);
     const auto k8 = Set(d, +2.75573911234900471893338e-7);
     const auto k9 = Set(d, +2.51112930892876518610661e-8);
     const auto k10 = Set(d, +2.08860621107283687536341e-9);

     return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10),
                   Mul(x, x), x);
   }

   // Computes 2^x, where x is an integer.
   template <class D, class VI32>
   HWY_INLINE Vec<D> Pow2I(D d, VI32 x) {
     const Rebind<int32_t, D> di32;
     const Rebind<int64_t, D> di64;
     const VI32 kOffset = Set(di32, 0x3FF);
     return BitCast(d, ShiftLeft<52>(PromoteTo(di64, Add(x, kOffset))));
   }

   // Sets the exponent of 'x' to 2^e.
   template <class D, class V, class VI32>
   HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
     const VI32 y = ShiftRight<1>(e);
     return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
   }

   template <class D, class V, class VI32>
   HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
     // kLn2Part0d + kLn2Part1d ~= -ln(2)
     const V kLn2Part0d = Set(d, -0.6931471805596629565116018);
     const V kLn2Part1d = Set(d, -0.28235290563031577122588448175e-12);

     // Extended precision modular arithmetic.
     const V qf = PromoteTo(d, q);
     x = MulAdd(qf, kLn2Part0d, x);
     x = MulAdd(qf, kLn2Part1d, x);
     return x;
   }

   template <class D, class V, class VI32>
   HWY_INLINE V Exp2Reduce(D d, V x, VI32 q) {
     const V x_frac = Sub(x, PromoteTo(d, q));
     return MulAdd(x_frac, Set(d, 0.1931471805599453139823396),
                   Mul(x_frac, Set(d, 0.5)));
   }
 };

 template <>
 struct LogImpl<double> {
   template <class D, class V>
   HWY_INLINE Vec<Rebind<int64_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
     const Rebind<int64_t, D> di64;
     const Rebind<uint64_t, D> du64;
     return Sub(BitCast(di64, ShiftRight<52>(BitCast(du64, x))),
                Set(di64, 0x3FF));
   }

   // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
   template <class D, class V>
   HWY_INLINE V LogPoly(D d, V x) {
     const V k0 = Set(d, 0.6666666666666735130);
     const V k1 = Set(d, 0.3999999999940941908);
     const V k2 = Set(d, 0.2857142874366239149);
     const V k3 = Set(d, 0.2222219843214978396);
     const V k4 = Set(d, 0.1818357216161805012);
     const V k5 = Set(d, 0.1531383769920937332);
     const V k6 = Set(d, 0.1479819860511658591);

     const V x2 = Mul(x, x);
     const V x4 = Mul(x2, x2);
     return MulAdd(MulAdd(MulAdd(MulAdd(k6, x4, k4), x4, k2), x4, k0), x2,
                   (Mul(MulAdd(MulAdd(k5, x4, k3), x4, k1), x4)));
   }
 };

 #endif

 template <class D, class V, bool kAllowSubnormals = true>
 HWY_INLINE V Log(const D d, V x) {
   // http://git.musl-libc.org/cgit/musl/tree/src/math/log.c for more info.
   using T = TFromD<D>;
   impl::LogImpl<T> impl;

   constexpr bool kIsF32 = (sizeof(T) == 4);

   // Float Constants
   const V kLn2Hi = Set(d, kIsF32 ? static_cast<T>(0.69313812256f)
                                  : static_cast<T>(0.693147180369123816490));
   const V kLn2Lo = Set(d, kIsF32 ? static_cast<T>(9.0580006145e-6f)
                                  : static_cast<T>(1.90821492927058770002e-10));
   const V kOne = Set(d, static_cast<T>(+1.0));
   const V kMinNormal = Set(d, kIsF32 ? static_cast<T>(1.175494351e-38f)
                                      : static_cast<T>(2.2250738585072014e-308));
   const V kScale = Set(d, kIsF32 ? static_cast<T>(3.355443200e+7f)
                                  : static_cast<T>(1.8014398509481984e+16));

   // Integer Constants
   using TI = MakeSigned<T>;
   const Rebind<TI, D> di;
   using VI = decltype(Zero(di));
   const VI kLowerBits = Set(di, kIsF32 ? static_cast<TI>(0x00000000L)
                                        : static_cast<TI>(0xFFFFFFFFLL));
   const VI kMagic = Set(di, kIsF32 ? static_cast<TI>(0x3F3504F3L)
                                    : static_cast<TI>(0x3FE6A09E00000000LL));
   const VI kExpMask = Set(di, kIsF32 ? static_cast<TI>(0x3F800000L)
                                      : static_cast<TI>(0x3FF0000000000000LL));
   const VI kExpScale =
       Set(di, kIsF32 ? static_cast<TI>(-25) : static_cast<TI>(-54));
   const VI kManMask = Set(di, kIsF32 ? static_cast<TI>(0x7FFFFFL)
                                      : static_cast<TI>(0xFFFFF00000000LL));

   // Scale up 'x' so that it is no longer denormalized.
   VI exp_bits;
   V exp;
   if (kAllowSubnormals == true) {
     const auto is_denormal = Lt(x, kMinNormal);
     x = IfThenElse(is_denormal, Mul(x, kScale), x);

     // Compute the new exponent.
     exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
     const VI exp_scale =
         BitCast(di, IfThenElseZero(is_denormal, BitCast(d, kExpScale)));
     exp = ConvertTo(
         d, Add(exp_scale, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits))));
   } else {
     // Compute the new exponent.
     exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
     exp = ConvertTo(d, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits)));
   }

   // Renormalize.
   const V y = Or(And(x, BitCast(d, kLowerBits)),
                  BitCast(d, Add(And(exp_bits, kManMask), kMagic)));

   // Approximate and reconstruct.
   const V ym1 = Sub(y, kOne);
   const V z = Div(ym1, Add(y, kOne));

   return MulSub(
       exp, kLn2Hi,
       Sub(MulSub(z, Sub(ym1, impl.LogPoly(d, z)), Mul(exp, kLn2Lo)), ym1));
 }

 // SinCos
 // Based on "sse_mathfun.h", by Julien Pommier
 // http://gruntthepeon.free.fr/ssemath/

 // Third degree poly
 template <class D, class V>
 HWY_INLINE void SinCos3(D d, TFromD<D> dp1, TFromD<D> dp2, TFromD<D> dp3, V x,
                         V& s, V& c) {
   using T = TFromD<D>;
   using TI = MakeSigned<T>;
   using DI = Rebind<TI, D>;
   const DI di;
   using VI = decltype(Zero(di));
   using M = Mask<D>;

   static constexpr size_t bits = sizeof(TI) * 8;
   const VI sign_mask = SignBit(di);
   const VI ci_0 = Zero(di);
   const VI ci_1 = Set(di, 1);
   const VI ci_2 = Set(di, 2);
   const VI ci_4 = Set(di, 4);
   const V cos_p0 = Set(d, ConvertScalarTo<T>(2.443315711809948E-005));
   const V cos_p1 = Set(d, ConvertScalarTo<T>(-1.388731625493765E-003));
   const V cos_p2 = Set(d, ConvertScalarTo<T>(4.166664568298827E-002));
   const V sin_p0 = Set(d, ConvertScalarTo<T>(-1.9515295891E-4));
   const V sin_p1 = Set(d, ConvertScalarTo<T>(8.3321608736E-3));
   const V sin_p2 = Set(d, ConvertScalarTo<T>(-1.6666654611E-1));
   const V FOPI = Set(d, ConvertScalarTo<T>(1.27323954473516));  // 4 / M_PI
   const V DP1 = Set(d, dp1);
   const V DP2 = Set(d, dp2);
   const V DP3 = Set(d, dp3);

   V xmm1, xmm2, sign_bit_sin, y;
   VI imm0, imm2, imm4;

   sign_bit_sin = x;
   x = Abs(x);

   /* extract the sign bit (upper one) */
   sign_bit_sin = And(sign_bit_sin, BitCast(d, sign_mask));

   /* scale by 4/Pi */
   y = Mul(x, FOPI);

   /* store the integer part of y in imm2 */
   imm2 = ConvertTo(di, y);

   /* j=(j+1) & (~1) (see the cephes sources) */
   imm2 = Add(imm2, ci_1);
   imm2 = AndNot(ci_1, imm2);

   y = ConvertTo(d, imm2);
   imm4 = imm2;

   /* get the swap sign flag for the sine */
   imm0 = And(imm2, ci_4);
   imm0 = ShiftLeft<bits - 3>(imm0);

   V swap_sign_bit_sin = BitCast(d, imm0);

   /* get the polynomial selection mask for the sine*/
   imm2 = And(imm2, ci_2);
   M poly_mask = RebindMask(d, Eq(imm2, ci_0));

   /* The magic pass: "Extended precision modular arithmetic"
   x = ((x - y * DP1) - y * DP2) - y * DP3; */
   x = MulAdd(y, DP1, x);
   x = MulAdd(y, DP2, x);
   x = MulAdd(y, DP3, x);

   imm4 = Sub(imm4, ci_2);
   imm4 = AndNot(imm4, ci_4);
   imm4 = ShiftLeft<bits - 3>(imm4);

   V sign_bit_cos = BitCast(d, imm4);

   sign_bit_sin = Xor(sign_bit_sin, swap_sign_bit_sin);

   /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
   V z = Mul(x, x);

   y = MulAdd(cos_p0, z, cos_p1);
   y = MulAdd(y, z, cos_p2);
   y = Mul(y, z);
   y = Mul(y, z);
   y = NegMulAdd(z, Set(d, 0.5f), y);
   y = Add(y, Set(d, 1));

   /* Evaluate the second polynomial  (Pi/4 <= x <= 0) */
   V y2 = MulAdd(sin_p0, z, sin_p1);
   y2 = MulAdd(y2, z, sin_p2);
   y2 = Mul(y2, z);
   y2 = MulAdd(y2, x, x);

   /* select the correct result from the two polynomials */
   xmm1 = IfThenElse(poly_mask, y2, y);
   xmm2 = IfThenElse(poly_mask, y, y2);

   /* update the sign */
   s = Xor(xmm1, sign_bit_sin);
   c = Xor(xmm2, sign_bit_cos);
 }

 // Sixth degree poly
 template <class D, class V>
 HWY_INLINE void SinCos6(D d, TFromD<D> dp1, TFromD<D> dp2, TFromD<D> dp3, V x,
                         V& s, V& c) {
   using T = TFromD<D>;
   using TI = MakeSigned<T>;
   using DI = Rebind<TI, D>;
   const DI di;
   using VI = decltype(Zero(di));
   using M = Mask<D>;

   static constexpr size_t bits = sizeof(TI) * 8;
   const VI sign_mask = SignBit(di);
   const VI ci_0 = Zero(di);
   const VI ci_1 = Set(di, 1);
   const VI ci_2 = Set(di, 2);
   const VI ci_4 = Set(di, 4);
   const V cos_p0 = Set(d, ConvertScalarTo<T>(-1.13585365213876817300E-11));
   const V cos_p1 = Set(d, ConvertScalarTo<T>(2.08757008419747316778E-9));
   const V cos_p2 = Set(d, ConvertScalarTo<T>(-2.75573141792967388112E-7));
   const V cos_p3 = Set(d, ConvertScalarTo<T>(2.48015872888517045348E-5));
   const V cos_p4 = Set(d, ConvertScalarTo<T>(-1.38888888888730564116E-3));
   const V cos_p5 = Set(d, ConvertScalarTo<T>(4.16666666666665929218E-2));
   const V sin_p0 = Set(d, ConvertScalarTo<T>(1.58962301576546568060E-10));
   const V sin_p1 = Set(d, ConvertScalarTo<T>(-2.50507477628578072866E-8));
   const V sin_p2 = Set(d, ConvertScalarTo<T>(2.75573136213857245213E-6));
   const V sin_p3 = Set(d, ConvertScalarTo<T>(-1.98412698295895385996E-4));
   const V sin_p4 = Set(d, ConvertScalarTo<T>(8.33333333332211858878E-3));
   const V sin_p5 = Set(d, ConvertScalarTo<T>(-1.66666666666666307295E-1));
   const V FOPI =  // 4 / M_PI
       Set(d, ConvertScalarTo<T>(1.2732395447351626861510701069801148));
   const V DP1 = Set(d, dp1);
   const V DP2 = Set(d, dp2);
   const V DP3 = Set(d, dp3);

   V xmm1, xmm2, sign_bit_sin, y;
   VI imm0, imm2, imm4;

   sign_bit_sin = x;
   x = Abs(x);

   /* extract the sign bit (upper one) */
   sign_bit_sin = And(sign_bit_sin, BitCast(d, sign_mask));

   /* scale by 4/Pi */
   y = Mul(x, FOPI);

   /* store the integer part of y in imm2 */
   imm2 = ConvertTo(di, y);

   /* j=(j+1) & (~1) (see the cephes sources) */
   imm2 = Add(imm2, ci_1);
   imm2 = AndNot(ci_1, imm2);

   y = ConvertTo(d, imm2);
   imm4 = imm2;

   /* get the swap sign flag for the sine */
   imm0 = And(imm2, ci_4);
   imm0 = ShiftLeft<bits - 3>(imm0);

   V swap_sign_bit_sin = BitCast(d, imm0);

   /* get the polynomial selection mask for the sine*/
   imm2 = And(imm2, ci_2);
   M poly_mask = RebindMask(d, Eq(imm2, ci_0));

   /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
   x = MulAdd(y, DP1, x);
   x = MulAdd(y, DP2, x);
   x = MulAdd(y, DP3, x);

   imm4 = Sub(imm4, ci_2);
   imm4 = AndNot(imm4, ci_4);
   imm4 = ShiftLeft<bits - 3>(imm4);

   V sign_bit_cos = BitCast(d, imm4);
   sign_bit_sin = Xor(sign_bit_sin, swap_sign_bit_sin);

   /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
   V z = Mul(x, x);

   y = MulAdd(cos_p0, z, cos_p1);
   y = MulAdd(y, z, cos_p2);
   y = MulAdd(y, z, cos_p3);
   y = MulAdd(y, z, cos_p4);
   y = MulAdd(y, z, cos_p5);
   y = Mul(y, z);
   y = Mul(y, z);
   y = NegMulAdd(z, Set(d, 0.5f), y);
   y = Add(y, Set(d, 1.0f));

   /* Evaluate the second polynomial  (Pi/4 <= x <= 0) */
   V y2 = MulAdd(sin_p0, z, sin_p1);
   y2 = MulAdd(y2, z, sin_p2);
   y2 = MulAdd(y2, z, sin_p3);
   y2 = MulAdd(y2, z, sin_p4);
   y2 = MulAdd(y2, z, sin_p5);
   y2 = Mul(y2, z);
   y2 = MulAdd(y2, x, x);

   /* select the correct result from the two polynomials */
   xmm1 = IfThenElse(poly_mask, y2, y);
   xmm2 = IfThenElse(poly_mask, y, y2);

   /* update the sign */
   s = Xor(xmm1, sign_bit_sin);
   c = Xor(xmm2, sign_bit_cos);
 }

 template <>
 struct SinCosImpl<float> {
   template <class D, class V>
   HWY_INLINE void SinCos(D d, V x, V& s, V& c) {
     SinCos3(d, -0.78515625f, -2.4187564849853515625e-4f,
             -3.77489497744594108e-8f, x, s, c);
   }
 };

 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
 template <>
 struct SinCosImpl<double> {
   template <class D, class V>
   HWY_INLINE void SinCos(D d, V x, V& s, V& c) {
     SinCos6(d, -7.85398125648498535156E-1, -3.77489470793079817668E-8,
             -2.69515142907905952645E-15, x, s, c);
   }
 };
 #endif

 }  // namespace impl

 template <class D, class V>
 HWY_INLINE V Acos(const D d, V x) {
   using T = TFromD<D>;

   const V kZero = Zero(d);
   const V kHalf = Set(d, static_cast<T>(+0.5));
   const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264));
   const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));

   const V sign_x = And(SignBit(d), x);
   const V abs_x = Xor(x, sign_x);
   const auto mask = Lt(abs_x, kHalf);
   const V yy =
       IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
   const V y = IfThenElse(mask, abs_x, Sqrt(yy));

   impl::AsinImpl<T> impl;
   const V t = Mul(impl.AsinPoly(d, yy, y), Mul(y, yy));

   const V t_plus_y = Add(t, y);
   const V z =
       IfThenElse(mask, Sub(kPiOverTwo, Add(Xor(y, sign_x), Xor(t, sign_x))),
                  Add(t_plus_y, t_plus_y));
   return IfThenElse(Or(mask, Ge(x, kZero)), z, Sub(kPi, z));
 }

 template <class D, class V>
 HWY_INLINE V Acosh(const D d, V x) {
   using T = TFromD<D>;

   const V kLarge = Set(d, static_cast<T>(268435456.0));
   const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
   const V kOne = Set(d, static_cast<T>(+1.0));
   const V kTwo = Set(d, static_cast<T>(+2.0));

   const auto is_x_large = Gt(x, kLarge);
   const auto is_x_gt_2 = Gt(x, kTwo);

   const V x_minus_1 = Sub(x, kOne);
   const V y0 = MulSub(kTwo, x, Div(kOne, Add(Sqrt(MulSub(x, x, kOne)), x)));
   const V y1 =
       Add(Sqrt(MulAdd(x_minus_1, kTwo, Mul(x_minus_1, x_minus_1))), x_minus_1);
   const V y2 =
       IfThenElse(is_x_gt_2, IfThenElse(is_x_large, x, y0), Add(y1, kOne));
   const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);

   const auto is_pole = Eq(y2, kOne);
   const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
   return Add(IfThenElse(is_x_gt_2, z,
                         IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor))),
              IfThenElseZero(is_x_large, kLog2));
 }

 template <class D, class V>
 HWY_INLINE V Asin(const D d, V x) {
   using T = TFromD<D>;

   const V kHalf = Set(d, static_cast<T>(+0.5));
   const V kTwo = Set(d, static_cast<T>(+2.0));
   const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));

   const V sign_x = And(SignBit(d), x);
   const V abs_x = Xor(x, sign_x);
   const auto mask = Lt(abs_x, kHalf);
   const V yy =
       IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
   const V y = IfThenElse(mask, abs_x, Sqrt(yy));

   impl::AsinImpl<T> impl;
   const V z0 = MulAdd(impl.AsinPoly(d, yy, y), Mul(yy, y), y);
   const V z1 = NegMulAdd(z0, kTwo, kPiOverTwo);
   return Or(IfThenElse(mask, z0, z1), sign_x);
 }

 template <class D, class V>
 HWY_INLINE V Asinh(const D d, V x) {
   using T = TFromD<D>;

   const V kSmall = Set(d, static_cast<T>(1.0 / 268435456.0));
   const V kLarge = Set(d, static_cast<T>(268435456.0));
   const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
   const V kOne = Set(d, static_cast<T>(+1.0));
   const V kTwo = Set(d, static_cast<T>(+2.0));

   const V sign_x = And(SignBit(d), x);  // Extract the sign bit
   const V abs_x = Xor(x, sign_x);

   const auto is_x_large = Gt(abs_x, kLarge);
   const auto is_x_lt_2 = Lt(abs_x, kTwo);

   const V x2 = Mul(x, x);
   const V sqrt_x2_plus_1 = Sqrt(Add(x2, kOne));

   const V y0 = MulAdd(abs_x, kTwo, Div(kOne, Add(sqrt_x2_plus_1, abs_x)));
   const V y1 = Add(Div(x2, Add(sqrt_x2_plus_1, kOne)), abs_x);
   const V y2 =
       IfThenElse(is_x_lt_2, Add(y1, kOne), IfThenElse(is_x_large, abs_x, y0));
   const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);

   const auto is_pole = Eq(y2, kOne);
   const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
   const auto large = IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor));
   const V y = IfThenElse(Lt(abs_x, kSmall), x, large);
   return Or(Add(IfThenElse(is_x_lt_2, y, z), IfThenElseZero(is_x_large, kLog2)),
             sign_x);
 }

 template <class D, class V>
 HWY_INLINE V Atan(const D d, V x) {
   using T = TFromD<D>;

   const V kOne = Set(d, static_cast<T>(+1.0));
   const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));

   const V sign = And(SignBit(d), x);
   const V abs_x = Xor(x, sign);
   const auto mask = Gt(abs_x, kOne);

   impl::AtanImpl<T> impl;
   const auto divisor = IfThenElse(mask, abs_x, kOne);
   const V y = impl.AtanPoly(d, IfThenElse(mask, Div(kOne, divisor), abs_x));
   return Or(IfThenElse(mask, Sub(kPiOverTwo, y), y), sign);
 }

 template <class D, class V>
 HWY_INLINE V Atanh(const D d, V x) {
   using T = TFromD<D>;

   const V kHalf = Set(d, static_cast<T>(+0.5));
   const V kOne = Set(d, static_cast<T>(+1.0));

   const V sign = And(SignBit(d), x);  // Extract the sign bit
   const V abs_x = Xor(x, sign);
   return Mul(Log1p(d, Div(Add(abs_x, abs_x), Sub(kOne, abs_x))),
              Xor(kHalf, sign));
 }

 template <class D, class V>
 HWY_INLINE V Cos(const D d, V x) {
   using T = TFromD<D>;
   impl::CosSinImpl<T> impl;

   // Float Constants
   const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));

   // Integer Constants
   const Rebind<int32_t, D> di32;
   using VI32 = decltype(Zero(di32));
   const VI32 kOne = Set(di32, 1);

   const V y = Abs(x);  // cos(x) == cos(|x|)

   // Compute the quadrant, q = int(|x| / pi) * 2 + 1
   const VI32 q = Add(ShiftLeft<1>(impl.ToInt32(d, Mul(y, kOneOverPi))), kOne);

   // Reduce range, apply sign, and approximate.
   return impl.Poly(
       d, Xor(impl.CosReduce(d, y, q), impl.CosSignFromQuadrant(d, q)));
 }

 template <class D, class V>
 HWY_INLINE V Exp(const D d, V x) {
   using T = TFromD<D>;

   const V kHalf = Set(d, static_cast<T>(+0.5));
   const V kLowerBound =
       Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
   const V kNegZero = Set(d, static_cast<T>(-0.0));
   const V kOne = Set(d, static_cast<T>(+1.0));
   const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));

   impl::ExpImpl<T> impl;

   // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
   const auto q =
       impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));

   // Reduce, approximate, and then reconstruct.
   const V y = impl.LoadExpShortRange(
       d, Add(impl.ExpPoly(d, impl.ExpReduce(d, x, q)), kOne), q);
   return IfThenElseZero(Ge(x, kLowerBound), y);
 }

 template <class D, class V>
 HWY_INLINE V Exp2(const D d, V x) {
   using T = TFromD<D>;

   const V kLowerBound =
       Set(d, static_cast<T>((sizeof(T) == 4 ? -150.0 : -1075.0)));
   const V kOne = Set(d, static_cast<T>(+1.0));

   impl::ExpImpl<T> impl;

   // q = static_cast<int32_t>(std::lrint(x))
   const auto q = impl.ToNearestInt32(d, x);

   // Reduce, approximate, and then reconstruct.
   const V y = impl.LoadExpShortRange(
       d, Add(impl.ExpPoly(d, impl.Exp2Reduce(d, x, q)), kOne), q);
   return IfThenElseZero(Ge(x, kLowerBound), y);
 }

 template <class D, class V>
 HWY_INLINE V Expm1(const D d, V x) {
   using T = TFromD<D>;

   const V kHalf = Set(d, static_cast<T>(+0.5));
   const V kLowerBound =
       Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
   const V kLn2Over2 = Set(d, static_cast<T>(+0.346573590279972654708616));
   const V kNegOne = Set(d, static_cast<T>(-1.0));
   const V kNegZero = Set(d, static_cast<T>(-0.0));
   const V kOne = Set(d, static_cast<T>(+1.0));
   const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));

   impl::ExpImpl<T> impl;

   // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
   const auto q =
       impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));

   // Reduce, approximate, and then reconstruct.
   const V y = impl.ExpPoly(d, impl.ExpReduce(d, x, q));
   const V z = IfThenElse(Lt(Abs(x), kLn2Over2), y,
                          Sub(impl.LoadExpShortRange(d, Add(y, kOne), q), kOne));
   return IfThenElse(Lt(x, kLowerBound), kNegOne, z);
 }

 template <class D, class V>
 HWY_INLINE V Log(const D d, V x) {
   return impl::Log<D, V, /*kAllowSubnormals=*/true>(d, x);
 }

 template <class D, class V>
 HWY_INLINE V Log10(const D d, V x) {
   using T = TFromD<D>;
   return Mul(Log(d, x), Set(d, static_cast<T>(0.4342944819032518276511)));
 }

 template <class D, class V>
 HWY_INLINE V Log1p(const D d, V x) {
   using T = TFromD<D>;
   const V kOne = Set(d, static_cast<T>(+1.0));

   const V y = Add(x, kOne);
   const auto is_pole = Eq(y, kOne);
   const auto divisor = Sub(IfThenZeroElse(is_pole, y), kOne);
   const auto non_pole =
       Mul(impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y), Div(x, divisor));
   return IfThenElse(is_pole, x, non_pole);
 }

 template <class D, class V>
 HWY_INLINE V Log2(const D d, V x) {
   using T = TFromD<D>;
   return Mul(Log(d, x), Set(d, static_cast<T>(1.44269504088896340735992)));
 }

 template <class D, class V>
 HWY_INLINE V Sin(const D d, V x) {
   using T = TFromD<D>;
   impl::CosSinImpl<T> impl;

   // Float Constants
   const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
   const V kHalf = Set(d, static_cast<T>(0.5));

   // Integer Constants
   const Rebind<int32_t, D> di32;
   using VI32 = decltype(Zero(di32));

   const V abs_x = Abs(x);
   const V sign_x = Xor(abs_x, x);

   // Compute the quadrant, q = int((|x| / pi) + 0.5)
   const VI32 q = impl.ToInt32(d, MulAdd(abs_x, kOneOverPi, kHalf));

   // Reduce range, apply sign, and approximate.
   return impl.Poly(d, Xor(impl.SinReduce(d, abs_x, q),
                           Xor(impl.SinSignFromQuadrant(d, q), sign_x)));
 }

 template <class D, class V>
 HWY_INLINE V Sinh(const D d, V x) {
   using T = TFromD<D>;
   const V kHalf = Set(d, static_cast<T>(+0.5));
   const V kOne = Set(d, static_cast<T>(+1.0));
   const V kTwo = Set(d, static_cast<T>(+2.0));

   const V sign = And(SignBit(d), x);  // Extract the sign bit
   const V abs_x = Xor(x, sign);
   const V y = Expm1(d, abs_x);
   const V z = Mul(Div(Add(y, kTwo), Add(y, kOne)), Mul(y, kHalf));
   return Xor(z, sign);  // Reapply the sign bit
 }

 template <class D, class V>
 HWY_INLINE V Tanh(const D d, V x) {
   using T = TFromD<D>;
   const V kLimit = Set(d, static_cast<T>(18.714973875));
   const V kOne = Set(d, static_cast<T>(+1.0));
   const V kTwo = Set(d, static_cast<T>(+2.0));

   const V sign = And(SignBit(d), x);  // Extract the sign bit
   const V abs_x = Xor(x, sign);
   const V y = Expm1(d, Mul(abs_x, kTwo));
   const V z = IfThenElse(Gt(abs_x, kLimit), kOne, Div(y, Add(y, kTwo)));
   return Xor(z, sign);  // Reapply the sign bit
 }

 template <class D, class V>
 HWY_INLINE void SinCos(const D d, V x, V& s, V& c) {
   using T = TFromD<D>;
   impl::SinCosImpl<T> impl;
   impl.SinCos(d, x, s, c);
 }

 template <class D, class V>
 HWY_INLINE V Hypot(const D d, V a, V b) {
   using T = TFromD<D>;
   using TI = MakeSigned<T>;
   const RebindToUnsigned<decltype(d)> du;
   const RebindToSigned<decltype(d)> di;
   using VI = VFromD<decltype(di)>;

   constexpr int kMaxBiasedExp = static_cast<int>(MaxExponentField<T>());
   static_assert(kMaxBiasedExp > 0, "kMaxBiasedExp > 0 must be true");

   constexpr int kNumOfMantBits = MantissaBits<T>();
   static_assert(kNumOfMantBits > 0, "kNumOfMantBits > 0 must be true");

   constexpr int kExpBias = kMaxBiasedExp / 2;

   static_assert(
       static_cast<unsigned>(kExpBias) + static_cast<unsigned>(kNumOfMantBits) <
           static_cast<unsigned>(kMaxBiasedExp),
       "kExpBias + kNumOfMantBits < kMaxBiasedExp must be true");

   // kMinValToSquareBiasedExp is the smallest biased exponent such that
   // pow(pow(2, kMinValToSquareBiasedExp - kExpBias) * x, 2) is either a normal
   // floating-point value or infinity if x is a non-zero, non-NaN value
   constexpr int kMinValToSquareBiasedExp = (kExpBias / 2) + kNumOfMantBits;
   static_assert(kMinValToSquareBiasedExp < kExpBias,
                 "kMinValToSquareBiasedExp < kExpBias must be true");

   // kMaxValToSquareBiasedExp is the largest biased exponent such that
   // pow(pow(2, kMaxValToSquareBiasedExp - kExpBias) * x, 2) * 2 is guaranteed
   // to be a finite value if x is a finite value
   constexpr int kMaxValToSquareBiasedExp = kExpBias + ((kExpBias / 2) - 1);
   static_assert(kMaxValToSquareBiasedExp > kExpBias,
                 "kMaxValToSquareBiasedExp > kExpBias must be true");
   static_assert(kMaxValToSquareBiasedExp < kMaxBiasedExp,
                 "kMaxValToSquareBiasedExp < kMaxBiasedExp must be true");

 #if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128 || \
     HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15
   using TExpSatSub = MakeUnsigned<T>;
   using TExpMinMax = TI;
 #else
   using TExpSatSub = uint16_t;
   using TExpMinMax = int16_t;
 #endif

   const Repartition<TExpSatSub, decltype(d)> d_exp_sat_sub;
   const Repartition<TExpMinMax, decltype(d)> d_exp_min_max;

   const V abs_a = Abs(a);
   const V abs_b = Abs(b);

   const MFromD<D> either_inf = Or(IsInf(a), IsInf(b));

   const VI zero = Zero(di);

   // exp_a[i] is the biased exponent of abs_a[i]
   const VI exp_a = BitCast(di, ShiftRight<kNumOfMantBits>(BitCast(du, abs_a)));

   // exp_b[i] is the biased exponent of abs_b[i]
   const VI exp_b = BitCast(di, ShiftRight<kNumOfMantBits>(BitCast(du, abs_b)));

   // max_exp[i] is equal to HWY_MAX(exp_a[i], exp_b[i])

   // If abs_a[i] and abs_b[i] are both NaN values, max_exp[i] will be equal to
   // the biased exponent of the larger value. Otherwise, if either abs_a[i] or
   // abs_b[i] is NaN, max_exp[i] will be equal to kMaxBiasedExp.
   const VI max_exp = BitCast(
       di, Max(BitCast(d_exp_min_max, exp_a), BitCast(d_exp_min_max, exp_b)));

   // If either abs_a[i] or abs_b[i] is zero, min_exp[i] is equal to max_exp[i].
   // Otherwise, if abs_a[i] and abs_b[i] are both nonzero, min_exp[i] is equal
   // to HWY_MIN(exp_a[i], exp_b[i]).
   const VI min_exp = IfThenElse(
       Or(Eq(BitCast(di, abs_a), zero), Eq(BitCast(di, abs_b), zero)), max_exp,
       BitCast(di, Min(BitCast(d_exp_min_max, exp_a),
                       BitCast(d_exp_min_max, exp_b))));

   // scl_pow2[i] is the power of 2 to scale abs_a[i] and abs_b[i] by

   // abs_a[i] and abs_b[i] should be scaled by a factor that is greater than
   // zero but less than or equal to
   // pow(2, kMaxValToSquareBiasedExp - max_exp[i]) to ensure that that the
   // multiplications or addition operations do not overflow if
   // std::hypot(abs_a[i], abs_b[i]) is finite

   // If either abs_a[i] or abs_b[i] is a a positive value that is less than
   // pow(2, kMinValToSquareBiasedExp - kExpBias), then scaling up abs_a[i] and
   // abs_b[i] by pow(2, kMinValToSquareBiasedExp - min_exp[i]) will ensure that
   // the multiplications and additions result in normal floating point values,
   // infinities, or NaNs.

   // If HWY_MAX(kMinValToSquareBiasedExp - min_exp[i], 0) is greater than
   // kMaxValToSquareBiasedExp - max_exp[i], scale abs_a[i] and abs_b[i] up by
   // pow(2, kMaxValToSquareBiasedExp - max_exp[i]) to ensure that the
   // multiplication and addition operations result in a finite result if
   // std::hypot(abs_a[i], abs_b[i]) is finite.

   const VI scl_pow2 = BitCast(
       di,
       Min(BitCast(d_exp_min_max,
                   SaturatedSub(BitCast(d_exp_sat_sub,
                                        Set(di, static_cast<TI>(
                                                    kMinValToSquareBiasedExp))),
                                BitCast(d_exp_sat_sub, min_exp))),
           BitCast(d_exp_min_max,
                   Sub(Set(di, static_cast<TI>(kMaxValToSquareBiasedExp)),
                       max_exp))));

   const VI exp_bias = Set(di, static_cast<TI>(kExpBias));

   const V ab_scl_factor =
       BitCast(d, ShiftLeft<kNumOfMantBits>(Add(exp_bias, scl_pow2)));
   const V hypot_scl_factor =
       BitCast(d, ShiftLeft<kNumOfMantBits>(Sub(exp_bias, scl_pow2)));

   const V scl_a = Mul(abs_a, ab_scl_factor);
   const V scl_b = Mul(abs_b, ab_scl_factor);

   const V scl_hypot = Sqrt(MulAdd(scl_a, scl_a, Mul(scl_b, scl_b)));
   // std::hypot returns inf if one input is +/- inf, even if the other is NaN.
   return IfThenElse(either_inf, Inf(d), Mul(scl_hypot, hypot_scl_factor));
 }

 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace hwy
 HWY_AFTER_NAMESPACE();

 #endif  // HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_