|  | // Copyright 2020 Google LLC | 
|  | // SPDX-License-Identifier: Apache-2.0 | 
|  | // | 
|  | // Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | // you may not use this file except in compliance with the License. | 
|  | // You may obtain a copy of the License at | 
|  | // | 
|  | //      http://www.apache.org/licenses/LICENSE-2.0 | 
|  | // | 
|  | // Unless required by applicable law or agreed to in writing, software | 
|  | // distributed under the License is distributed on an "AS IS" BASIS, | 
|  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | // See the License for the specific language governing permissions and | 
|  | // limitations under the License. | 
|  |  | 
|  | // Include guard (still compiled once per target) | 
|  | #if defined(HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_) == \ | 
|  | defined(HWY_TARGET_TOGGLE)  // NOLINT | 
|  | #ifdef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_ | 
|  | #undef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_ | 
|  | #else | 
|  | #define HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_ | 
|  | #endif | 
|  |  | 
|  | #include <stddef.h> | 
|  | #include <stdint.h> | 
|  |  | 
|  | #include "third_party/highway/hwy/highway.h" | 
|  |  | 
|  | HWY_BEFORE_NAMESPACE(); | 
|  | namespace hwy { | 
|  | namespace HWY_NAMESPACE { | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::acos(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 2 | 
|  | *      Valid Range: [-1, +1] | 
|  | * @return arc cosine of 'x' | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Acos(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallAcos(const D d, VecArg<V> x) { | 
|  | return Acos(d, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::acosh(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 3 | 
|  | *      Valid Range: float32[1, +FLT_MAX], float64[1, +DBL_MAX] | 
|  | * @return hyperbolic arc cosine of 'x' | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Acosh(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallAcosh(const D d, VecArg<V> x) { | 
|  | return Acosh(d, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::asin(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 2 | 
|  | *      Valid Range: [-1, +1] | 
|  | * @return arc sine of 'x' | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Asin(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallAsin(const D d, VecArg<V> x) { | 
|  | return Asin(d, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::asinh(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 3 | 
|  | *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX] | 
|  | * @return hyperbolic arc sine of 'x' | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Asinh(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallAsinh(const D d, VecArg<V> x) { | 
|  | return Asinh(d, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::atan(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 3 | 
|  | *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX] | 
|  | * @return arc tangent of 'x' | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Atan(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallAtan(const D d, VecArg<V> x) { | 
|  | return Atan(d, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::atanh(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 3 | 
|  | *      Valid Range: (-1, +1) | 
|  | * @return hyperbolic arc tangent of 'x' | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Atanh(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallAtanh(const D d, VecArg<V> x) { | 
|  | return Atanh(d, x); | 
|  | } | 
|  |  | 
|  | // Atan2 was added later and some users may be implementing it themselves, so | 
|  | // notify them that this version of Highway defines it already. | 
|  | #ifndef HWY_HAVE_ATAN2 | 
|  | #define HWY_HAVE_ATAN2 1 | 
|  | #endif | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::atan2(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | * Correctly handles negative zero, infinities, and NaN. | 
|  | * @return atan2 of 'y', 'x' | 
|  | */ | 
|  | template <class D, class V = VFromD<D>, class M = MFromD<D>, | 
|  | typename T = TFromD<D>> | 
|  | HWY_INLINE V Atan2(const D d, V y, V x) { | 
|  | const V kHalf = Set(d, static_cast<T>(+0.5)); | 
|  | const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264)); | 
|  | const V kPi2 = Mul(kPi, kHalf); | 
|  |  | 
|  | const V k0 = Zero(d); | 
|  | const M y_0 = Eq(y, k0); | 
|  | const M x_0 = Eq(x, k0); | 
|  | const M x_neg = Lt(x, k0); | 
|  | const M y_inf = IsInf(y); | 
|  | const M x_inf = IsInf(x); | 
|  | const M nan = Or(IsNaN(y), IsNaN(x)); | 
|  |  | 
|  | const V if_xneg_pi = IfThenElseZero(x_neg, kPi); | 
|  | // x= +inf: pi/4; -inf: 3*pi/4; else: pi/2 | 
|  | const V if_yinf = Mul(kHalf, IfThenElse(x_inf, Add(kPi2, if_xneg_pi), kPi)); | 
|  |  | 
|  | V t = Atan(d, Div(y, x)); | 
|  | // Disambiguate between quadrants 1/3 and 2/4 by adding (Q2: Pi; Q3: -Pi). | 
|  | t = Add(t, CopySignToAbs(if_xneg_pi, y)); | 
|  | // Special cases for 0 and infinity: | 
|  | t = IfThenElse(x_inf, if_xneg_pi, t); | 
|  | t = IfThenElse(x_0, kPi2, t); | 
|  | t = IfThenElse(y_inf, if_yinf, t); | 
|  | t = IfThenElse(y_0, if_xneg_pi, t); | 
|  | // Any input NaN => NaN, otherwise fix sign. | 
|  | return IfThenElse(nan, NaN(d), CopySign(t, y)); | 
|  | } | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallAtan2(const D d, VecArg<V> y, VecArg<V> x) { | 
|  | return Atan2(d, y, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::cos(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 3 | 
|  | *      Valid Range: [-39000, +39000] | 
|  | * @return cosine of 'x' | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Cos(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallCos(const D d, VecArg<V> x) { | 
|  | return Cos(d, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::exp(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 1 | 
|  | *      Valid Range: float32[-FLT_MAX, +104], float64[-DBL_MAX, +706] | 
|  | * @return e^x | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Exp(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallExp(const D d, VecArg<V> x) { | 
|  | return Exp(d, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::exp2(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 2 | 
|  | *      Valid Range: float32[-FLT_MAX, +128], float64[-DBL_MAX, +1024] | 
|  | * @return 2^x | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Exp2(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallExp2(const D d, VecArg<V> x) { | 
|  | return Exp2(d, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::expm1(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 4 | 
|  | *      Valid Range: float32[-FLT_MAX, +104], float64[-DBL_MAX, +706] | 
|  | * @return e^x - 1 | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Expm1(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallExpm1(const D d, VecArg<V> x) { | 
|  | return Expm1(d, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::log(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 4 | 
|  | *      Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX] | 
|  | * @return natural logarithm of 'x' | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Log(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallLog(const D d, VecArg<V> x) { | 
|  | return Log(d, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::log10(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 2 | 
|  | *      Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX] | 
|  | * @return base 10 logarithm of 'x' | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Log10(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallLog10(const D d, VecArg<V> x) { | 
|  | return Log10(d, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::log1p(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 2 | 
|  | *      Valid Range: float32[0, +FLT_MAX], float64[0, +DBL_MAX] | 
|  | * @return log(1 + x) | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Log1p(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallLog1p(const D d, VecArg<V> x) { | 
|  | return Log1p(d, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::log2(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 2 | 
|  | *      Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX] | 
|  | * @return base 2 logarithm of 'x' | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Log2(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallLog2(const D d, VecArg<V> x) { | 
|  | return Log2(d, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::sin(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 3 | 
|  | *      Valid Range: [-39000, +39000] | 
|  | * @return sine of 'x' | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Sin(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallSin(const D d, VecArg<V> x) { | 
|  | return Sin(d, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::sinh(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 4 | 
|  | *      Valid Range: float32[-88.7228, +88.7228], float64[-709, +709] | 
|  | * @return hyperbolic sine of 'x' | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Sinh(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallSinh(const D d, VecArg<V> x) { | 
|  | return Sinh(d, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of std::tanh(x). | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 4 | 
|  | *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX] | 
|  | * @return hyperbolic tangent of 'x' | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Tanh(D d, V x); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallTanh(const D d, VecArg<V> x) { | 
|  | return Tanh(d, x); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of SinCos. | 
|  | * Compute the sine and cosine at the same time | 
|  | * The performance should be around the same as calling Sin. | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 1 | 
|  | *      Valid Range: [-39000, +39000] | 
|  | * @return sine and cosine of 'x' | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE void SinCos(D d, V x, V& s, V& c); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE void CallSinCos(const D d, VecArg<V> x, V& s, V& c) { | 
|  | SinCos(d, x, s, c); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Highway SIMD version of Hypot | 
|  | * | 
|  | * Valid Lane Types: float32, float64 | 
|  | *        Max Error: ULP = 4 | 
|  | *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX] | 
|  | * @return hypotenuse of a and b | 
|  | */ | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Hypot(D d, V a, V b); | 
|  | template <class D, class V> | 
|  | HWY_NOINLINE V CallHypot(const D d, VecArg<V> a, VecArg<V> b) { | 
|  | return Hypot(d, a, b); | 
|  | } | 
|  |  | 
|  | //////////////////////////////////////////////////////////////////////////////// | 
|  | // Implementation | 
|  | //////////////////////////////////////////////////////////////////////////////// | 
|  | namespace impl { | 
|  |  | 
|  | // Estrin's Scheme is a faster method for evaluating large polynomials on | 
|  | // super scalar architectures. It works by factoring the Horner's Method | 
|  | // polynomial into power of two sub-trees that can be evaluated in parallel. | 
|  | // Wikipedia Link: https://en.wikipedia.org/wiki/Estrin%27s_scheme | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1) { | 
|  | return MulAdd(c1, x, c0); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2) { | 
|  | T x2 = Mul(x, x); | 
|  | return MulAdd(x2, c2, MulAdd(c1, x, c0)); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3) { | 
|  | T x2 = Mul(x, x); | 
|  | return MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4) { | 
|  | T x2 = Mul(x, x); | 
|  | T x4 = Mul(x2, x2); | 
|  | return MulAdd(x4, c4, MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5) { | 
|  | T x2 = Mul(x, x); | 
|  | T x4 = Mul(x2, x2); | 
|  | return MulAdd(x4, MulAdd(c5, x, c4), | 
|  | MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, | 
|  | T c6) { | 
|  | T x2 = Mul(x, x); | 
|  | T x4 = Mul(x2, x2); | 
|  | return MulAdd(x4, MulAdd(x2, c6, MulAdd(c5, x, c4)), | 
|  | MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, | 
|  | T c6, T c7) { | 
|  | T x2 = Mul(x, x); | 
|  | T x4 = Mul(x2, x2); | 
|  | return MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), | 
|  | MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, | 
|  | T c6, T c7, T c8) { | 
|  | T x2 = Mul(x, x); | 
|  | T x4 = Mul(x2, x2); | 
|  | T x8 = Mul(x4, x4); | 
|  | return MulAdd(x8, c8, | 
|  | MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), | 
|  | MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, | 
|  | T c6, T c7, T c8, T c9) { | 
|  | T x2 = Mul(x, x); | 
|  | T x4 = Mul(x2, x2); | 
|  | T x8 = Mul(x4, x4); | 
|  | return MulAdd(x8, MulAdd(c9, x, c8), | 
|  | MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), | 
|  | MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, | 
|  | T c6, T c7, T c8, T c9, T c10) { | 
|  | T x2 = Mul(x, x); | 
|  | T x4 = Mul(x2, x2); | 
|  | T x8 = Mul(x4, x4); | 
|  | return MulAdd(x8, MulAdd(x2, c10, MulAdd(c9, x, c8)), | 
|  | MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), | 
|  | MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, | 
|  | T c6, T c7, T c8, T c9, T c10, T c11) { | 
|  | T x2 = Mul(x, x); | 
|  | T x4 = Mul(x2, x2); | 
|  | T x8 = Mul(x4, x4); | 
|  | return MulAdd(x8, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8)), | 
|  | MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), | 
|  | MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, | 
|  | T c6, T c7, T c8, T c9, T c10, T c11, | 
|  | T c12) { | 
|  | T x2 = Mul(x, x); | 
|  | T x4 = Mul(x2, x2); | 
|  | T x8 = Mul(x4, x4); | 
|  | return MulAdd( | 
|  | x8, MulAdd(x4, c12, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), | 
|  | MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), | 
|  | MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, | 
|  | T c6, T c7, T c8, T c9, T c10, T c11, | 
|  | T c12, T c13) { | 
|  | T x2 = Mul(x, x); | 
|  | T x4 = Mul(x2, x2); | 
|  | T x8 = Mul(x4, x4); | 
|  | return MulAdd(x8, | 
|  | MulAdd(x4, MulAdd(c13, x, c12), | 
|  | MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), | 
|  | MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), | 
|  | MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, | 
|  | T c6, T c7, T c8, T c9, T c10, T c11, | 
|  | T c12, T c13, T c14) { | 
|  | T x2 = Mul(x, x); | 
|  | T x4 = Mul(x2, x2); | 
|  | T x8 = Mul(x4, x4); | 
|  | return MulAdd(x8, | 
|  | MulAdd(x4, MulAdd(x2, c14, MulAdd(c13, x, c12)), | 
|  | MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), | 
|  | MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), | 
|  | MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, | 
|  | T c6, T c7, T c8, T c9, T c10, T c11, | 
|  | T c12, T c13, T c14, T c15) { | 
|  | T x2 = Mul(x, x); | 
|  | T x4 = Mul(x2, x2); | 
|  | T x8 = Mul(x4, x4); | 
|  | return MulAdd(x8, | 
|  | MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)), | 
|  | MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), | 
|  | MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), | 
|  | MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, | 
|  | T c6, T c7, T c8, T c9, T c10, T c11, | 
|  | T c12, T c13, T c14, T c15, T c16) { | 
|  | T x2 = Mul(x, x); | 
|  | T x4 = Mul(x2, x2); | 
|  | T x8 = Mul(x4, x4); | 
|  | T x16 = Mul(x8, x8); | 
|  | return MulAdd( | 
|  | x16, c16, | 
|  | MulAdd(x8, | 
|  | MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)), | 
|  | MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), | 
|  | MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), | 
|  | MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))))); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, | 
|  | T c6, T c7, T c8, T c9, T c10, T c11, | 
|  | T c12, T c13, T c14, T c15, T c16, T c17) { | 
|  | T x2 = Mul(x, x); | 
|  | T x4 = Mul(x2, x2); | 
|  | T x8 = Mul(x4, x4); | 
|  | T x16 = Mul(x8, x8); | 
|  | return MulAdd( | 
|  | x16, MulAdd(c17, x, c16), | 
|  | MulAdd(x8, | 
|  | MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)), | 
|  | MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), | 
|  | MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), | 
|  | MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))))); | 
|  | } | 
|  | template <class T> | 
|  | HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, | 
|  | T c6, T c7, T c8, T c9, T c10, T c11, | 
|  | T c12, T c13, T c14, T c15, T c16, T c17, | 
|  | T c18) { | 
|  | T x2 = Mul(x, x); | 
|  | T x4 = Mul(x2, x2); | 
|  | T x8 = Mul(x4, x4); | 
|  | T x16 = Mul(x8, x8); | 
|  | return MulAdd( | 
|  | x16, MulAdd(x2, c18, MulAdd(c17, x, c16)), | 
|  | MulAdd(x8, | 
|  | MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)), | 
|  | MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), | 
|  | MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), | 
|  | MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))))); | 
|  | } | 
|  |  | 
|  | template <class FloatOrDouble> | 
|  | struct AsinImpl {}; | 
|  | template <class FloatOrDouble> | 
|  | struct AtanImpl {}; | 
|  | template <class FloatOrDouble> | 
|  | struct CosSinImpl {}; | 
|  | template <class FloatOrDouble> | 
|  | struct ExpImpl {}; | 
|  | template <class FloatOrDouble> | 
|  | struct LogImpl {}; | 
|  | template <class FloatOrDouble> | 
|  | struct SinCosImpl {}; | 
|  |  | 
|  | template <> | 
|  | struct AsinImpl<float> { | 
|  | // Polynomial approximation for asin(x) over the range [0, 0.5). | 
|  | template <class D, class V> | 
|  | HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) { | 
|  | const auto k0 = Set(d, +0.1666677296f); | 
|  | const auto k1 = Set(d, +0.07495029271f); | 
|  | const auto k2 = Set(d, +0.04547423869f); | 
|  | const auto k3 = Set(d, +0.02424046025f); | 
|  | const auto k4 = Set(d, +0.04197454825f); | 
|  |  | 
|  | return Estrin(x2, k0, k1, k2, k3, k4); | 
|  | } | 
|  | }; | 
|  |  | 
|  | #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 | 
|  |  | 
|  | template <> | 
|  | struct AsinImpl<double> { | 
|  | // Polynomial approximation for asin(x) over the range [0, 0.5). | 
|  | template <class D, class V> | 
|  | HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) { | 
|  | const auto k0 = Set(d, +0.1666666666666497543); | 
|  | const auto k1 = Set(d, +0.07500000000378581611); | 
|  | const auto k2 = Set(d, +0.04464285681377102438); | 
|  | const auto k3 = Set(d, +0.03038195928038132237); | 
|  | const auto k4 = Set(d, +0.02237176181932048341); | 
|  | const auto k5 = Set(d, +0.01735956991223614604); | 
|  | const auto k6 = Set(d, +0.01388715184501609218); | 
|  | const auto k7 = Set(d, +0.01215360525577377331); | 
|  | const auto k8 = Set(d, +0.006606077476277170610); | 
|  | const auto k9 = Set(d, +0.01929045477267910674); | 
|  | const auto k10 = Set(d, -0.01581918243329996643); | 
|  | const auto k11 = Set(d, +0.03161587650653934628); | 
|  |  | 
|  | return Estrin(x2, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11); | 
|  | } | 
|  | }; | 
|  |  | 
|  | #endif | 
|  |  | 
|  | template <> | 
|  | struct AtanImpl<float> { | 
|  | // Polynomial approximation for atan(x) over the range [0, 1.0). | 
|  | template <class D, class V> | 
|  | HWY_INLINE V AtanPoly(D d, V x) { | 
|  | const auto k0 = Set(d, -0.333331018686294555664062f); | 
|  | const auto k1 = Set(d, +0.199926957488059997558594f); | 
|  | const auto k2 = Set(d, -0.142027363181114196777344f); | 
|  | const auto k3 = Set(d, +0.106347933411598205566406f); | 
|  | const auto k4 = Set(d, -0.0748900920152664184570312f); | 
|  | const auto k5 = Set(d, +0.0425049886107444763183594f); | 
|  | const auto k6 = Set(d, -0.0159569028764963150024414f); | 
|  | const auto k7 = Set(d, +0.00282363896258175373077393f); | 
|  |  | 
|  | const auto y = Mul(x, x); | 
|  | return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7), Mul(y, x), x); | 
|  | } | 
|  | }; | 
|  |  | 
|  | #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 | 
|  |  | 
|  | template <> | 
|  | struct AtanImpl<double> { | 
|  | // Polynomial approximation for atan(x) over the range [0, 1.0). | 
|  | template <class D, class V> | 
|  | HWY_INLINE V AtanPoly(D d, V x) { | 
|  | const auto k0 = Set(d, -0.333333333333311110369124); | 
|  | const auto k1 = Set(d, +0.199999999996591265594148); | 
|  | const auto k2 = Set(d, -0.14285714266771329383765); | 
|  | const auto k3 = Set(d, +0.111111105648261418443745); | 
|  | const auto k4 = Set(d, -0.090908995008245008229153); | 
|  | const auto k5 = Set(d, +0.0769219538311769618355029); | 
|  | const auto k6 = Set(d, -0.0666573579361080525984562); | 
|  | const auto k7 = Set(d, +0.0587666392926673580854313); | 
|  | const auto k8 = Set(d, -0.0523674852303482457616113); | 
|  | const auto k9 = Set(d, +0.0466667150077840625632675); | 
|  | const auto k10 = Set(d, -0.0407629191276836500001934); | 
|  | const auto k11 = Set(d, +0.0337852580001353069993897); | 
|  | const auto k12 = Set(d, -0.0254517624932312641616861); | 
|  | const auto k13 = Set(d, +0.016599329773529201970117); | 
|  | const auto k14 = Set(d, -0.00889896195887655491740809); | 
|  | const auto k15 = Set(d, +0.00370026744188713119232403); | 
|  | const auto k16 = Set(d, -0.00110611831486672482563471); | 
|  | const auto k17 = Set(d, +0.000209850076645816976906797); | 
|  | const auto k18 = Set(d, -1.88796008463073496563746e-5); | 
|  |  | 
|  | const auto y = Mul(x, x); | 
|  | return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, | 
|  | k12, k13, k14, k15, k16, k17, k18), | 
|  | Mul(y, x), x); | 
|  | } | 
|  | }; | 
|  |  | 
|  | #endif | 
|  |  | 
|  | template <> | 
|  | struct CosSinImpl<float> { | 
|  | // Rounds float toward zero and returns as int32_t. | 
|  | template <class D, class V> | 
|  | HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) { | 
|  | return ConvertTo(Rebind<int32_t, D>(), x); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Poly(D d, V x) { | 
|  | const auto k0 = Set(d, -1.66666597127914428710938e-1f); | 
|  | const auto k1 = Set(d, +8.33307858556509017944336e-3f); | 
|  | const auto k2 = Set(d, -1.981069071916863322258e-4f); | 
|  | const auto k3 = Set(d, +2.6083159809786593541503e-6f); | 
|  |  | 
|  | const auto y = Mul(x, x); | 
|  | return MulAdd(Estrin(y, k0, k1, k2, k3), Mul(y, x), x); | 
|  | } | 
|  |  | 
|  | template <class D, class V, class VI32> | 
|  | HWY_INLINE V CosReduce(D d, V x, VI32 q) { | 
|  | // kHalfPiPart0f + kHalfPiPart1f + kHalfPiPart2f + kHalfPiPart3f ~= -pi/2 | 
|  | const V kHalfPiPart0f = Set(d, -0.5f * 3.140625f); | 
|  | const V kHalfPiPart1f = Set(d, -0.5f * 0.0009670257568359375f); | 
|  | const V kHalfPiPart2f = Set(d, -0.5f * 6.2771141529083251953e-7f); | 
|  | const V kHalfPiPart3f = Set(d, -0.5f * 1.2154201256553420762e-10f); | 
|  |  | 
|  | // Extended precision modular arithmetic. | 
|  | const V qf = ConvertTo(d, q); | 
|  | x = MulAdd(qf, kHalfPiPart0f, x); | 
|  | x = MulAdd(qf, kHalfPiPart1f, x); | 
|  | x = MulAdd(qf, kHalfPiPart2f, x); | 
|  | x = MulAdd(qf, kHalfPiPart3f, x); | 
|  | return x; | 
|  | } | 
|  |  | 
|  | template <class D, class V, class VI32> | 
|  | HWY_INLINE V SinReduce(D d, V x, VI32 q) { | 
|  | // kPiPart0f + kPiPart1f + kPiPart2f + kPiPart3f ~= -pi | 
|  | const V kPiPart0f = Set(d, -3.140625f); | 
|  | const V kPiPart1f = Set(d, -0.0009670257568359375f); | 
|  | const V kPiPart2f = Set(d, -6.2771141529083251953e-7f); | 
|  | const V kPiPart3f = Set(d, -1.2154201256553420762e-10f); | 
|  |  | 
|  | // Extended precision modular arithmetic. | 
|  | const V qf = ConvertTo(d, q); | 
|  | x = MulAdd(qf, kPiPart0f, x); | 
|  | x = MulAdd(qf, kPiPart1f, x); | 
|  | x = MulAdd(qf, kPiPart2f, x); | 
|  | x = MulAdd(qf, kPiPart3f, x); | 
|  | return x; | 
|  | } | 
|  |  | 
|  | // (q & 2) == 0 ? -0.0 : +0.0 | 
|  | template <class D, class VI32> | 
|  | HWY_INLINE Vec<Rebind<float, D>> CosSignFromQuadrant(D d, VI32 q) { | 
|  | const VI32 kTwo = Set(Rebind<int32_t, D>(), 2); | 
|  | return BitCast(d, ShiftLeft<30>(AndNot(q, kTwo))); | 
|  | } | 
|  |  | 
|  | // ((q & 1) ? -0.0 : +0.0) | 
|  | template <class D, class VI32> | 
|  | HWY_INLINE Vec<Rebind<float, D>> SinSignFromQuadrant(D d, VI32 q) { | 
|  | const VI32 kOne = Set(Rebind<int32_t, D>(), 1); | 
|  | return BitCast(d, ShiftLeft<31>(And(q, kOne))); | 
|  | } | 
|  | }; | 
|  |  | 
|  | #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 | 
|  |  | 
|  | template <> | 
|  | struct CosSinImpl<double> { | 
|  | // Rounds double toward zero and returns as int32_t. | 
|  | template <class D, class V> | 
|  | HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) { | 
|  | return DemoteTo(Rebind<int32_t, D>(), x); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Poly(D d, V x) { | 
|  | const auto k0 = Set(d, -0.166666666666666657414808); | 
|  | const auto k1 = Set(d, +0.00833333333333332974823815); | 
|  | const auto k2 = Set(d, -0.000198412698412696162806809); | 
|  | const auto k3 = Set(d, +2.75573192239198747630416e-6); | 
|  | const auto k4 = Set(d, -2.50521083763502045810755e-8); | 
|  | const auto k5 = Set(d, +1.60590430605664501629054e-10); | 
|  | const auto k6 = Set(d, -7.64712219118158833288484e-13); | 
|  | const auto k7 = Set(d, +2.81009972710863200091251e-15); | 
|  | const auto k8 = Set(d, -7.97255955009037868891952e-18); | 
|  |  | 
|  | const auto y = Mul(x, x); | 
|  | return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8), Mul(y, x), x); | 
|  | } | 
|  |  | 
|  | template <class D, class V, class VI32> | 
|  | HWY_INLINE V CosReduce(D d, V x, VI32 q) { | 
|  | // kHalfPiPart0d + kHalfPiPart1d + kHalfPiPart2d + kHalfPiPart3d ~= -pi/2 | 
|  | const V kHalfPiPart0d = Set(d, -0.5 * 3.1415926218032836914); | 
|  | const V kHalfPiPart1d = Set(d, -0.5 * 3.1786509424591713469e-8); | 
|  | const V kHalfPiPart2d = Set(d, -0.5 * 1.2246467864107188502e-16); | 
|  | const V kHalfPiPart3d = Set(d, -0.5 * 1.2736634327021899816e-24); | 
|  |  | 
|  | // Extended precision modular arithmetic. | 
|  | const V qf = PromoteTo(d, q); | 
|  | x = MulAdd(qf, kHalfPiPart0d, x); | 
|  | x = MulAdd(qf, kHalfPiPart1d, x); | 
|  | x = MulAdd(qf, kHalfPiPart2d, x); | 
|  | x = MulAdd(qf, kHalfPiPart3d, x); | 
|  | return x; | 
|  | } | 
|  |  | 
|  | template <class D, class V, class VI32> | 
|  | HWY_INLINE V SinReduce(D d, V x, VI32 q) { | 
|  | // kPiPart0d + kPiPart1d + kPiPart2d + kPiPart3d ~= -pi | 
|  | const V kPiPart0d = Set(d, -3.1415926218032836914); | 
|  | const V kPiPart1d = Set(d, -3.1786509424591713469e-8); | 
|  | const V kPiPart2d = Set(d, -1.2246467864107188502e-16); | 
|  | const V kPiPart3d = Set(d, -1.2736634327021899816e-24); | 
|  |  | 
|  | // Extended precision modular arithmetic. | 
|  | const V qf = PromoteTo(d, q); | 
|  | x = MulAdd(qf, kPiPart0d, x); | 
|  | x = MulAdd(qf, kPiPart1d, x); | 
|  | x = MulAdd(qf, kPiPart2d, x); | 
|  | x = MulAdd(qf, kPiPart3d, x); | 
|  | return x; | 
|  | } | 
|  |  | 
|  | // (q & 2) == 0 ? -0.0 : +0.0 | 
|  | template <class D, class VI32> | 
|  | HWY_INLINE Vec<Rebind<double, D>> CosSignFromQuadrant(D d, VI32 q) { | 
|  | const VI32 kTwo = Set(Rebind<int32_t, D>(), 2); | 
|  | return BitCast( | 
|  | d, ShiftLeft<62>(PromoteTo(Rebind<int64_t, D>(), AndNot(q, kTwo)))); | 
|  | } | 
|  |  | 
|  | // ((q & 1) ? -0.0 : +0.0) | 
|  | template <class D, class VI32> | 
|  | HWY_INLINE Vec<Rebind<double, D>> SinSignFromQuadrant(D d, VI32 q) { | 
|  | const VI32 kOne = Set(Rebind<int32_t, D>(), 1); | 
|  | return BitCast( | 
|  | d, ShiftLeft<63>(PromoteTo(Rebind<int64_t, D>(), And(q, kOne)))); | 
|  | } | 
|  | }; | 
|  |  | 
|  | #endif | 
|  |  | 
|  | template <> | 
|  | struct ExpImpl<float> { | 
|  | // Rounds float toward zero and returns as int32_t. | 
|  | template <class D, class V> | 
|  | HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) { | 
|  | return ConvertTo(Rebind<int32_t, D>(), x); | 
|  | } | 
|  |  | 
|  | // Rounds float to nearest int32_t | 
|  | template <class D, class V> | 
|  | HWY_INLINE Vec<Rebind<int32_t, D>> ToNearestInt32(D /*unused*/, V x) { | 
|  | return NearestInt(x); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V ExpPoly(D d, V x) { | 
|  | const auto k0 = Set(d, +0.5f); | 
|  | const auto k1 = Set(d, +0.166666671633720397949219f); | 
|  | const auto k2 = Set(d, +0.0416664853692054748535156f); | 
|  | const auto k3 = Set(d, +0.00833336077630519866943359f); | 
|  | const auto k4 = Set(d, +0.00139304355252534151077271f); | 
|  | const auto k5 = Set(d, +0.000198527617612853646278381f); | 
|  |  | 
|  | return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5), Mul(x, x), x); | 
|  | } | 
|  |  | 
|  | // Computes 2^x, where x is an integer. | 
|  | template <class D, class VI32> | 
|  | HWY_INLINE Vec<D> Pow2I(D d, VI32 x) { | 
|  | const Rebind<int32_t, D> di32; | 
|  | const VI32 kOffset = Set(di32, 0x7F); | 
|  | return BitCast(d, ShiftLeft<23>(Add(x, kOffset))); | 
|  | } | 
|  |  | 
|  | // Sets the exponent of 'x' to 2^e. | 
|  | template <class D, class V, class VI32> | 
|  | HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) { | 
|  | const VI32 y = ShiftRight<1>(e); | 
|  | return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y))); | 
|  | } | 
|  |  | 
|  | template <class D, class V, class VI32> | 
|  | HWY_INLINE V ExpReduce(D d, V x, VI32 q) { | 
|  | // kLn2Part0f + kLn2Part1f ~= -ln(2) | 
|  | const V kLn2Part0f = Set(d, -0.693145751953125f); | 
|  | const V kLn2Part1f = Set(d, -1.428606765330187045e-6f); | 
|  |  | 
|  | // Extended precision modular arithmetic. | 
|  | const V qf = ConvertTo(d, q); | 
|  | x = MulAdd(qf, kLn2Part0f, x); | 
|  | x = MulAdd(qf, kLn2Part1f, x); | 
|  | return x; | 
|  | } | 
|  |  | 
|  | template <class D, class V, class VI32> | 
|  | HWY_INLINE V Exp2Reduce(D d, V x, VI32 q) { | 
|  | const V x_frac = Sub(x, ConvertTo(d, q)); | 
|  | return MulAdd(x_frac, Set(d, 0.193147182464599609375f), | 
|  | Mul(x_frac, Set(d, 0.5f))); | 
|  | } | 
|  | }; | 
|  |  | 
|  | template <> | 
|  | struct LogImpl<float> { | 
|  | template <class D, class V> | 
|  | HWY_INLINE Vec<Rebind<int32_t, D>> Log2p1NoSubnormal(D /*d*/, V x) { | 
|  | const Rebind<int32_t, D> di32; | 
|  | const Rebind<uint32_t, D> du32; | 
|  | const auto kBias = Set(di32, 0x7F); | 
|  | return Sub(BitCast(di32, ShiftRight<23>(BitCast(du32, x))), kBias); | 
|  | } | 
|  |  | 
|  | // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)]. | 
|  | template <class D, class V> | 
|  | HWY_INLINE V LogPoly(D d, V x) { | 
|  | const V k0 = Set(d, 0.66666662693f); | 
|  | const V k1 = Set(d, 0.40000972152f); | 
|  | const V k2 = Set(d, 0.28498786688f); | 
|  | const V k3 = Set(d, 0.24279078841f); | 
|  |  | 
|  | const V x2 = Mul(x, x); | 
|  | const V x4 = Mul(x2, x2); | 
|  | return MulAdd(MulAdd(k2, x4, k0), x2, Mul(MulAdd(k3, x4, k1), x4)); | 
|  | } | 
|  | }; | 
|  |  | 
|  | #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 | 
|  | template <> | 
|  | struct ExpImpl<double> { | 
|  | // Rounds double toward zero and returns as int32_t. | 
|  | template <class D, class V> | 
|  | HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) { | 
|  | return DemoteTo(Rebind<int32_t, D>(), x); | 
|  | } | 
|  |  | 
|  | // Rounds double to nearest int32_t | 
|  | template <class D, class V> | 
|  | HWY_INLINE Vec<Rebind<int32_t, D>> ToNearestInt32(D /*unused*/, V x) { | 
|  | return DemoteToNearestInt(Rebind<int32_t, D>(), x); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V ExpPoly(D d, V x) { | 
|  | const auto k0 = Set(d, +0.5); | 
|  | const auto k1 = Set(d, +0.166666666666666851703837); | 
|  | const auto k2 = Set(d, +0.0416666666666665047591422); | 
|  | const auto k3 = Set(d, +0.00833333333331652721664984); | 
|  | const auto k4 = Set(d, +0.00138888888889774492207962); | 
|  | const auto k5 = Set(d, +0.000198412698960509205564975); | 
|  | const auto k6 = Set(d, +2.4801587159235472998791e-5); | 
|  | const auto k7 = Set(d, +2.75572362911928827629423e-6); | 
|  | const auto k8 = Set(d, +2.75573911234900471893338e-7); | 
|  | const auto k9 = Set(d, +2.51112930892876518610661e-8); | 
|  | const auto k10 = Set(d, +2.08860621107283687536341e-9); | 
|  |  | 
|  | return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10), | 
|  | Mul(x, x), x); | 
|  | } | 
|  |  | 
|  | // Computes 2^x, where x is an integer. | 
|  | template <class D, class VI32> | 
|  | HWY_INLINE Vec<D> Pow2I(D d, VI32 x) { | 
|  | const Rebind<int32_t, D> di32; | 
|  | const Rebind<int64_t, D> di64; | 
|  | const VI32 kOffset = Set(di32, 0x3FF); | 
|  | return BitCast(d, ShiftLeft<52>(PromoteTo(di64, Add(x, kOffset)))); | 
|  | } | 
|  |  | 
|  | // Sets the exponent of 'x' to 2^e. | 
|  | template <class D, class V, class VI32> | 
|  | HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) { | 
|  | const VI32 y = ShiftRight<1>(e); | 
|  | return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y))); | 
|  | } | 
|  |  | 
|  | template <class D, class V, class VI32> | 
|  | HWY_INLINE V ExpReduce(D d, V x, VI32 q) { | 
|  | // kLn2Part0d + kLn2Part1d ~= -ln(2) | 
|  | const V kLn2Part0d = Set(d, -0.6931471805596629565116018); | 
|  | const V kLn2Part1d = Set(d, -0.28235290563031577122588448175e-12); | 
|  |  | 
|  | // Extended precision modular arithmetic. | 
|  | const V qf = PromoteTo(d, q); | 
|  | x = MulAdd(qf, kLn2Part0d, x); | 
|  | x = MulAdd(qf, kLn2Part1d, x); | 
|  | return x; | 
|  | } | 
|  |  | 
|  | template <class D, class V, class VI32> | 
|  | HWY_INLINE V Exp2Reduce(D d, V x, VI32 q) { | 
|  | const V x_frac = Sub(x, PromoteTo(d, q)); | 
|  | return MulAdd(x_frac, Set(d, 0.1931471805599453139823396), | 
|  | Mul(x_frac, Set(d, 0.5))); | 
|  | } | 
|  | }; | 
|  |  | 
|  | template <> | 
|  | struct LogImpl<double> { | 
|  | template <class D, class V> | 
|  | HWY_INLINE Vec<Rebind<int64_t, D>> Log2p1NoSubnormal(D /*d*/, V x) { | 
|  | const Rebind<int64_t, D> di64; | 
|  | const Rebind<uint64_t, D> du64; | 
|  | return Sub(BitCast(di64, ShiftRight<52>(BitCast(du64, x))), | 
|  | Set(di64, 0x3FF)); | 
|  | } | 
|  |  | 
|  | // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)]. | 
|  | template <class D, class V> | 
|  | HWY_INLINE V LogPoly(D d, V x) { | 
|  | const V k0 = Set(d, 0.6666666666666735130); | 
|  | const V k1 = Set(d, 0.3999999999940941908); | 
|  | const V k2 = Set(d, 0.2857142874366239149); | 
|  | const V k3 = Set(d, 0.2222219843214978396); | 
|  | const V k4 = Set(d, 0.1818357216161805012); | 
|  | const V k5 = Set(d, 0.1531383769920937332); | 
|  | const V k6 = Set(d, 0.1479819860511658591); | 
|  |  | 
|  | const V x2 = Mul(x, x); | 
|  | const V x4 = Mul(x2, x2); | 
|  | return MulAdd(MulAdd(MulAdd(MulAdd(k6, x4, k4), x4, k2), x4, k0), x2, | 
|  | (Mul(MulAdd(MulAdd(k5, x4, k3), x4, k1), x4))); | 
|  | } | 
|  | }; | 
|  |  | 
|  | #endif | 
|  |  | 
|  | template <class D, class V, bool kAllowSubnormals = true> | 
|  | HWY_INLINE V Log(const D d, V x) { | 
|  | // http://git.musl-libc.org/cgit/musl/tree/src/math/log.c for more info. | 
|  | using T = TFromD<D>; | 
|  | impl::LogImpl<T> impl; | 
|  |  | 
|  | constexpr bool kIsF32 = (sizeof(T) == 4); | 
|  |  | 
|  | // Float Constants | 
|  | const V kLn2Hi = Set(d, kIsF32 ? static_cast<T>(0.69313812256f) | 
|  | : static_cast<T>(0.693147180369123816490)); | 
|  | const V kLn2Lo = Set(d, kIsF32 ? static_cast<T>(9.0580006145e-6f) | 
|  | : static_cast<T>(1.90821492927058770002e-10)); | 
|  | const V kOne = Set(d, static_cast<T>(+1.0)); | 
|  | const V kMinNormal = Set(d, kIsF32 ? static_cast<T>(1.175494351e-38f) | 
|  | : static_cast<T>(2.2250738585072014e-308)); | 
|  | const V kScale = Set(d, kIsF32 ? static_cast<T>(3.355443200e+7f) | 
|  | : static_cast<T>(1.8014398509481984e+16)); | 
|  |  | 
|  | // Integer Constants | 
|  | using TI = MakeSigned<T>; | 
|  | const Rebind<TI, D> di; | 
|  | using VI = decltype(Zero(di)); | 
|  | const VI kLowerBits = Set(di, kIsF32 ? static_cast<TI>(0x00000000L) | 
|  | : static_cast<TI>(0xFFFFFFFFLL)); | 
|  | const VI kMagic = Set(di, kIsF32 ? static_cast<TI>(0x3F3504F3L) | 
|  | : static_cast<TI>(0x3FE6A09E00000000LL)); | 
|  | const VI kExpMask = Set(di, kIsF32 ? static_cast<TI>(0x3F800000L) | 
|  | : static_cast<TI>(0x3FF0000000000000LL)); | 
|  | const VI kExpScale = | 
|  | Set(di, kIsF32 ? static_cast<TI>(-25) : static_cast<TI>(-54)); | 
|  | const VI kManMask = Set(di, kIsF32 ? static_cast<TI>(0x7FFFFFL) | 
|  | : static_cast<TI>(0xFFFFF00000000LL)); | 
|  |  | 
|  | // Scale up 'x' so that it is no longer denormalized. | 
|  | VI exp_bits; | 
|  | V exp; | 
|  | if (kAllowSubnormals == true) { | 
|  | const auto is_denormal = Lt(x, kMinNormal); | 
|  | x = IfThenElse(is_denormal, Mul(x, kScale), x); | 
|  |  | 
|  | // Compute the new exponent. | 
|  | exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic)); | 
|  | const VI exp_scale = | 
|  | BitCast(di, IfThenElseZero(is_denormal, BitCast(d, kExpScale))); | 
|  | exp = ConvertTo( | 
|  | d, Add(exp_scale, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits)))); | 
|  | } else { | 
|  | // Compute the new exponent. | 
|  | exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic)); | 
|  | exp = ConvertTo(d, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits))); | 
|  | } | 
|  |  | 
|  | // Renormalize. | 
|  | const V y = Or(And(x, BitCast(d, kLowerBits)), | 
|  | BitCast(d, Add(And(exp_bits, kManMask), kMagic))); | 
|  |  | 
|  | // Approximate and reconstruct. | 
|  | const V ym1 = Sub(y, kOne); | 
|  | const V z = Div(ym1, Add(y, kOne)); | 
|  |  | 
|  | return MulSub( | 
|  | exp, kLn2Hi, | 
|  | Sub(MulSub(z, Sub(ym1, impl.LogPoly(d, z)), Mul(exp, kLn2Lo)), ym1)); | 
|  | } | 
|  |  | 
|  | // SinCos | 
|  | // Based on "sse_mathfun.h", by Julien Pommier | 
|  | // http://gruntthepeon.free.fr/ssemath/ | 
|  |  | 
|  | // Third degree poly | 
|  | template <class D, class V> | 
|  | HWY_INLINE void SinCos3(D d, TFromD<D> dp1, TFromD<D> dp2, TFromD<D> dp3, V x, | 
|  | V& s, V& c) { | 
|  | using T = TFromD<D>; | 
|  | using TI = MakeSigned<T>; | 
|  | using DI = Rebind<TI, D>; | 
|  | const DI di; | 
|  | using VI = decltype(Zero(di)); | 
|  | using M = Mask<D>; | 
|  |  | 
|  | static constexpr size_t bits = sizeof(TI) * 8; | 
|  | const VI sign_mask = SignBit(di); | 
|  | const VI ci_0 = Zero(di); | 
|  | const VI ci_1 = Set(di, 1); | 
|  | const VI ci_2 = Set(di, 2); | 
|  | const VI ci_4 = Set(di, 4); | 
|  | const V cos_p0 = Set(d, ConvertScalarTo<T>(2.443315711809948E-005)); | 
|  | const V cos_p1 = Set(d, ConvertScalarTo<T>(-1.388731625493765E-003)); | 
|  | const V cos_p2 = Set(d, ConvertScalarTo<T>(4.166664568298827E-002)); | 
|  | const V sin_p0 = Set(d, ConvertScalarTo<T>(-1.9515295891E-4)); | 
|  | const V sin_p1 = Set(d, ConvertScalarTo<T>(8.3321608736E-3)); | 
|  | const V sin_p2 = Set(d, ConvertScalarTo<T>(-1.6666654611E-1)); | 
|  | const V FOPI = Set(d, ConvertScalarTo<T>(1.27323954473516));  // 4 / M_PI | 
|  | const V DP1 = Set(d, dp1); | 
|  | const V DP2 = Set(d, dp2); | 
|  | const V DP3 = Set(d, dp3); | 
|  |  | 
|  | V xmm1, xmm2, sign_bit_sin, y; | 
|  | VI imm0, imm2, imm4; | 
|  |  | 
|  | sign_bit_sin = x; | 
|  | x = Abs(x); | 
|  |  | 
|  | /* extract the sign bit (upper one) */ | 
|  | sign_bit_sin = And(sign_bit_sin, BitCast(d, sign_mask)); | 
|  |  | 
|  | /* scale by 4/Pi */ | 
|  | y = Mul(x, FOPI); | 
|  |  | 
|  | /* store the integer part of y in imm2 */ | 
|  | imm2 = ConvertTo(di, y); | 
|  |  | 
|  | /* j=(j+1) & (~1) (see the cephes sources) */ | 
|  | imm2 = Add(imm2, ci_1); | 
|  | imm2 = AndNot(ci_1, imm2); | 
|  |  | 
|  | y = ConvertTo(d, imm2); | 
|  | imm4 = imm2; | 
|  |  | 
|  | /* get the swap sign flag for the sine */ | 
|  | imm0 = And(imm2, ci_4); | 
|  | imm0 = ShiftLeft<bits - 3>(imm0); | 
|  |  | 
|  | V swap_sign_bit_sin = BitCast(d, imm0); | 
|  |  | 
|  | /* get the polynomial selection mask for the sine*/ | 
|  | imm2 = And(imm2, ci_2); | 
|  | M poly_mask = RebindMask(d, Eq(imm2, ci_0)); | 
|  |  | 
|  | /* The magic pass: "Extended precision modular arithmetic" | 
|  | x = ((x - y * DP1) - y * DP2) - y * DP3; */ | 
|  | x = MulAdd(y, DP1, x); | 
|  | x = MulAdd(y, DP2, x); | 
|  | x = MulAdd(y, DP3, x); | 
|  |  | 
|  | imm4 = Sub(imm4, ci_2); | 
|  | imm4 = AndNot(imm4, ci_4); | 
|  | imm4 = ShiftLeft<bits - 3>(imm4); | 
|  |  | 
|  | V sign_bit_cos = BitCast(d, imm4); | 
|  |  | 
|  | sign_bit_sin = Xor(sign_bit_sin, swap_sign_bit_sin); | 
|  |  | 
|  | /* Evaluate the first polynomial  (0 <= x <= Pi/4) */ | 
|  | V z = Mul(x, x); | 
|  |  | 
|  | y = MulAdd(cos_p0, z, cos_p1); | 
|  | y = MulAdd(y, z, cos_p2); | 
|  | y = Mul(y, z); | 
|  | y = Mul(y, z); | 
|  | y = NegMulAdd(z, Set(d, 0.5f), y); | 
|  | y = Add(y, Set(d, 1)); | 
|  |  | 
|  | /* Evaluate the second polynomial  (Pi/4 <= x <= 0) */ | 
|  | V y2 = MulAdd(sin_p0, z, sin_p1); | 
|  | y2 = MulAdd(y2, z, sin_p2); | 
|  | y2 = Mul(y2, z); | 
|  | y2 = MulAdd(y2, x, x); | 
|  |  | 
|  | /* select the correct result from the two polynomials */ | 
|  | xmm1 = IfThenElse(poly_mask, y2, y); | 
|  | xmm2 = IfThenElse(poly_mask, y, y2); | 
|  |  | 
|  | /* update the sign */ | 
|  | s = Xor(xmm1, sign_bit_sin); | 
|  | c = Xor(xmm2, sign_bit_cos); | 
|  | } | 
|  |  | 
|  | // Sixth degree poly | 
|  | template <class D, class V> | 
|  | HWY_INLINE void SinCos6(D d, TFromD<D> dp1, TFromD<D> dp2, TFromD<D> dp3, V x, | 
|  | V& s, V& c) { | 
|  | using T = TFromD<D>; | 
|  | using TI = MakeSigned<T>; | 
|  | using DI = Rebind<TI, D>; | 
|  | const DI di; | 
|  | using VI = decltype(Zero(di)); | 
|  | using M = Mask<D>; | 
|  |  | 
|  | static constexpr size_t bits = sizeof(TI) * 8; | 
|  | const VI sign_mask = SignBit(di); | 
|  | const VI ci_0 = Zero(di); | 
|  | const VI ci_1 = Set(di, 1); | 
|  | const VI ci_2 = Set(di, 2); | 
|  | const VI ci_4 = Set(di, 4); | 
|  | const V cos_p0 = Set(d, ConvertScalarTo<T>(-1.13585365213876817300E-11)); | 
|  | const V cos_p1 = Set(d, ConvertScalarTo<T>(2.08757008419747316778E-9)); | 
|  | const V cos_p2 = Set(d, ConvertScalarTo<T>(-2.75573141792967388112E-7)); | 
|  | const V cos_p3 = Set(d, ConvertScalarTo<T>(2.48015872888517045348E-5)); | 
|  | const V cos_p4 = Set(d, ConvertScalarTo<T>(-1.38888888888730564116E-3)); | 
|  | const V cos_p5 = Set(d, ConvertScalarTo<T>(4.16666666666665929218E-2)); | 
|  | const V sin_p0 = Set(d, ConvertScalarTo<T>(1.58962301576546568060E-10)); | 
|  | const V sin_p1 = Set(d, ConvertScalarTo<T>(-2.50507477628578072866E-8)); | 
|  | const V sin_p2 = Set(d, ConvertScalarTo<T>(2.75573136213857245213E-6)); | 
|  | const V sin_p3 = Set(d, ConvertScalarTo<T>(-1.98412698295895385996E-4)); | 
|  | const V sin_p4 = Set(d, ConvertScalarTo<T>(8.33333333332211858878E-3)); | 
|  | const V sin_p5 = Set(d, ConvertScalarTo<T>(-1.66666666666666307295E-1)); | 
|  | const V FOPI =  // 4 / M_PI | 
|  | Set(d, ConvertScalarTo<T>(1.2732395447351626861510701069801148)); | 
|  | const V DP1 = Set(d, dp1); | 
|  | const V DP2 = Set(d, dp2); | 
|  | const V DP3 = Set(d, dp3); | 
|  |  | 
|  | V xmm1, xmm2, sign_bit_sin, y; | 
|  | VI imm0, imm2, imm4; | 
|  |  | 
|  | sign_bit_sin = x; | 
|  | x = Abs(x); | 
|  |  | 
|  | /* extract the sign bit (upper one) */ | 
|  | sign_bit_sin = And(sign_bit_sin, BitCast(d, sign_mask)); | 
|  |  | 
|  | /* scale by 4/Pi */ | 
|  | y = Mul(x, FOPI); | 
|  |  | 
|  | /* store the integer part of y in imm2 */ | 
|  | imm2 = ConvertTo(di, y); | 
|  |  | 
|  | /* j=(j+1) & (~1) (see the cephes sources) */ | 
|  | imm2 = Add(imm2, ci_1); | 
|  | imm2 = AndNot(ci_1, imm2); | 
|  |  | 
|  | y = ConvertTo(d, imm2); | 
|  | imm4 = imm2; | 
|  |  | 
|  | /* get the swap sign flag for the sine */ | 
|  | imm0 = And(imm2, ci_4); | 
|  | imm0 = ShiftLeft<bits - 3>(imm0); | 
|  |  | 
|  | V swap_sign_bit_sin = BitCast(d, imm0); | 
|  |  | 
|  | /* get the polynomial selection mask for the sine*/ | 
|  | imm2 = And(imm2, ci_2); | 
|  | M poly_mask = RebindMask(d, Eq(imm2, ci_0)); | 
|  |  | 
|  | /* The magic pass: "Extended precision modular arithmetic" | 
|  | x = ((x - y * DP1) - y * DP2) - y * DP3; */ | 
|  | x = MulAdd(y, DP1, x); | 
|  | x = MulAdd(y, DP2, x); | 
|  | x = MulAdd(y, DP3, x); | 
|  |  | 
|  | imm4 = Sub(imm4, ci_2); | 
|  | imm4 = AndNot(imm4, ci_4); | 
|  | imm4 = ShiftLeft<bits - 3>(imm4); | 
|  |  | 
|  | V sign_bit_cos = BitCast(d, imm4); | 
|  | sign_bit_sin = Xor(sign_bit_sin, swap_sign_bit_sin); | 
|  |  | 
|  | /* Evaluate the first polynomial  (0 <= x <= Pi/4) */ | 
|  | V z = Mul(x, x); | 
|  |  | 
|  | y = MulAdd(cos_p0, z, cos_p1); | 
|  | y = MulAdd(y, z, cos_p2); | 
|  | y = MulAdd(y, z, cos_p3); | 
|  | y = MulAdd(y, z, cos_p4); | 
|  | y = MulAdd(y, z, cos_p5); | 
|  | y = Mul(y, z); | 
|  | y = Mul(y, z); | 
|  | y = NegMulAdd(z, Set(d, 0.5f), y); | 
|  | y = Add(y, Set(d, 1.0f)); | 
|  |  | 
|  | /* Evaluate the second polynomial  (Pi/4 <= x <= 0) */ | 
|  | V y2 = MulAdd(sin_p0, z, sin_p1); | 
|  | y2 = MulAdd(y2, z, sin_p2); | 
|  | y2 = MulAdd(y2, z, sin_p3); | 
|  | y2 = MulAdd(y2, z, sin_p4); | 
|  | y2 = MulAdd(y2, z, sin_p5); | 
|  | y2 = Mul(y2, z); | 
|  | y2 = MulAdd(y2, x, x); | 
|  |  | 
|  | /* select the correct result from the two polynomials */ | 
|  | xmm1 = IfThenElse(poly_mask, y2, y); | 
|  | xmm2 = IfThenElse(poly_mask, y, y2); | 
|  |  | 
|  | /* update the sign */ | 
|  | s = Xor(xmm1, sign_bit_sin); | 
|  | c = Xor(xmm2, sign_bit_cos); | 
|  | } | 
|  |  | 
|  | template <> | 
|  | struct SinCosImpl<float> { | 
|  | template <class D, class V> | 
|  | HWY_INLINE void SinCos(D d, V x, V& s, V& c) { | 
|  | SinCos3(d, -0.78515625f, -2.4187564849853515625e-4f, | 
|  | -3.77489497744594108e-8f, x, s, c); | 
|  | } | 
|  | }; | 
|  |  | 
|  | #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 | 
|  | template <> | 
|  | struct SinCosImpl<double> { | 
|  | template <class D, class V> | 
|  | HWY_INLINE void SinCos(D d, V x, V& s, V& c) { | 
|  | SinCos6(d, -7.85398125648498535156E-1, -3.77489470793079817668E-8, | 
|  | -2.69515142907905952645E-15, x, s, c); | 
|  | } | 
|  | }; | 
|  | #endif | 
|  |  | 
|  | }  // namespace impl | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Acos(const D d, V x) { | 
|  | using T = TFromD<D>; | 
|  |  | 
|  | const V kZero = Zero(d); | 
|  | const V kHalf = Set(d, static_cast<T>(+0.5)); | 
|  | const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264)); | 
|  | const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169)); | 
|  |  | 
|  | const V sign_x = And(SignBit(d), x); | 
|  | const V abs_x = Xor(x, sign_x); | 
|  | const auto mask = Lt(abs_x, kHalf); | 
|  | const V yy = | 
|  | IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf)); | 
|  | const V y = IfThenElse(mask, abs_x, Sqrt(yy)); | 
|  |  | 
|  | impl::AsinImpl<T> impl; | 
|  | const V t = Mul(impl.AsinPoly(d, yy, y), Mul(y, yy)); | 
|  |  | 
|  | const V t_plus_y = Add(t, y); | 
|  | const V z = | 
|  | IfThenElse(mask, Sub(kPiOverTwo, Add(Xor(y, sign_x), Xor(t, sign_x))), | 
|  | Add(t_plus_y, t_plus_y)); | 
|  | return IfThenElse(Or(mask, Ge(x, kZero)), z, Sub(kPi, z)); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Acosh(const D d, V x) { | 
|  | using T = TFromD<D>; | 
|  |  | 
|  | const V kLarge = Set(d, static_cast<T>(268435456.0)); | 
|  | const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227)); | 
|  | const V kOne = Set(d, static_cast<T>(+1.0)); | 
|  | const V kTwo = Set(d, static_cast<T>(+2.0)); | 
|  |  | 
|  | const auto is_x_large = Gt(x, kLarge); | 
|  | const auto is_x_gt_2 = Gt(x, kTwo); | 
|  |  | 
|  | const V x_minus_1 = Sub(x, kOne); | 
|  | const V y0 = MulSub(kTwo, x, Div(kOne, Add(Sqrt(MulSub(x, x, kOne)), x))); | 
|  | const V y1 = | 
|  | Add(Sqrt(MulAdd(x_minus_1, kTwo, Mul(x_minus_1, x_minus_1))), x_minus_1); | 
|  | const V y2 = | 
|  | IfThenElse(is_x_gt_2, IfThenElse(is_x_large, x, y0), Add(y1, kOne)); | 
|  | const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2); | 
|  |  | 
|  | const auto is_pole = Eq(y2, kOne); | 
|  | const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne); | 
|  | return Add(IfThenElse(is_x_gt_2, z, | 
|  | IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor))), | 
|  | IfThenElseZero(is_x_large, kLog2)); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Asin(const D d, V x) { | 
|  | using T = TFromD<D>; | 
|  |  | 
|  | const V kHalf = Set(d, static_cast<T>(+0.5)); | 
|  | const V kTwo = Set(d, static_cast<T>(+2.0)); | 
|  | const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169)); | 
|  |  | 
|  | const V sign_x = And(SignBit(d), x); | 
|  | const V abs_x = Xor(x, sign_x); | 
|  | const auto mask = Lt(abs_x, kHalf); | 
|  | const V yy = | 
|  | IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf)); | 
|  | const V y = IfThenElse(mask, abs_x, Sqrt(yy)); | 
|  |  | 
|  | impl::AsinImpl<T> impl; | 
|  | const V z0 = MulAdd(impl.AsinPoly(d, yy, y), Mul(yy, y), y); | 
|  | const V z1 = NegMulAdd(z0, kTwo, kPiOverTwo); | 
|  | return Or(IfThenElse(mask, z0, z1), sign_x); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Asinh(const D d, V x) { | 
|  | using T = TFromD<D>; | 
|  |  | 
|  | const V kSmall = Set(d, static_cast<T>(1.0 / 268435456.0)); | 
|  | const V kLarge = Set(d, static_cast<T>(268435456.0)); | 
|  | const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227)); | 
|  | const V kOne = Set(d, static_cast<T>(+1.0)); | 
|  | const V kTwo = Set(d, static_cast<T>(+2.0)); | 
|  |  | 
|  | const V sign_x = And(SignBit(d), x);  // Extract the sign bit | 
|  | const V abs_x = Xor(x, sign_x); | 
|  |  | 
|  | const auto is_x_large = Gt(abs_x, kLarge); | 
|  | const auto is_x_lt_2 = Lt(abs_x, kTwo); | 
|  |  | 
|  | const V x2 = Mul(x, x); | 
|  | const V sqrt_x2_plus_1 = Sqrt(Add(x2, kOne)); | 
|  |  | 
|  | const V y0 = MulAdd(abs_x, kTwo, Div(kOne, Add(sqrt_x2_plus_1, abs_x))); | 
|  | const V y1 = Add(Div(x2, Add(sqrt_x2_plus_1, kOne)), abs_x); | 
|  | const V y2 = | 
|  | IfThenElse(is_x_lt_2, Add(y1, kOne), IfThenElse(is_x_large, abs_x, y0)); | 
|  | const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2); | 
|  |  | 
|  | const auto is_pole = Eq(y2, kOne); | 
|  | const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne); | 
|  | const auto large = IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor)); | 
|  | const V y = IfThenElse(Lt(abs_x, kSmall), x, large); | 
|  | return Or(Add(IfThenElse(is_x_lt_2, y, z), IfThenElseZero(is_x_large, kLog2)), | 
|  | sign_x); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Atan(const D d, V x) { | 
|  | using T = TFromD<D>; | 
|  |  | 
|  | const V kOne = Set(d, static_cast<T>(+1.0)); | 
|  | const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169)); | 
|  |  | 
|  | const V sign = And(SignBit(d), x); | 
|  | const V abs_x = Xor(x, sign); | 
|  | const auto mask = Gt(abs_x, kOne); | 
|  |  | 
|  | impl::AtanImpl<T> impl; | 
|  | const auto divisor = IfThenElse(mask, abs_x, kOne); | 
|  | const V y = impl.AtanPoly(d, IfThenElse(mask, Div(kOne, divisor), abs_x)); | 
|  | return Or(IfThenElse(mask, Sub(kPiOverTwo, y), y), sign); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Atanh(const D d, V x) { | 
|  | using T = TFromD<D>; | 
|  |  | 
|  | const V kHalf = Set(d, static_cast<T>(+0.5)); | 
|  | const V kOne = Set(d, static_cast<T>(+1.0)); | 
|  |  | 
|  | const V sign = And(SignBit(d), x);  // Extract the sign bit | 
|  | const V abs_x = Xor(x, sign); | 
|  | return Mul(Log1p(d, Div(Add(abs_x, abs_x), Sub(kOne, abs_x))), | 
|  | Xor(kHalf, sign)); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Cos(const D d, V x) { | 
|  | using T = TFromD<D>; | 
|  | impl::CosSinImpl<T> impl; | 
|  |  | 
|  | // Float Constants | 
|  | const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153)); | 
|  |  | 
|  | // Integer Constants | 
|  | const Rebind<int32_t, D> di32; | 
|  | using VI32 = decltype(Zero(di32)); | 
|  | const VI32 kOne = Set(di32, 1); | 
|  |  | 
|  | const V y = Abs(x);  // cos(x) == cos(|x|) | 
|  |  | 
|  | // Compute the quadrant, q = int(|x| / pi) * 2 + 1 | 
|  | const VI32 q = Add(ShiftLeft<1>(impl.ToInt32(d, Mul(y, kOneOverPi))), kOne); | 
|  |  | 
|  | // Reduce range, apply sign, and approximate. | 
|  | return impl.Poly( | 
|  | d, Xor(impl.CosReduce(d, y, q), impl.CosSignFromQuadrant(d, q))); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Exp(const D d, V x) { | 
|  | using T = TFromD<D>; | 
|  |  | 
|  | const V kHalf = Set(d, static_cast<T>(+0.5)); | 
|  | const V kLowerBound = | 
|  | Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0))); | 
|  | const V kNegZero = Set(d, static_cast<T>(-0.0)); | 
|  | const V kOne = Set(d, static_cast<T>(+1.0)); | 
|  | const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681)); | 
|  |  | 
|  | impl::ExpImpl<T> impl; | 
|  |  | 
|  | // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5)) | 
|  | const auto q = | 
|  | impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero)))); | 
|  |  | 
|  | // Reduce, approximate, and then reconstruct. | 
|  | const V y = impl.LoadExpShortRange( | 
|  | d, Add(impl.ExpPoly(d, impl.ExpReduce(d, x, q)), kOne), q); | 
|  | return IfThenElseZero(Ge(x, kLowerBound), y); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Exp2(const D d, V x) { | 
|  | using T = TFromD<D>; | 
|  |  | 
|  | const V kLowerBound = | 
|  | Set(d, static_cast<T>((sizeof(T) == 4 ? -150.0 : -1075.0))); | 
|  | const V kOne = Set(d, static_cast<T>(+1.0)); | 
|  |  | 
|  | impl::ExpImpl<T> impl; | 
|  |  | 
|  | // q = static_cast<int32_t>(std::lrint(x)) | 
|  | const auto q = impl.ToNearestInt32(d, x); | 
|  |  | 
|  | // Reduce, approximate, and then reconstruct. | 
|  | const V y = impl.LoadExpShortRange( | 
|  | d, Add(impl.ExpPoly(d, impl.Exp2Reduce(d, x, q)), kOne), q); | 
|  | return IfThenElseZero(Ge(x, kLowerBound), y); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Expm1(const D d, V x) { | 
|  | using T = TFromD<D>; | 
|  |  | 
|  | const V kHalf = Set(d, static_cast<T>(+0.5)); | 
|  | const V kLowerBound = | 
|  | Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0))); | 
|  | const V kLn2Over2 = Set(d, static_cast<T>(+0.346573590279972654708616)); | 
|  | const V kNegOne = Set(d, static_cast<T>(-1.0)); | 
|  | const V kNegZero = Set(d, static_cast<T>(-0.0)); | 
|  | const V kOne = Set(d, static_cast<T>(+1.0)); | 
|  | const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681)); | 
|  |  | 
|  | impl::ExpImpl<T> impl; | 
|  |  | 
|  | // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5)) | 
|  | const auto q = | 
|  | impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero)))); | 
|  |  | 
|  | // Reduce, approximate, and then reconstruct. | 
|  | const V y = impl.ExpPoly(d, impl.ExpReduce(d, x, q)); | 
|  | const V z = IfThenElse(Lt(Abs(x), kLn2Over2), y, | 
|  | Sub(impl.LoadExpShortRange(d, Add(y, kOne), q), kOne)); | 
|  | return IfThenElse(Lt(x, kLowerBound), kNegOne, z); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Log(const D d, V x) { | 
|  | return impl::Log<D, V, /*kAllowSubnormals=*/true>(d, x); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Log10(const D d, V x) { | 
|  | using T = TFromD<D>; | 
|  | return Mul(Log(d, x), Set(d, static_cast<T>(0.4342944819032518276511))); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Log1p(const D d, V x) { | 
|  | using T = TFromD<D>; | 
|  | const V kOne = Set(d, static_cast<T>(+1.0)); | 
|  |  | 
|  | const V y = Add(x, kOne); | 
|  | const auto is_pole = Eq(y, kOne); | 
|  | const auto divisor = Sub(IfThenZeroElse(is_pole, y), kOne); | 
|  | const auto non_pole = | 
|  | Mul(impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y), Div(x, divisor)); | 
|  | return IfThenElse(is_pole, x, non_pole); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Log2(const D d, V x) { | 
|  | using T = TFromD<D>; | 
|  | return Mul(Log(d, x), Set(d, static_cast<T>(1.44269504088896340735992))); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Sin(const D d, V x) { | 
|  | using T = TFromD<D>; | 
|  | impl::CosSinImpl<T> impl; | 
|  |  | 
|  | // Float Constants | 
|  | const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153)); | 
|  | const V kHalf = Set(d, static_cast<T>(0.5)); | 
|  |  | 
|  | // Integer Constants | 
|  | const Rebind<int32_t, D> di32; | 
|  | using VI32 = decltype(Zero(di32)); | 
|  |  | 
|  | const V abs_x = Abs(x); | 
|  | const V sign_x = Xor(abs_x, x); | 
|  |  | 
|  | // Compute the quadrant, q = int((|x| / pi) + 0.5) | 
|  | const VI32 q = impl.ToInt32(d, MulAdd(abs_x, kOneOverPi, kHalf)); | 
|  |  | 
|  | // Reduce range, apply sign, and approximate. | 
|  | return impl.Poly(d, Xor(impl.SinReduce(d, abs_x, q), | 
|  | Xor(impl.SinSignFromQuadrant(d, q), sign_x))); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Sinh(const D d, V x) { | 
|  | using T = TFromD<D>; | 
|  | const V kHalf = Set(d, static_cast<T>(+0.5)); | 
|  | const V kOne = Set(d, static_cast<T>(+1.0)); | 
|  | const V kTwo = Set(d, static_cast<T>(+2.0)); | 
|  |  | 
|  | const V sign = And(SignBit(d), x);  // Extract the sign bit | 
|  | const V abs_x = Xor(x, sign); | 
|  | const V y = Expm1(d, abs_x); | 
|  | const V z = Mul(Div(Add(y, kTwo), Add(y, kOne)), Mul(y, kHalf)); | 
|  | return Xor(z, sign);  // Reapply the sign bit | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Tanh(const D d, V x) { | 
|  | using T = TFromD<D>; | 
|  | const V kLimit = Set(d, static_cast<T>(18.714973875)); | 
|  | const V kOne = Set(d, static_cast<T>(+1.0)); | 
|  | const V kTwo = Set(d, static_cast<T>(+2.0)); | 
|  |  | 
|  | const V sign = And(SignBit(d), x);  // Extract the sign bit | 
|  | const V abs_x = Xor(x, sign); | 
|  | const V y = Expm1(d, Mul(abs_x, kTwo)); | 
|  | const V z = IfThenElse(Gt(abs_x, kLimit), kOne, Div(y, Add(y, kTwo))); | 
|  | return Xor(z, sign);  // Reapply the sign bit | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE void SinCos(const D d, V x, V& s, V& c) { | 
|  | using T = TFromD<D>; | 
|  | impl::SinCosImpl<T> impl; | 
|  | impl.SinCos(d, x, s, c); | 
|  | } | 
|  |  | 
|  | template <class D, class V> | 
|  | HWY_INLINE V Hypot(const D d, V a, V b) { | 
|  | using T = TFromD<D>; | 
|  | using TI = MakeSigned<T>; | 
|  | const RebindToUnsigned<decltype(d)> du; | 
|  | const RebindToSigned<decltype(d)> di; | 
|  | using VI = VFromD<decltype(di)>; | 
|  |  | 
|  | constexpr int kMaxBiasedExp = static_cast<int>(MaxExponentField<T>()); | 
|  | static_assert(kMaxBiasedExp > 0, "kMaxBiasedExp > 0 must be true"); | 
|  |  | 
|  | constexpr int kNumOfMantBits = MantissaBits<T>(); | 
|  | static_assert(kNumOfMantBits > 0, "kNumOfMantBits > 0 must be true"); | 
|  |  | 
|  | constexpr int kExpBias = kMaxBiasedExp / 2; | 
|  |  | 
|  | static_assert( | 
|  | static_cast<unsigned>(kExpBias) + static_cast<unsigned>(kNumOfMantBits) < | 
|  | static_cast<unsigned>(kMaxBiasedExp), | 
|  | "kExpBias + kNumOfMantBits < kMaxBiasedExp must be true"); | 
|  |  | 
|  | // kMinValToSquareBiasedExp is the smallest biased exponent such that | 
|  | // pow(pow(2, kMinValToSquareBiasedExp - kExpBias) * x, 2) is either a normal | 
|  | // floating-point value or infinity if x is a non-zero, non-NaN value | 
|  | constexpr int kMinValToSquareBiasedExp = (kExpBias / 2) + kNumOfMantBits; | 
|  | static_assert(kMinValToSquareBiasedExp < kExpBias, | 
|  | "kMinValToSquareBiasedExp < kExpBias must be true"); | 
|  |  | 
|  | // kMaxValToSquareBiasedExp is the largest biased exponent such that | 
|  | // pow(pow(2, kMaxValToSquareBiasedExp - kExpBias) * x, 2) * 2 is guaranteed | 
|  | // to be a finite value if x is a finite value | 
|  | constexpr int kMaxValToSquareBiasedExp = kExpBias + ((kExpBias / 2) - 1); | 
|  | static_assert(kMaxValToSquareBiasedExp > kExpBias, | 
|  | "kMaxValToSquareBiasedExp > kExpBias must be true"); | 
|  | static_assert(kMaxValToSquareBiasedExp < kMaxBiasedExp, | 
|  | "kMaxValToSquareBiasedExp < kMaxBiasedExp must be true"); | 
|  |  | 
|  | #if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128 || \ | 
|  | HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15 | 
|  | using TExpSatSub = MakeUnsigned<T>; | 
|  | using TExpMinMax = TI; | 
|  | #else | 
|  | using TExpSatSub = uint16_t; | 
|  | using TExpMinMax = int16_t; | 
|  | #endif | 
|  |  | 
|  | const Repartition<TExpSatSub, decltype(d)> d_exp_sat_sub; | 
|  | const Repartition<TExpMinMax, decltype(d)> d_exp_min_max; | 
|  |  | 
|  | const V abs_a = Abs(a); | 
|  | const V abs_b = Abs(b); | 
|  |  | 
|  | const MFromD<D> either_inf = Or(IsInf(a), IsInf(b)); | 
|  |  | 
|  | const VI zero = Zero(di); | 
|  |  | 
|  | // exp_a[i] is the biased exponent of abs_a[i] | 
|  | const VI exp_a = BitCast(di, ShiftRight<kNumOfMantBits>(BitCast(du, abs_a))); | 
|  |  | 
|  | // exp_b[i] is the biased exponent of abs_b[i] | 
|  | const VI exp_b = BitCast(di, ShiftRight<kNumOfMantBits>(BitCast(du, abs_b))); | 
|  |  | 
|  | // max_exp[i] is equal to HWY_MAX(exp_a[i], exp_b[i]) | 
|  |  | 
|  | // If abs_a[i] and abs_b[i] are both NaN values, max_exp[i] will be equal to | 
|  | // the biased exponent of the larger value. Otherwise, if either abs_a[i] or | 
|  | // abs_b[i] is NaN, max_exp[i] will be equal to kMaxBiasedExp. | 
|  | const VI max_exp = BitCast( | 
|  | di, Max(BitCast(d_exp_min_max, exp_a), BitCast(d_exp_min_max, exp_b))); | 
|  |  | 
|  | // If either abs_a[i] or abs_b[i] is zero, min_exp[i] is equal to max_exp[i]. | 
|  | // Otherwise, if abs_a[i] and abs_b[i] are both nonzero, min_exp[i] is equal | 
|  | // to HWY_MIN(exp_a[i], exp_b[i]). | 
|  | const VI min_exp = IfThenElse( | 
|  | Or(Eq(BitCast(di, abs_a), zero), Eq(BitCast(di, abs_b), zero)), max_exp, | 
|  | BitCast(di, Min(BitCast(d_exp_min_max, exp_a), | 
|  | BitCast(d_exp_min_max, exp_b)))); | 
|  |  | 
|  | // scl_pow2[i] is the power of 2 to scale abs_a[i] and abs_b[i] by | 
|  |  | 
|  | // abs_a[i] and abs_b[i] should be scaled by a factor that is greater than | 
|  | // zero but less than or equal to | 
|  | // pow(2, kMaxValToSquareBiasedExp - max_exp[i]) to ensure that that the | 
|  | // multiplications or addition operations do not overflow if | 
|  | // std::hypot(abs_a[i], abs_b[i]) is finite | 
|  |  | 
|  | // If either abs_a[i] or abs_b[i] is a a positive value that is less than | 
|  | // pow(2, kMinValToSquareBiasedExp - kExpBias), then scaling up abs_a[i] and | 
|  | // abs_b[i] by pow(2, kMinValToSquareBiasedExp - min_exp[i]) will ensure that | 
|  | // the multiplications and additions result in normal floating point values, | 
|  | // infinities, or NaNs. | 
|  |  | 
|  | // If HWY_MAX(kMinValToSquareBiasedExp - min_exp[i], 0) is greater than | 
|  | // kMaxValToSquareBiasedExp - max_exp[i], scale abs_a[i] and abs_b[i] up by | 
|  | // pow(2, kMaxValToSquareBiasedExp - max_exp[i]) to ensure that the | 
|  | // multiplication and addition operations result in a finite result if | 
|  | // std::hypot(abs_a[i], abs_b[i]) is finite. | 
|  |  | 
|  | const VI scl_pow2 = BitCast( | 
|  | di, | 
|  | Min(BitCast(d_exp_min_max, | 
|  | SaturatedSub(BitCast(d_exp_sat_sub, | 
|  | Set(di, static_cast<TI>( | 
|  | kMinValToSquareBiasedExp))), | 
|  | BitCast(d_exp_sat_sub, min_exp))), | 
|  | BitCast(d_exp_min_max, | 
|  | Sub(Set(di, static_cast<TI>(kMaxValToSquareBiasedExp)), | 
|  | max_exp)))); | 
|  |  | 
|  | const VI exp_bias = Set(di, static_cast<TI>(kExpBias)); | 
|  |  | 
|  | const V ab_scl_factor = | 
|  | BitCast(d, ShiftLeft<kNumOfMantBits>(Add(exp_bias, scl_pow2))); | 
|  | const V hypot_scl_factor = | 
|  | BitCast(d, ShiftLeft<kNumOfMantBits>(Sub(exp_bias, scl_pow2))); | 
|  |  | 
|  | const V scl_a = Mul(abs_a, ab_scl_factor); | 
|  | const V scl_b = Mul(abs_b, ab_scl_factor); | 
|  |  | 
|  | const V scl_hypot = Sqrt(MulAdd(scl_a, scl_a, Mul(scl_b, scl_b))); | 
|  | // std::hypot returns inf if one input is +/- inf, even if the other is NaN. | 
|  | return IfThenElse(either_inf, Inf(d), Mul(scl_hypot, hypot_scl_factor)); | 
|  | } | 
|  |  | 
|  | // NOLINTNEXTLINE(google-readability-namespace-comments) | 
|  | }  // namespace HWY_NAMESPACE | 
|  | }  // namespace hwy | 
|  | HWY_AFTER_NAMESPACE(); | 
|  |  | 
|  | #endif  // HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_ |