/*
 * Original implementation written in 2019
 * by David Blackman and Sebastiano Vigna (vigna@acm.org).
 * Available at https://prng.di.unimi.it/ under the Creative Commons CC0
 * public domain dedication:
 * To the extent possible under law, the author has dedicated all copyright
 * and related and neighboring rights to this software to the public domain
 * worldwide. This software is distributed without any warranty.
 * See <http://creativecommons.org/publicdomain/zero/1.0/>.
 *
 * This file is a vector (SIMD) port of the original implementation,
 * written by Marco Barbone (m.barbone19@imperial.ac.uk).
 * I take no credit for the original implementation.
 * The code is provided as is and the original license applies.
 */

#if defined(HIGHWAY_HWY_CONTRIB_RANDOM_RANDOM_H_) == \
    defined(HWY_TARGET_TOGGLE)  // NOLINT
#ifdef HIGHWAY_HWY_CONTRIB_RANDOM_RANDOM_H_
#undef HIGHWAY_HWY_CONTRIB_RANDOM_RANDOM_H_
#else
#define HIGHWAY_HWY_CONTRIB_RANDOM_RANDOM_H_
#endif

#include <array>
#include <cstdint>
#include <limits>

#include "third_party/highway/hwy/aligned_allocator.h"
#include "third_party/highway/hwy/highway.h"

HWY_BEFORE_NAMESPACE();  // required if not using HWY_ATTR

namespace hwy {

namespace HWY_NAMESPACE {  // required: unique per target
namespace internal {

namespace {
#if HWY_HAVE_FLOAT64
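// kMulConst equals 2^-53 exactly; multiplying the top 53 random bits by it
// maps them to a double in [0, 1).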
// Hexadecimal floating-point literals require C++17 (__cpp_hex_float).
#if defined(__cpp_hex_float) && __cpp_hex_float >= 201603L
constexpr double kMulConst = 0x1.0p-53;
#else
constexpr double kMulConst =
    0.00000000000000011102230246251565404236316680908203125;
#endif  // __cpp_hex_float

#endif  // HWY_HAVE_FLOAT64

constexpr std::uint64_t kJump[] = {0x180ec6d33cfd0aba, 0xd5a61266f0c9392c,
                                   0xa9582618e03fc9aa, 0x39abdc4529b1661c};

constexpr std::uint64_t kLongJump[] = {0x76e15d3efefdcbbf, 0xc5004e441c522fb3,
                                       0x77710069854ee241, 0x39109bb02acbe635};
}  // namespace

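// SplitMix64 (Vigna's splitmix64 generator) expands a single 64-bit seed into
// the 256-bit xoshiro state; successive calls return well-mixed 64-bit values.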
class SplitMix64 {
 public:
  constexpr explicit SplitMix64(const std::uint64_t state) noexcept
      : state_(state) {}

  HWY_CXX14_CONSTEXPR std::uint64_t operator()() {
    std::uint64_t z = (state_ += 0x9e3779b97f4a7c15);
    z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
    z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
    return z ^ (z >> 31);
  }

 private:
  std::uint64_t state_;
};

class Xoshiro {
 public:
  HWY_CXX14_CONSTEXPR explicit Xoshiro(const std::uint64_t seed) noexcept
      : state_{} {
    SplitMix64 splitMix64{seed};
    for (auto &element : state_) {
      element = splitMix64();
    }
  }

  HWY_CXX14_CONSTEXPR explicit Xoshiro(const std::uint64_t seed,
                                       const std::uint64_t thread_id) noexcept
      : Xoshiro(seed) {
    for (auto i = UINT64_C(0); i < thread_id; ++i) {
      Jump();
    }
  }

  HWY_CXX14_CONSTEXPR std::uint64_t operator()() noexcept { return Next(); }

#if HWY_HAVE_FLOAT64
  HWY_CXX14_CONSTEXPR double Uniform() noexcept {
    return static_cast<double>(Next() >> 11) * kMulConst;
  }
#endif

  HWY_CXX14_CONSTEXPR std::array<std::uint64_t, 4> GetState() const {
    return {state_[0], state_[1], state_[2], state_[3]};
  }

  HWY_CXX17_CONSTEXPR void SetState(
      std::array<std::uint64_t, 4> state) noexcept {
    state_[0] = state[0];
    state_[1] = state[1];
    state_[2] = state[2];
    state_[3] = state[3];
  }

  static constexpr std::uint64_t StateSize() noexcept { return 4; }

  /* This is the jump function for the generator. It is equivalent to 2^128
   * calls to next(); it can be used to generate 2^128 non-overlapping
   * subsequences for parallel computations. */
  HWY_CXX14_CONSTEXPR void Jump() noexcept { Jump(kJump); }

  /* This is the long-jump function for the generator. It is equivalent to
   * 2^192 calls to next(); it can be used to generate 2^64 starting points,
   * from each of which jump() will generate 2^64 non-overlapping subsequences
   * for parallel distributed computations. */
  HWY_CXX14_CONSTEXPR void LongJump() noexcept { Jump(kLongJump); }
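
  // Illustrative sketch (hypothetical `seed` and `worker_index`): to give
  // each worker its own non-overlapping subsequence, apply Jump() once per
  // worker, as the (seed, thread_id) constructor above already does:
  //   internal::Xoshiro rng{seed};
  //   for (std::uint64_t t = 0; t < worker_index; ++t) rng.Jump();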

 private:
  std::uint64_t state_[4];

  static constexpr std::uint64_t Rotl(const std::uint64_t x, int k) noexcept {
    return (x << k) | (x >> (64 - k));
  }

  HWY_CXX14_CONSTEXPR std::uint64_t Next() noexcept {
    const std::uint64_t result = Rotl(state_[0] + state_[3], 23) + state_[0];
    const std::uint64_t t = state_[1] << 17;

    state_[2] ^= state_[0];
    state_[3] ^= state_[1];
    state_[1] ^= state_[2];
    state_[0] ^= state_[3];

    state_[2] ^= t;

    state_[3] = Rotl(state_[3], 45);

    return result;
  }

  HWY_CXX14_CONSTEXPR void Jump(const std::uint64_t (&jumpArray)[4]) noexcept {
    std::uint64_t s0 = 0;
    std::uint64_t s1 = 0;
    std::uint64_t s2 = 0;
    std::uint64_t s3 = 0;

    for (const std::uint64_t i : jumpArray) {
      for (std::uint_fast8_t b = 0; b < 64; b++) {
        if (i & (std::uint64_t{1UL} << b)) {
          s0 ^= state_[0];
          s1 ^= state_[1];
          s2 ^= state_[2];
          s3 ^= state_[3];
        }
        Next();
      }
    }

    state_[0] = s0;
    state_[1] = s1;
    state_[2] = s2;
    state_[3] = s3;
  }
};

}  // namespace internal

class VectorXoshiro {
 private:
  using VU64 = Vec<ScalableTag<std::uint64_t>>;
  using StateType = AlignedNDArray<std::uint64_t, 2>;
#if HWY_HAVE_FLOAT64
  using VF64 = Vec<ScalableTag<double>>;
#endif
 public:
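  // Runs one independent xoshiro256++ stream per SIMD lane. The constructor
  // below seeds a scalar generator, applies LongJump() threadNumber times
  // (so different threads draw from blocks 2^192 apart) and then separates
  // the lanes by Jump(), i.e. by 2^128 steps each.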
  explicit VectorXoshiro(const std::uint64_t seed,
                         const std::uint64_t threadNumber = 0)
      : state_{{internal::Xoshiro::StateSize(),
                Lanes(ScalableTag<std::uint64_t>{})}},
        streams{state_.shape().back()} {
    internal::Xoshiro xoshiro{seed};

    for (std::uint64_t i = 0; i < threadNumber; ++i) {
      xoshiro.LongJump();
    }

    for (size_t i = 0UL; i < streams; ++i) {
      const auto state = xoshiro.GetState();
      for (size_t j = 0UL; j < internal::Xoshiro::StateSize(); ++j) {
        state_[{j}][i] = state[j];
      }
      xoshiro.Jump();
    }
  }

  HWY_INLINE VU64 operator()() noexcept { return Next(); }

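  // Returns an aligned buffer of n random uint64 values. The loop below
  // stores full vectors, so n should be a multiple of
  // Lanes(ScalableTag<std::uint64_t>{}); otherwise the final Store writes
  // past index n.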
  AlignedVector<std::uint64_t> operator()(const std::size_t n) {
    AlignedVector<std::uint64_t> result(n);
    const ScalableTag<std::uint64_t> tag{};
    auto s0 = Load(tag, state_[{0}].data());
    auto s1 = Load(tag, state_[{1}].data());
    auto s2 = Load(tag, state_[{2}].data());
    auto s3 = Load(tag, state_[{3}].data());
    for (std::uint64_t i = 0; i < n; i += Lanes(tag)) {
      const auto next = Update(s0, s1, s2, s3);
      Store(next, tag, result.data() + i);
    }
    Store(s0, tag, state_[{0}].data());
    Store(s1, tag, state_[{1}].data());
    Store(s2, tag, state_[{2}].data());
    Store(s3, tag, state_[{3}].data());
    return result;
  }

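  // Compile-time-sized variant of the above; N should likewise be a multiple
  // of the lane count.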
  template <std::uint64_t N>
  std::array<std::uint64_t, N> operator()() noexcept {
    alignas(HWY_ALIGNMENT) std::array<std::uint64_t, N> result;
    const ScalableTag<std::uint64_t> tag{};
    auto s0 = Load(tag, state_[{0}].data());
    auto s1 = Load(tag, state_[{1}].data());
    auto s2 = Load(tag, state_[{2}].data());
    auto s3 = Load(tag, state_[{3}].data());
    for (std::uint64_t i = 0; i < N; i += Lanes(tag)) {
      const auto next = Update(s0, s1, s2, s3);
      Store(next, tag, result.data() + i);
    }
    Store(s0, tag, state_[{0}].data());
    Store(s1, tag, state_[{1}].data());
    Store(s2, tag, state_[{2}].data());
    Store(s3, tag, state_[{3}].data());
    return result;
  }

  std::uint64_t StateSize() const noexcept {
    return streams * internal::Xoshiro::StateSize();
  }

  const StateType &GetState() const { return state_; }

#if HWY_HAVE_FLOAT64

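  // Converts random bits to doubles in [0, 1): keep the top 53 bits
  // (shift right by 11), convert to double and scale by kMulConst = 2^-53.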
  HWY_INLINE VF64 Uniform() noexcept {
    const ScalableTag<double> real_tag{};
    const auto MUL_VALUE = Set(real_tag, internal::kMulConst);
    const auto bits = ShiftRight<11>(Next());
    const auto real = ConvertTo(real_tag, bits);
    return Mul(real, MUL_VALUE);
  }

  AlignedVector<double> Uniform(const std::size_t n) {
    AlignedVector<double> result(n);
    const ScalableTag<std::uint64_t> tag{};
    const ScalableTag<double> real_tag{};
    const auto MUL_VALUE = Set(real_tag, internal::kMulConst);

    auto s0 = Load(tag, state_[{0}].data());
    auto s1 = Load(tag, state_[{1}].data());
    auto s2 = Load(tag, state_[{2}].data());
    auto s3 = Load(tag, state_[{3}].data());

    for (std::uint64_t i = 0; i < n; i += Lanes(real_tag)) {
      const auto next = Update(s0, s1, s2, s3);
      const auto bits = ShiftRight<11>(next);
      const auto real = ConvertTo(real_tag, bits);
      const auto uniform = Mul(real, MUL_VALUE);
      Store(uniform, real_tag, result.data() + i);
    }

    Store(s0, tag, state_[{0}].data());
    Store(s1, tag, state_[{1}].data());
    Store(s2, tag, state_[{2}].data());
    Store(s3, tag, state_[{3}].data());
    return result;
  }

  template <std::uint64_t N>
  std::array<double, N> Uniform() noexcept {
    alignas(HWY_ALIGNMENT) std::array<double, N> result;
    const ScalableTag<std::uint64_t> tag{};
    const ScalableTag<double> real_tag{};
    const auto MUL_VALUE = Set(real_tag, internal::kMulConst);

    auto s0 = Load(tag, state_[{0}].data());
    auto s1 = Load(tag, state_[{1}].data());
    auto s2 = Load(tag, state_[{2}].data());
    auto s3 = Load(tag, state_[{3}].data());

    for (std::uint64_t i = 0; i < N; i += Lanes(real_tag)) {
      const auto next = Update(s0, s1, s2, s3);
      const auto bits = ShiftRight<11>(next);
      const auto real = ConvertTo(real_tag, bits);
      const auto uniform = Mul(real, MUL_VALUE);
      Store(uniform, real_tag, result.data() + i);
    }

    Store(s0, tag, state_[{0}].data());
    Store(s1, tag, state_[{1}].data());
    Store(s2, tag, state_[{2}].data());
    Store(s3, tag, state_[{3}].data());
    return result;
  }

#endif

 private:
  StateType state_;
  const std::uint64_t streams;

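  // One vectorized xoshiro256++ step, applied to all lanes at once. Note that
  // RotateRight<41> is a left rotation by 23 and RotateRight<19> is a left
  // rotation by 45, matching the scalar Next() above.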
  HWY_INLINE static VU64 Update(VU64 &s0, VU64 &s1, VU64 &s2,
                                VU64 &s3) noexcept {
    const auto result = Add(RotateRight<41>(Add(s0, s3)), s0);
    const auto t = ShiftLeft<17>(s1);
    s2 = Xor(s2, s0);
    s3 = Xor(s3, s1);
    s1 = Xor(s1, s2);
    s0 = Xor(s0, s3);
    s2 = Xor(s2, t);
    s3 = RotateRight<19>(s3);
    return result;
  }

  HWY_INLINE VU64 Next() noexcept {
    const ScalableTag<std::uint64_t> tag{};
    auto s0 = Load(tag, state_[{0}].data());
    auto s1 = Load(tag, state_[{1}].data());
    auto s2 = Load(tag, state_[{2}].data());
    auto s3 = Load(tag, state_[{3}].data());
    auto result = Update(s0, s1, s2, s3);
    Store(s0, tag, state_[{0}].data());
    Store(s1, tag, state_[{1}].data());
    Store(s2, tag, state_[{2}].data());
    Store(s3, tag, state_[{3}].data());
    return result;
  }
};

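// Illustrative usage sketch (seed value and variable names are hypothetical):
//   hwy::HWY_NAMESPACE::VectorXoshiro rng{/*seed=*/12345};
//   const auto bits = rng(1024);           // 1024 random uint64 values
//   const auto reals = rng.Uniform(1024);  // 1024 doubles in [0, 1),
//                                          // needs HWY_HAVE_FLOAT64

// CachedXoshiro wraps VectorXoshiro behind a scalar interface: it generates
// `size` values at a time with the vector generator and hands them out one by
// one from an aligned cache, refilling the cache when it is exhausted.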
template <std::uint64_t size = 1024>
class CachedXoshiro {
 public:
  using result_type = std::uint64_t;

  static constexpr result_type(min)() {
    return (std::numeric_limits<result_type>::min)();
  }

  static constexpr result_type(max)() {
    return (std::numeric_limits<result_type>::max)();
  }

  explicit CachedXoshiro(const result_type seed,
                         const result_type threadNumber = 0)
      : generator_{seed, threadNumber},
        cache_{generator_.operator()<size>()},
        index_{0} {}

  result_type operator()() noexcept {
    if (HWY_UNLIKELY(index_ == size)) {
      cache_ = generator_.operator()<size>();
      index_ = 0;
    }
    return cache_[index_++];
  }

 private:
  VectorXoshiro generator_;
  alignas(HWY_ALIGNMENT) std::array<result_type, size> cache_;
  std::size_t index_;

  static_assert((size & (size - 1)) == 0 && size != 0,
                "only powers of 2 are supported");
};
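
// CachedXoshiro provides result_type, (min)(), (max)() and operator(), so it
// satisfies the UniformRandomBitGenerator requirements and can drive <random>
// distributions (the caller must include <random>). Illustrative sketch with
// hypothetical values:
//   hwy::HWY_NAMESPACE::CachedXoshiro<> rng{/*seed=*/42};
//   std::uniform_real_distribution<double> dist(0.0, 1.0);
//   const double x = dist(rng);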

}  // namespace HWY_NAMESPACE
}  // namespace hwy

HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_RANDOM_RANDOM_H_