| // Copyright 2021 Google LLC |
| // SPDX-License-Identifier: Apache-2.0 |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // Normal include guard for target-independent parts |
| #ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_ |
| #define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_ |
| |
| #include <stddef.h> |
| #include <stdint.h> |
| |
| #include <algorithm> // std::sort |
| #include <functional> // std::less, std::greater |
| #include <vector> |
| |
| #include "third_party/highway/hwy/contrib/sort/vqsort.h" |
| #include "third_party/highway/hwy/highway.h" |
| #include "third_party/highway/hwy/print.h" |
| |
| // Third-party algorithms |
| #define HAVE_AVX2SORT 0 |
| #define HAVE_IPS4O 0 |
| // When enabling, consider changing max_threads (required for Table 1a) |
| #define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1) |
| #define HAVE_PDQSORT 0 |
| #define HAVE_SORT512 0 |
| #define HAVE_VXSORT 0 |
| #if HWY_ARCH_X86 |
| #define HAVE_INTEL 0 |
| #else |
| #define HAVE_INTEL 0 |
| #endif |
| |
| #if HAVE_PARALLEL_IPS4O |
| #include <thread> // NOLINT |
| #endif |
| |
| #if HAVE_AVX2SORT |
| HWY_PUSH_ATTRIBUTES("avx2,avx") |
| #include "avx2sort.h" //NOLINT |
| HWY_POP_ATTRIBUTES |
| #endif |
| #if HAVE_IPS4O || HAVE_PARALLEL_IPS4O |
| #include "third_party/ips4o/include/ips4o.hpp" |
| #include "third_party/ips4o/include/ips4o/thread_pool.hpp" |
| #endif |
| #if HAVE_PDQSORT |
| #include "third_party/boost/allowed/sort/sort.hpp" |
| #endif |
| #if HAVE_SORT512 |
| #include "sort512.h" //NOLINT |
| #endif |
| |
| // vxsort is difficult to compile for multiple targets because it also uses |
| // .cpp files, and we'd also have to #undef its include guards. Instead, compile |
| // only for AVX2 or AVX3 depending on this macro. |
| #define VXSORT_AVX3 1 |
| #if HAVE_VXSORT |
| // inlined from vxsort_targets_enable_avx512 (must close before end of header) |
| #ifdef __GNUC__ |
| #ifdef __clang__ |
| #if VXSORT_AVX3 |
| #pragma clang attribute push(__attribute__((target("avx512f,avx512dq"))), \ |
| apply_to = any(function)) |
| #else |
| #pragma clang attribute push(__attribute__((target("avx2"))), \ |
| apply_to = any(function)) |
| #endif // VXSORT_AVX3 |
| |
| #else |
| #pragma GCC push_options |
| #if VXSORT_AVX3 |
| #pragma GCC target("avx512f,avx512dq") |
| #else |
| #pragma GCC target("avx2") |
| #endif // VXSORT_AVX3 |
| #endif |
| #endif |
| |
| #if VXSORT_AVX3 |
| #include "vxsort/machine_traits.avx512.h" |
| #else |
| #include "vxsort/machine_traits.avx2.h" |
| #endif // VXSORT_AVX3 |
| #include "vxsort/vxsort.h" |
| #ifdef __GNUC__ |
| #ifdef __clang__ |
| #pragma clang attribute pop |
| #else |
| #pragma GCC pop_options |
| #endif |
| #endif |
| #endif // HAVE_VXSORT |
| |
| namespace hwy { |
| |
| enum class Dist { kUniform8, kUniform16, kUniform32 }; |
| |
| static inline std::vector<Dist> AllDist() { |
| // Also include lower-entropy distributions to test MaybePartitionTwoValue. |
| return {Dist::kUniform8, /*Dist::kUniform16,*/ Dist::kUniform32}; |
| } |
| |
| static inline const char* DistName(Dist dist) { |
| switch (dist) { |
| case Dist::kUniform8: |
| return "uniform8"; |
| case Dist::kUniform16: |
| return "uniform16"; |
| case Dist::kUniform32: |
| return "uniform32"; |
| } |
| return "unreachable"; |
| } |
| |
| template <typename T> |
| class InputStats { |
| public: |
| void Notify(T value) { |
| min_ = HWY_MIN(min_, value); |
| max_ = HWY_MAX(max_, value); |
| // Converting to integer would truncate floats, multiplying to save digits |
| // risks overflow especially when casting, so instead take the sum of the |
| // bit representations as the checksum. |
| uint64_t bits = 0; |
| static_assert(sizeof(T) <= 8, "Expected a built-in type"); |
| CopyBytes<sizeof(T)>(&value, &bits); // not same size |
| sum_ += bits; |
| count_ += 1; |
| } |
| |
| bool operator==(const InputStats& other) const { |
| char type_name[100]; |
| detail::TypeName(hwy::detail::MakeTypeInfo<T>(), 1, type_name); |
| |
| if (count_ != other.count_) { |
| HWY_ABORT("Sort %s: count %d vs %d\n", type_name, |
| static_cast<int>(count_), static_cast<int>(other.count_)); |
| } |
| |
| if (min_ != other.min_ || max_ != other.max_) { |
| HWY_ABORT("Sort %s: minmax %f/%f vs %f/%f\n", type_name, |
| static_cast<double>(min_), static_cast<double>(max_), |
| static_cast<double>(other.min_), |
| static_cast<double>(other.max_)); |
| } |
| |
| // Sum helps detect duplicated/lost values |
| if (sum_ != other.sum_) { |
| HWY_ABORT("Sort %s: Sum mismatch %g %g; min %g max %g\n", type_name, |
| static_cast<double>(sum_), static_cast<double>(other.sum_), |
| static_cast<double>(min_), static_cast<double>(max_)); |
| } |
| |
| return true; |
| } |
| |
| private: |
| T min_ = hwy::HighestValue<T>(); |
| T max_ = hwy::LowestValue<T>(); |
| uint64_t sum_ = 0; |
| size_t count_ = 0; |
| }; |
| |
| enum class Algo { |
| #if HAVE_INTEL |
| kIntel, |
| #endif |
| #if HAVE_AVX2SORT |
| kSEA, |
| #endif |
| #if HAVE_IPS4O |
| kIPS4O, |
| #endif |
| #if HAVE_PARALLEL_IPS4O |
| kParallelIPS4O, |
| #endif |
| #if HAVE_PDQSORT |
| kPDQ, |
| #endif |
| #if HAVE_SORT512 |
| kSort512, |
| #endif |
| #if HAVE_VXSORT |
| kVXSort, |
| #endif |
| kStdSort, |
| kStdSelect, |
| kStdPartialSort, |
| kVQSort, |
| kVQPartialSort, |
| kVQSelect, |
| kHeapSort, |
| kHeapPartialSort, |
| kHeapSelect, |
| }; |
| |
| static inline bool IsVQ(Algo algo) { |
| switch (algo) { |
| case Algo::kVQSort: |
| case Algo::kVQPartialSort: |
| case Algo::kVQSelect: |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| static inline bool IsSelect(Algo algo) { |
| switch (algo) { |
| case Algo::kStdSelect: |
| case Algo::kVQSelect: |
| case Algo::kHeapSelect: |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| static inline bool IsPartialSort(Algo algo) { |
| switch (algo) { |
| case Algo::kStdPartialSort: |
| case Algo::kVQPartialSort: |
| case Algo::kHeapPartialSort: |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| static inline Algo ReferenceAlgoFor(Algo algo) { |
| if (IsPartialSort(algo)) return Algo::kStdPartialSort; |
| #if HAVE_PDQSORT |
| return Algo::kPDQ; |
| #else |
| return Algo::kStdSort; |
| #endif |
| } |
| |
| static inline const char* AlgoName(Algo algo) { |
| switch (algo) { |
| #if HAVE_INTEL |
| case Algo::kIntel: |
| return "intel"; |
| #endif |
| #if HAVE_AVX2SORT |
| case Algo::kSEA: |
| return "sea"; |
| #endif |
| #if HAVE_IPS4O |
| case Algo::kIPS4O: |
| return "ips4o"; |
| #endif |
| #if HAVE_PARALLEL_IPS4O |
| case Algo::kParallelIPS4O: |
| return "par_ips4o"; |
| #endif |
| #if HAVE_PDQSORT |
| case Algo::kPDQ: |
| return "pdq"; |
| #endif |
| #if HAVE_SORT512 |
| case Algo::kSort512: |
| return "sort512"; |
| #endif |
| #if HAVE_VXSORT |
| case Algo::kVXSort: |
| return "vxsort"; |
| #endif |
| case Algo::kStdSort: |
| return "std"; |
| case Algo::kStdPartialSort: |
| return "std_partial"; |
| case Algo::kStdSelect: |
| return "std_select"; |
| case Algo::kVQSort: |
| return "vq"; |
| case Algo::kVQPartialSort: |
| return "vq_partial"; |
| case Algo::kVQSelect: |
| return "vq_select"; |
| case Algo::kHeapSort: |
| return "heap"; |
| case Algo::kHeapPartialSort: |
| return "heap_partial"; |
| case Algo::kHeapSelect: |
| return "heap_select"; |
| } |
| return "unreachable"; |
| } |
| |
| } // namespace hwy |
| #endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_ |
| |
| // Per-target |
| // clang-format off |
| #if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == defined(HWY_TARGET_TOGGLE) // NOLINT |
| #ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE |
| #undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE |
| #else |
| #define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE |
| #endif |
| // clang-format on |
| |
| #include "third_party/highway/hwy/aligned_allocator.h" |
| #include "third_party/highway/hwy/contrib/sort/traits-inl.h" |
| #include "third_party/highway/hwy/contrib/sort/traits128-inl.h" |
| #include "third_party/highway/hwy/contrib/sort/vqsort-inl.h" // HeapSort |
| |
| HWY_BEFORE_NAMESPACE(); |
| |
| // Requires target pragma set by HWY_BEFORE_NAMESPACE |
| #if HAVE_INTEL && HWY_TARGET <= HWY_AVX3 |
| // #include "avx512-16bit-qsort.hpp" // requires AVX512-VBMI2 |
| #include "avx512-32bit-qsort.hpp" |
| #include "avx512-64bit-qsort.hpp" |
| #endif |
| |
| namespace hwy { |
| namespace HWY_NAMESPACE { |
| |
| #if HAVE_INTEL || HAVE_VXSORT // only supports ascending order |
| template <typename T> |
| using OtherOrder = detail::OrderAscending<T>; |
| #else |
| template <typename T> |
| using OtherOrder = detail::OrderDescending<T>; |
| #endif |
| |
| class Xorshift128Plus { |
| static HWY_INLINE uint64_t SplitMix64(uint64_t z) { |
| z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull; |
| z = (z ^ (z >> 27)) * 0x94D049BB133111EBull; |
| return z ^ (z >> 31); |
| } |
| |
| public: |
| // Generates two vectors of 64-bit seeds via SplitMix64 and stores into |
| // `seeds`. Generating these afresh in each ChoosePivot is too expensive. |
| template <class DU64> |
| static void GenerateSeeds(DU64 du64, TFromD<DU64>* HWY_RESTRICT seeds) { |
| seeds[0] = SplitMix64(0x9E3779B97F4A7C15ull); |
| for (size_t i = 1; i < 2 * Lanes(du64); ++i) { |
| seeds[i] = SplitMix64(seeds[i - 1]); |
| } |
| } |
| |
| // Need to pass in the state because vector cannot be class members. |
| template <class VU64> |
| static VU64 RandomBits(VU64& state0, VU64& state1) { |
| VU64 s1 = state0; |
| VU64 s0 = state1; |
| const VU64 bits = Add(s1, s0); |
| state0 = s0; |
| s1 = Xor(s1, ShiftLeft<23>(s1)); |
| state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0)))); |
| return bits; |
| } |
| }; |
| |
| template <class D, class VU64, HWY_IF_NOT_FLOAT_D(D)> |
| Vec<D> RandomValues(D d, VU64& s0, VU64& s1, const VU64 mask) { |
| const VU64 bits = Xorshift128Plus::RandomBits(s0, s1); |
| return BitCast(d, And(bits, mask)); |
| } |
| |
| // It is important to avoid denormals, which are flushed to zero by SIMD but not |
| // scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD. |
| template <class DF, class VU64, HWY_IF_FLOAT_D(DF)> |
| Vec<DF> RandomValues(DF df, VU64& s0, VU64& s1, const VU64 mask) { |
| using TF = TFromD<DF>; |
| const RebindToUnsigned<decltype(df)> du; |
| using VU = Vec<decltype(du)>; |
| |
| const VU64 bits64 = And(Xorshift128Plus::RandomBits(s0, s1), mask); |
| |
| #if HWY_TARGET == HWY_SCALAR // Cannot repartition u64 to smaller types |
| using TU = MakeUnsigned<TF>; |
| const VU bits = Set(du, static_cast<TU>(GetLane(bits64) & LimitsMax<TU>())); |
| #else |
| const VU bits = BitCast(du, bits64); |
| #endif |
| // Avoid NaN/denormal by only generating values in [1, 2), i.e. random |
| // mantissas with the exponent taken from the representation of 1.0. |
| const VU k1 = BitCast(du, Set(df, TF{1.0})); |
| const VU mantissa_mask = Set(du, MantissaMask<TF>()); |
| const VU representation = OrAnd(k1, bits, mantissa_mask); |
| return BitCast(df, representation); |
| } |
| |
| template <class DU64> |
| Vec<DU64> MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t) { |
| switch (sizeof_t) { |
| case 2: |
| return Set(du64, (dist == Dist::kUniform8) ? 0x00FF00FF00FF00FFull |
| : 0xFFFFFFFFFFFFFFFFull); |
| case 4: |
| return Set(du64, (dist == Dist::kUniform8) ? 0x000000FF000000FFull |
| : (dist == Dist::kUniform16) ? 0x0000FFFF0000FFFFull |
| : 0xFFFFFFFFFFFFFFFFull); |
| case 8: |
| return Set(du64, (dist == Dist::kUniform8) ? 0x00000000000000FFull |
| : (dist == Dist::kUniform16) ? 0x000000000000FFFFull |
| : 0x00000000FFFFFFFFull); |
| default: |
| HWY_ABORT("Logic error"); |
| return Zero(du64); |
| } |
| } |
| |
| template <typename T> |
| InputStats<T> GenerateInput(const Dist dist, T* v, size_t num_lanes) { |
| SortTag<uint64_t> du64; |
| using VU64 = Vec<decltype(du64)>; |
| const size_t N64 = Lanes(du64); |
| auto seeds = hwy::AllocateAligned<uint64_t>(2 * N64); |
| Xorshift128Plus::GenerateSeeds(du64, seeds.get()); |
| VU64 s0 = Load(du64, seeds.get()); |
| VU64 s1 = Load(du64, seeds.get() + N64); |
| |
| #if HWY_TARGET == HWY_SCALAR |
| const Sisd<T> d; |
| #else |
| const Repartition<T, decltype(du64)> d; |
| #endif |
| using V = Vec<decltype(d)>; |
| const size_t N = Lanes(d); |
| const VU64 mask = MaskForDist(du64, dist, sizeof(T)); |
| auto buf = hwy::AllocateAligned<T>(N); |
| |
| size_t i = 0; |
| for (; i + N <= num_lanes; i += N) { |
| const V values = RandomValues(d, s0, s1, mask); |
| StoreU(values, d, v + i); |
| } |
| if (i < num_lanes) { |
| const V values = RandomValues(d, s0, s1, mask); |
| StoreU(values, d, buf.get()); |
| CopyBytes(buf.get(), v + i, (num_lanes - i) * sizeof(T)); |
| } |
| |
| InputStats<T> input_stats; |
| for (size_t i = 0; i < num_lanes; ++i) { |
| input_stats.Notify(v[i]); |
| } |
| return input_stats; |
| } |
| |
| struct SharedState { |
| #if HAVE_PARALLEL_IPS4O |
| const unsigned max_threads = hwy::LimitsMax<unsigned>(); // 16 for Table 1a |
| ips4o::StdThreadPool pool{static_cast<int>( |
| HWY_MIN(max_threads, std::thread::hardware_concurrency() / 2))}; |
| #endif |
| }; |
| |
| // Adapters from Run's num_keys to vqsort-inl.h num_lanes. |
| template <typename KeyType, class Order> |
| void CallHeapSort(KeyType* keys, const size_t num_keys, Order) { |
| const detail::MakeTraits<KeyType, Order> st; |
| using LaneType = typename decltype(st)::LaneType; |
| return detail::HeapSort(st, reinterpret_cast<LaneType*>(keys), |
| num_keys * st.LanesPerKey()); |
| } |
| template <typename KeyType, class Order> |
| void CallHeapPartialSort(KeyType* keys, const size_t num_keys, |
| const size_t k_keys, Order) { |
| const detail::MakeTraits<KeyType, Order> st; |
| using LaneType = typename decltype(st)::LaneType; |
| detail::HeapPartialSort(st, reinterpret_cast<LaneType*>(keys), |
| num_keys * st.LanesPerKey(), |
| k_keys * st.LanesPerKey()); |
| } |
| template <typename KeyType, class Order> |
| void CallHeapSelect(KeyType* keys, const size_t num_keys, const size_t k_keys, |
| Order) { |
| const detail::MakeTraits<KeyType, Order> st; |
| using LaneType = typename decltype(st)::LaneType; |
| detail::HeapSelect(st, reinterpret_cast<LaneType*>(keys), |
| num_keys * st.LanesPerKey(), k_keys * st.LanesPerKey()); |
| } |
| |
| template <typename KeyType, class Order> |
| void Run(Algo algo, KeyType* inout, size_t num_keys, SharedState& shared, |
| size_t /*thread*/, size_t k_keys, Order) { |
| const std::less<KeyType> less; |
| const std::greater<KeyType> greater; |
| |
| constexpr bool kAscending = Order::IsAscending(); |
| |
| #if !HAVE_PARALLEL_IPS4O |
| (void)shared; |
| #endif |
| |
| switch (algo) { |
| #if HAVE_INTEL && HWY_TARGET <= HWY_AVX3 |
| case Algo::kIntel: |
| return avx512_qsort<KeyType>(inout, static_cast<int64_t>(num_keys)); |
| #endif |
| |
| #if HAVE_AVX2SORT |
| case Algo::kSEA: |
| return avx2::quicksort(inout, static_cast<int>(num_keys)); |
| #endif |
| |
| #if HAVE_IPS4O |
| case Algo::kIPS4O: |
| if (kAscending) { |
| return ips4o::sort(inout, inout + num_keys, less); |
| } else { |
| return ips4o::sort(inout, inout + num_keys, greater); |
| } |
| #endif |
| |
| #if HAVE_PARALLEL_IPS4O |
| case Algo::kParallelIPS4O: |
| if (kAscending) { |
| return ips4o::parallel::sort(inout, inout + num_keys, less, |
| shared.pool); |
| } else { |
| return ips4o::parallel::sort(inout, inout + num_keys, greater, |
| shared.pool); |
| } |
| #endif |
| |
| #if HAVE_SORT512 |
| case Algo::kSort512: |
| HWY_ABORT("not supported"); |
| // return Sort512::Sort(inout, num_keys); |
| #endif |
| |
| #if HAVE_PDQSORT |
| case Algo::kPDQ: |
| if (kAscending) { |
| return boost::sort::pdqsort_branchless(inout, inout + num_keys, less); |
| } else { |
| return boost::sort::pdqsort_branchless(inout, inout + num_keys, |
| greater); |
| } |
| #endif |
| |
| #if HAVE_VXSORT |
| case Algo::kVXSort: { |
| #if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \ |
| (!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2) |
| HWY_WARN("Do not call for target %s\n", hwy::TargetName(HWY_TARGET)); |
| return; |
| #else |
| #if VXSORT_AVX3 |
| vxsort::vxsort<KeyType, vxsort::AVX512> vx; |
| #else |
| vxsort::vxsort<KeyType, vxsort::AVX2> vx; |
| #endif |
| if (kAscending) { |
| return vx.sort(inout, inout + num_keys - 1); |
| } else { |
| HWY_WARN("Skipping VX - does not support descending order\n"); |
| return; |
| } |
| #endif // enabled for this target |
| } |
| #endif // HAVE_VXSORT |
| |
| case Algo::kStdSort: |
| if (kAscending) { |
| return std::sort(inout, inout + num_keys, less); |
| } else { |
| return std::sort(inout, inout + num_keys, greater); |
| } |
| case Algo::kStdPartialSort: |
| if (kAscending) { |
| return std::partial_sort(inout, inout + k_keys, inout + num_keys, less); |
| } else { |
| return std::partial_sort(inout, inout + k_keys, inout + num_keys, |
| greater); |
| } |
| case Algo::kStdSelect: |
| if (kAscending) { |
| return std::nth_element(inout, inout + k_keys, inout + num_keys, less); |
| } else { |
| return std::nth_element(inout, inout + k_keys, inout + num_keys, |
| greater); |
| } |
| |
| case Algo::kVQSort: |
| return VQSort(inout, num_keys, Order()); |
| case Algo::kVQPartialSort: |
| return VQPartialSort(inout, num_keys, k_keys, Order()); |
| case Algo::kVQSelect: |
| return VQSelect(inout, num_keys, k_keys, Order()); |
| |
| case Algo::kHeapSort: |
| return CallHeapSort(inout, num_keys, Order()); |
| case Algo::kHeapPartialSort: |
| return CallHeapPartialSort(inout, num_keys, k_keys, Order()); |
| case Algo::kHeapSelect: |
| return CallHeapSelect(inout, num_keys, k_keys, Order()); |
| |
| default: |
| HWY_ABORT("Not implemented"); |
| } |
| } |
| |
| // NOLINTNEXTLINE(google-readability-namespace-comments) |
| } // namespace HWY_NAMESPACE |
| } // namespace hwy |
| HWY_AFTER_NAMESPACE(); |
| |
| #endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE |