| // Copyright 2025 Google LLC |
| // SPDX-License-Identifier: Apache-2.0 |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #ifndef HIGHWAY_HWY_AUTO_TUNE_H_ |
| #define HIGHWAY_HWY_AUTO_TUNE_H_ |
| |
| #include <stddef.h> |
| #include <stdint.h> |
| #include <string.h> // memmove |
| |
| #include <cmath> |
| #include <vector> |
| |
| #include "third_party/highway/hwy/aligned_allocator.h" // Span |
| #include "third_party/highway/hwy/base.h" // HWY_MIN |
| #include "third_party/highway/hwy/contrib/sort/vqsort.h" |
| |
| // Infrastructure for auto-tuning (choosing optimal parameters at runtime). |
| |
| namespace hwy { |
| |
| // O(1) storage to estimate the central tendency of hundreds of independent |
| // distributions (one per configuration). The number of samples per distribution |
| // (`kMinSamples`) varies from a few to dozens. We support both by first |
| // storing values in a buffer and, once it is full, switching to online |
| // variance estimation. |
| // Modified from `hwy/stats.h`. |
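| // |
| // Example (a minimal sketch; `ElapsedTicks` is an illustrative placeholder for |
| // the caller's own measurement, not part of this header): |
| //   CostDistribution cd; |
| //   for (size_t rep = 0; rep < 20; ++rep) cd.Notify(ElapsedTicks()); |
| //   const double cost = cd.EstimateCost();  // robust estimate of the cost |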
| class CostDistribution { |
| public: |
| static constexpr size_t kMaxValues = 14; // for total size of 128 bytes |
| |
| void Notify(const double x) { |
| if (HWY_UNLIKELY(x < 0.0)) { |
| HWY_WARN("Ignoring negative cost %f.", x); |
| return; |
| } |
| |
| // Online phase after filling and warm-up. |
| if (HWY_LIKELY(IsOnline())) return OnlineNotify(x); |
| |
| // Fill phase: store up to `kMaxValues` values. |
| values_[num_values_++] = x; |
| HWY_DASSERT(num_values_ <= kMaxValues); |
| if (HWY_UNLIKELY(num_values_ == kMaxValues)) { |
| WarmUpOnline(); |
| HWY_DASSERT(IsOnline()); |
| } |
| } |
| |
| // Returns an estimate of the true cost, mitigating the impact of noise. |
| // |
| // Background and observations from time measurements in `thread_pool.h`: |
| // - We aim for O(1) storage because there may be hundreds of instances. |
| // - The mean is biased upwards by mostly additive noise: particularly |
| // interruptions such as context switches, but also contention. |
| // - The minimum is not a robust estimator because there are also "lucky |
| // shots" (1.2-1.6x lower values) where interruptions or contention happen |
| // to be low. |
| // - We want to preserve information about contention and a configuration's |
| // sensitivity to it. Otherwise, we are optimizing for the best-case, not |
| // the common case. |
| // - It is still important to minimize the influence of outliers, such as page |
| // faults, which can inflate measurements several-fold. |
| // - Detecting outliers based only on the initial variance is too brittle: if |
| // the initial sample happens to be narrow, too many subsequent measurements |
| // are rejected as outliers, so estimates fluctuate across runs and the |
| // 'best' configuration varies. |
| // |
| // Approach: |
| // - Use Winsorization to reduce the impact of outliers, while preserving |
| // information on the central tendency. |
| // - Continually update the thresholds based on the online variance, with |
| // exponential smoothing for stability. |
| // - Trim the initial sample via MAD or skewness for a robust estimate of the |
| // variance. |
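| // |
| // For example, with a smoothed mean of 100 and stddev of 10, the online |
| // Winsorization thresholds are 100 - 3.5 * 10 = 65 and 100 + 3.5 * 10 = 135, |
| // so a page-fault spike of 400 is clamped to 135 before it updates the |
| // running moments. |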
| double EstimateCost() { |
| if (!IsOnline()) { |
| WarmUpOnline(); |
| HWY_DASSERT(IsOnline()); |
| } |
| return Mean(); |
| } |
| |
| // Multiplex online state into values_ to allow higher `kMaxValues`. |
| // Public for inspection in tests. Do not use directly. |
| double& M1() { return values_[0]; } // Moments for variance. |
| double& M2() { return values_[1]; } |
| double& Mean() { return values_[2]; } // Exponential smoothing. |
| double& Stddev() { return values_[3]; } |
| double& Lower() { return values_[4]; } |
| double& Upper() { return values_[5]; } |
| |
| private: |
| static double Median(double* to_sort, size_t n) { |
| HWY_DASSERT(n >= 2); |
| // VQSort supports F64 everywhere except Armv7. |
| #if !HWY_ARCH_ARM_V7 |
| VQSort(to_sort, n, SortAscending()); |
| #else |
| // Values are known to be finite and non-negative, hence sorting as U64 is |
| // equivalent. |
| VQSort(reinterpret_cast<uint64_t*>(to_sort), n, SortAscending()); |
| #endif |
| if (n & 1) return to_sort[n / 2]; |
| // Even length: average of two middle elements. |
| return (to_sort[n / 2] + to_sort[n / 2 - 1]) * 0.5; |
| } |
| |
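| // Median absolute deviation. Example: for {1, 2, 2, 3, 9} with median 2, the |
| // absolute deviations are {1, 0, 0, 1, 7}, whose median (the MAD) is 1. |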
| static double MAD(const double* values, size_t n, const double median) { |
| double abs_dev[kMaxValues]; |
| for (size_t i = 0; i < n; ++i) { |
| abs_dev[i] = ScalarAbs(values[i] - median); |
| } |
| return Median(abs_dev, n); |
| } |
| |
| // If `num_values_` is large enough, sorts and discards outliers: either via |
| // MAD, or if too many values are equal, by trimming according to skewness. |
| void RemoveOutliers() { |
| if (num_values_ < 3) return; // Not enough to discard two. |
| HWY_DASSERT(num_values_ <= kMaxValues); |
| |
| // Given the noise level in `auto_tune_test`, it can happen that 1/4 of the |
| // sample is an outlier *in either direction*. Use median absolute |
| // deviation, which is robust to almost half of the sample being outliers. |
| const double median = Median(values_, num_values_); // sorts in-place. |
| const double mad = MAD(values_, num_values_, median); |
| // A zero MAD means at least half of the sample equals the median. |
| if (mad == 0.0) { |
| // Estimate skewness to decide which side to trim more. |
| const double skewness = |
| (values_[num_values_ - 1] - median) - (median - values_[0]); |
| |
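| // `trim` values are dropped in total: `left` from the low end and the |
| // remaining `trim - left` from the high end. |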
| const size_t trim = HWY_MAX(num_values_ / 2, size_t{2}); |
| const size_t left = |
| HWY_MAX(skewness < 0.0 ? trim * 3 / 4 : trim / 4, size_t{1}); |
| num_values_ -= trim; |
| HWY_DASSERT(num_values_ >= 1); |
| memmove(values_, values_ + left, num_values_ * sizeof(values_[0])); |
| return; |
| } |
| |
| const double upper = median + 5.0 * mad; |
| const double lower = median - 5.0 * mad; |
| size_t right = num_values_ - 1; |
| while (values_[right] > upper) --right; |
| // Nonzero MAD implies no more than half are equal, so we did not advance |
| // beyond the median. |
| HWY_DASSERT(right >= num_values_ / 2); |
| |
| size_t left = 0; |
| while (left < right && values_[left] < lower) ++left; |
| HWY_DASSERT(left <= num_values_ / 2); |
| num_values_ = right - left + 1; |
| memmove(values_, values_ + left, num_values_ * sizeof(values_[0])); |
| } |
| |
| double SampleMean() const { |
| // Only called in non-online phase, but buffer might not be full. |
| HWY_DASSERT(!IsOnline() && 0 != num_values_ && num_values_ <= kMaxValues); |
| double sum = 0.0; |
| for (size_t i = 0; i < num_values_; ++i) { |
| sum += values_[i]; |
| } |
| return sum / static_cast<double>(num_values_); |
| } |
| |
| // Unbiased estimator for population variance even for small `num_values_`. |
| double SampleVariance(double sample_mean) const { |
| HWY_DASSERT(sample_mean >= 0.0); // we checked costs are non-negative. |
| // Only called in non-online phase, but buffer might not be full. |
| HWY_DASSERT(!IsOnline() && 0 != num_values_ && num_values_ <= kMaxValues); |
| if (HWY_UNLIKELY(num_values_ == 1)) return 0.0; // prevent divide-by-zero. |
| double sum2 = 0.0; |
| for (size_t i = 0; i < num_values_; ++i) { |
| const double d = values_[i] - sample_mean; |
| sum2 += d * d; |
| } |
| return sum2 / static_cast<double>(num_values_ - 1); |
| } |
| |
| bool IsOnline() const { return online_n_ > 0.0; } |
| |
| void OnlineNotify(double x) { |
| // Winsorize. |
| x = HWY_MIN(HWY_MAX(Lower(), x), Upper()); |
| |
| // Welford's online variance estimator. |
| // https://media.thinkbrg.com/wp-content/uploads/2020/06/19094655/720_720_McCrary_ImplementingAlgorithms_Whitepaper_20151119_WEB.pdf#page=7.09 |
| const double n_minus_1 = online_n_; |
| online_n_ += 1.0; |
| const double d = x - M1(); |
| const double d_div_n = d / online_n_; |
| M1() += d_div_n; |
| HWY_DASSERT(M1() >= Lower()); |
| M2() += d * n_minus_1 * d_div_n; // d^2 * (N-1)/N |
| // HWY_MAX avoids divide-by-zero. |
| const double stddev = std::sqrt(M2() / HWY_MAX(1.0, n_minus_1)); |
| |
| // Exponential smoothing. |
| constexpr double kNew = 0.2; // relatively fast update |
| constexpr double kOld = 1.0 - kNew; |
| Mean() = M1() * kNew + Mean() * kOld; |
| Stddev() = stddev * kNew + Stddev() * kOld; |
| |
| // Update thresholds from the smoothed mean and stddev so that we can recover |
| // from an initial range that was too narrow due to excessive trimming. |
| Lower() = Mean() - 3.5 * Stddev(); |
| Upper() = Mean() + 3.5 * Stddev(); |
| } |
| |
| void WarmUpOnline() { |
| RemoveOutliers(); |
| |
| // Compute and copy before writing to `M1`, which overwrites `values_`! |
| const double sample_mean = SampleMean(); |
| const double sample_variance = SampleVariance(sample_mean); |
| double copy[kMaxValues]; |
| hwy::CopyBytes(values_, copy, num_values_ * sizeof(values_[0])); |
| |
| M1() = M2() = 0.0; |
| Mean() = sample_mean; |
| Stddev() = std::sqrt(sample_variance); |
| // For a single-value or all-equal sample, widen the range; otherwise we |
| // would only accept that exact value. |
| if (Stddev() == 0.0) Stddev() = Mean() / 2; |
| |
| // High tolerance because the distribution is not actually Gaussian, we |
| // trimmed up to *half* of the sample, and we do not want to reject too many |
| // values in the online phase. |
| Lower() = Mean() - 4.0 * Stddev(); |
| Upper() = Mean() + 4.0 * Stddev(); |
| // Feed copied values into online estimator. |
| for (size_t i = 0; i < num_values_; ++i) { |
| OnlineNotify(copy[i]); |
| } |
| HWY_DASSERT(IsOnline()); |
| |
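| // On 32-bit targets, reference `padding_` to silence unused-member warnings. |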
| #if SIZE_MAX == 0xFFFFFFFFu |
| (void)padding_; |
| #endif |
| } |
| |
| size_t num_values_ = 0; // size of `values_` <= `kMaxValues` |
| #if SIZE_MAX == 0xFFFFFFFFu |
| uint32_t padding_ = 0; |
| #endif |
| |
| double online_n_ = 0.0; // number of calls to `OnlineNotify`. |
| |
| double values_[kMaxValues]; |
| }; |
| static_assert(sizeof(CostDistribution) == 128, ""); |
| |
| // Implements a counter with wrap-around, plus the ability to skip values. |
| // O(1) time, O(N) space via doubly-linked list of indices. |
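| // Example: for `num` = 4, `Next` cycles 0 -> 1 -> 2 -> 3 -> 0; after `Skip(2)`, |
| // it cycles 0 -> 1 -> 3 -> 0. |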
| class NextWithSkip { |
| public: |
| NextWithSkip() {} |
| explicit NextWithSkip(size_t num) { |
| links_.reserve(num); |
| for (size_t i = 0; i < num; ++i) { |
| links_.emplace_back(i, num); |
| } |
| } |
| |
| size_t Next(size_t pos) { |
| HWY_DASSERT(pos < links_.size()); |
| HWY_DASSERT(!links_[pos].IsRemoved()); |
| return links_[pos].Next(); |
| } |
| |
| // Must not be called for an already skipped position. Ignores an attempt to |
| // skip the last remaining position. |
| void Skip(size_t pos) { |
| HWY_DASSERT(!links_[pos].IsRemoved()); // not already skipped. |
| const size_t prev = links_[pos].Prev(); |
| const size_t next = links_[pos].Next(); |
| if (prev == pos || next == pos) return; // last remaining position. |
| links_[next].SetPrev(prev); |
| links_[prev].SetNext(next); |
| links_[pos].Remove(); |
| } |
| |
| private: |
| // Combine prev/next into one array to improve locality/reduce allocations. |
| class Link { |
| // Bit-shifts avoid potentially expensive 16-bit loads. Store `next` at the |
| // top and `prev` at the bottom for extraction with a single shift/AND. |
| // There may be hundreds of configurations, so 8 bits are not enough. |
| static constexpr size_t kBits = 14; |
| static constexpr size_t kShift = 32 - kBits; |
| static constexpr uint32_t kMaxNum = 1u << kBits; |
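| // With `kBits` = 14: bits [31:18] hold `next`, bit 14 is the removed flag, |
| // bits [13:0] hold `prev`, and bits [17:15] are unused. |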
| |
| public: |
| Link(size_t pos, size_t num) { |
| HWY_DASSERT(num < kMaxNum); |
| const size_t prev = pos == 0 ? num - 1 : pos - 1; |
| const size_t next = pos == num - 1 ? 0 : pos + 1; |
| bits_ = |
| (static_cast<uint32_t>(next) << kShift) | static_cast<uint32_t>(prev); |
| HWY_DASSERT(Next() == next && Prev() == prev); |
| HWY_DASSERT(!IsRemoved()); |
| } |
| |
| bool IsRemoved() const { return (bits_ & kMaxNum) != 0; } |
| void Remove() { bits_ |= kMaxNum; } |
| |
| size_t Next() const { return bits_ >> kShift; } |
| size_t Prev() const { return bits_ & (kMaxNum - 1); } |
| |
| void SetNext(size_t next) { |
| HWY_DASSERT(next < kMaxNum); |
| bits_ &= (~0u >> kBits); // clear old next |
| bits_ |= static_cast<uint32_t>(next) << kShift; |
| HWY_DASSERT(Next() == next); |
| HWY_DASSERT(!IsRemoved()); |
| } |
| void SetPrev(size_t prev) { |
| HWY_DASSERT(prev < kMaxNum); |
| bits_ &= ~(kMaxNum - 1); // clear old prev |
| bits_ |= static_cast<uint32_t>(prev); |
| HWY_DASSERT(Prev() == prev); |
| HWY_DASSERT(!IsRemoved()); |
| } |
| |
| private: |
| uint32_t bits_; |
| }; |
| std::vector<Link> links_; |
| }; |
| |
| // State machine for choosing at runtime the lowest-cost `Config`, which is |
| // typically a struct containing multiple parameters. For an introduction, see |
| // "Auto-Tuning and Performance Portability on Heterogeneous Hardware". |
| // |
| // **Which parameters** |
| // Note that simple parameters such as the L2 cache size can be queried |
| // directly via `hwy/contrib/thread_pool/topology.h`. Difficult-to-predict |
| // parameters such as task granularity are more appropriate for auto-tuning. |
| // We also suggest that at least some parameters be 'algorithm variants' such |
| // as parallel vs. serial, or 2D tiling vs. 1D striping. |
| // |
| // **Search strategy** |
| // To guarantee the optimal result, we use exhaustive search, which is suitable |
| // for around 10 parameters and a few hundred combinations of 'candidate' |
| // configurations. |
| // |
| // **How to generate candidates** |
| // To keep this framework simple and generic, applications enumerate the search |
| // space and pass the list of all feasible candidates to `SetCandidates` before |
| // the first call to `NextConfig`. Applications should prune the space as much |
| // as possible, e.g. by upper-bounding parameters based on the known cache |
| // sizes, and applying constraints such as one being a multiple of another. |
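| // For example, a hypothetical `MyConfig` with a tile size and a parallel flag |
| // could be enumerated as follows (a sketch, not part of this header): |
| //   std::vector<MyConfig> MakeCandidates() { |
| //     std::vector<MyConfig> all; |
| //     for (size_t tile = 64; tile <= 1024; tile *= 2) { |
| //       for (bool parallel : {false, true}) { |
| //         all.push_back(MyConfig{tile, parallel}); |
| //       } |
| //     } |
| //     return all; |
| //   } |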
| // |
| // **Usage** |
| // Applications typically branch to the code implementing the configuration |
| // returned by `NextConfig`, measure the cost of running it, and pass that |
| // cost to `NotifyCost` (see the example at the end of this comment block). |
| // Branching avoids the complexity and |
| // opaqueness of a JIT. The number of branches can be reduced (at the cost of |
| // code size) by inlining low-level decisions into larger code regions, e.g. by |
| // hoisting them outside hot loops. |
| // |
| // **What is cost** |
| // Cost is an arbitrary `uint64_t`, with lower values being better. Most |
| // applications will use the elapsed time. If the tasks being tuned are short, |
| // it is important to use a high-resolution timer such as `hwy/timer.h`. Energy |
| // may also be useful [https://www.osti.gov/servlets/purl/1361296]. |
| // |
| // **Online vs. offline** |
| // Although applications can auto-tune once, offline, it may be difficult to |
| // ensure the stored configuration still applies to the current circumstances. |
| // Thus we recommend online auto-tuning, re-discovering the configuration on |
| // each run. We assume the overhead of bookkeeping and measuring cost is |
| // negligible relative to the actual work. The cost of auto-tuning is then that |
| // of running sub-optimal configurations. Assuming the best configuration is |
| // better than baseline, and the work is performed many thousands of times, the |
| // cost is outweighed by the benefits. |
| // |
| // **kMinSamples** |
| // To further reduce overhead, after `kMinSamples` rounds (= measurements of |
| // each configuration) we start excluding configurations from further |
| // measurements if they are sufficiently worse than the current best. |
| // `kMinSamples` can be several dozen when the tasks being tuned take a few |
| // microseconds. Even for longer tasks, it should be at least 2 for some noise |
| // tolerance. After skipping begins, there are another `kMinSamples / 2 + 1` |
| // rounds before the winner is declared, i.e. `3 * kMinSamples / 2 + 1` rounds |
| // in total (4 rounds for the default `kMinSamples` = 2). |
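| // |
| // **Example** |
| // A minimal sketch of the intended call pattern; `RunWith` and `MeasureTicks` |
| // are illustrative placeholders (see `MakeCandidates` above), not part of this |
| // header: |
| //   AutoTune<MyConfig, /*kMinSamples=*/2> tuner; |
| //   for (;;) {  // hot loop |
| //     if (const MyConfig* best = tuner.Best()) { |
| //       RunWith(*best);  // Tuning finished: always use the winner. |
| //     } else { |
| //       if (!tuner.HasCandidates()) tuner.SetCandidates(MakeCandidates()); |
| //       const MyConfig& config = tuner.NextConfig(); |
| //       const uint64_t cost = MeasureTicks([&] { RunWith(config); }); |
| //       tuner.NotifyCost(cost); |
| //     } |
| //   } |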
| template <typename Config, size_t kMinSamples = 2> |
| class AutoTune { |
| public: |
| // Returns the best configuration if auto-tuning has finished, otherwise null; |
| // callers then continue calling `NextConfig` and `NotifyCost`. The returned |
| // pointer points into `Candidates()`. |
| const Config* Best() const { return best_; } |
| |
| // If false, caller must call `SetCandidates` before `NextConfig`. |
| bool HasCandidates() const { |
| HWY_DASSERT(!Best()); |
| return !candidates_.empty(); |
| } |
| // WARNING: invalidates `Best()`, do not call if that is non-null. |
| void SetCandidates(std::vector<Config> candidates) { |
| HWY_DASSERT(!Best() && !HasCandidates()); |
| candidates_.swap(candidates); |
| HWY_DASSERT(HasCandidates()); |
| costs_.resize(candidates_.size()); |
| list_ = NextWithSkip(candidates_.size()); |
| } |
| |
| // Typically called after Best() is non-null to compare all candidates' costs. |
| Span<const Config> Candidates() const { |
| HWY_DASSERT(HasCandidates()); |
| return Span<const Config>(candidates_.data(), candidates_.size()); |
| } |
| Span<CostDistribution> Costs() { |
| return Span<CostDistribution>(costs_.data(), costs_.size()); |
| } |
| |
| // Returns the current `Config` to measure. |
| const Config& NextConfig() const { |
| HWY_DASSERT(!Best() && HasCandidates()); |
| return candidates_[config_idx_]; |
| } |
| |
| // O(1) except at the end of each round, which is O(N). |
| void NotifyCost(uint64_t cost) { |
| HWY_DASSERT(!Best() && HasCandidates()); |
| |
| costs_[config_idx_].Notify(static_cast<double>(cost)); |
| // Save now before we update `config_idx_`. |
| const size_t my_idx = config_idx_; |
| // Only retrieve the estimate once we have enough samples; otherwise |
| // `EstimateCost` would switch to online variance before the buffer is |
| // populated. |
| const double my_cost = rounds_complete_ >= kMinSamples |
| ? costs_[config_idx_].EstimateCost() |
| : 0.0; |
| |
| // Advance to next non-skipped config with wrap-around. This decorrelates |
| // measurements by not immediately re-measuring the same config. |
| config_idx_ = list_.Next(config_idx_); |
| // Might still equal `my_idx` if this is the only non-skipped config. |
| |
| // Disqualify from future `NextConfig` if cost was too far beyond the |
| // current best. This reduces the number of measurements, while tolerating |
| // noise in the first few measurements. Must happen after advancing. |
| if (my_cost > skip_if_above_) { |
| list_.Skip(my_idx); |
| } |
| |
| // Wrap-around indicates the round is complete. |
| if (HWY_UNLIKELY(config_idx_ <= my_idx)) { |
| ++rounds_complete_; |
| |
| // Enough samples for stable estimates: update the skip threshold. |
| if (rounds_complete_ >= kMinSamples) { |
| double best_cost = HighestValue<double>(); |
| size_t idx_min = 0; |
| for (size_t i = 0; i < candidates_.size(); ++i) { |
| const double estimate = costs_[i].EstimateCost(); |
| if (estimate < best_cost) { |
| best_cost = estimate; |
| idx_min = i; |
| } |
| } |
| skip_if_above_ = best_cost * 1.25; |
| |
| // After sufficient rounds, declare the winner. |
| if (HWY_UNLIKELY(rounds_complete_ == 3 * kMinSamples / 2 + 1)) { |
| best_ = &candidates_[idx_min]; |
| HWY_DASSERT(Best()); |
| } |
| } |
| } |
| } |
| |
| // Avoid printing during the first few rounds, because measurements are still |
| // noisy and poor configurations have not yet been skipped. |
| bool ShouldPrint() { return rounds_complete_ > kMinSamples; } |
| |
| private: |
| const Config* best_ = nullptr; |
| std::vector<Config> candidates_; |
| std::vector<CostDistribution> costs_; // one per candidate |
| size_t config_idx_ = 0; // [0, candidates_.size()) |
| NextWithSkip list_; |
| size_t rounds_complete_ = 0; |
| |
| double skip_if_above_ = 0.0; |
| }; |
| |
| } // namespace hwy |
| |
| #endif // HIGHWAY_HWY_AUTO_TUNE_H_ |