// Copyright 2025 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_AUTO_TUNE_H_
#define HIGHWAY_HWY_AUTO_TUNE_H_

#include <stddef.h>
#include <stdint.h>
#include <string.h>  // memmove

#include <cmath>
#include <vector>

#include "third_party/highway/hwy/aligned_allocator.h"  // Span
#include "third_party/highway/hwy/base.h"               // HWY_MIN
#include "third_party/highway/hwy/contrib/sort/vqsort.h"

// Infrastructure for auto-tuning (choosing optimal parameters at runtime).

namespace hwy {

// O(1) storage to estimate the central tendency of hundreds of independent
// distributions (one per configuration). The number of samples per
// distribution (`kMinSamples`) varies from a few to dozens. We support both by
// first storing values in a buffer, and when it is full, switching to online
// variance estimation. Modified from `hwy/stats.h`.
class CostDistribution {
 public:
  static constexpr size_t kMaxValues = 14;  // for total size of 128 bytes

  void Notify(const double x) {
    if (HWY_UNLIKELY(x < 0.0)) {
      HWY_WARN("Ignoring negative cost %f.", x);
      return;
    }

    // Online phase after filling and warm-up.
    if (HWY_LIKELY(IsOnline())) return OnlineNotify(x);

    // Fill phase: store up to `kMaxValues` values.
    values_[num_values_++] = x;
    HWY_DASSERT(num_values_ <= kMaxValues);
    if (HWY_UNLIKELY(num_values_ == kMaxValues)) {
      WarmUpOnline();
      HWY_DASSERT(IsOnline());
    }
  }

  // Returns an estimate of the true cost, mitigating the impact of noise.
  //
  // Background and observations from time measurements in `thread_pool.h`:
  // - We aim for O(1) storage because there may be hundreds of instances.
  // - The mean is biased upwards by mostly additive noise: particularly
  //   interruptions such as context switches, but also contention.
  // - The minimum is not a robust estimator because there are also "lucky
  //   shots" (1.2-1.6x lower values) where interruptions or contention happen
  //   to be low.
  // - We want to preserve information about contention and a configuration's
  //   sensitivity to it. Otherwise, we would be optimizing for the best case,
  //   not the common case.
  // - It is still important to minimize the influence of outliers, such as
  //   page faults, which can inflate measurements several-fold.
  // - Detecting outliers based only on the initial variance is too brittle: if
  //   the initial sample is narrow, too many later measurements are rejected
  //   as outliers, so estimates fluctuate across runs and the 'best'
  //   configuration varies.
  //
  // Approach:
  // - Use Winsorization to reduce the impact of outliers, while preserving
  //   information on the central tendency.
  // - Continually update the thresholds based on the online variance, with
  //   exponential smoothing for stability.
  // - Trim the initial sample via MAD or skewness for a robust estimate of the
  //   variance.
  double EstimateCost() {
    if (!IsOnline()) {
      WarmUpOnline();
      HWY_DASSERT(IsOnline());
    }
    return Mean();
  }

  // Multiplex online state into values_ to allow higher `kMaxValues`.
  // Public for inspection in tests. Do not use directly.
  double& M1() { return values_[0]; }  // Moments for variance.
  double& M2() { return values_[1]; }
  double& Mean() { return values_[2]; }  // Exponential smoothing.
  double& Stddev() { return values_[3]; }
  double& Lower() { return values_[4]; }
  double& Upper() { return values_[5]; }

 private:
  static double Median(double* to_sort, size_t n) {
    HWY_DASSERT(n >= 2);
    // F64 is supported everywhere except Armv7.
#if !HWY_ARCH_ARM_V7
    VQSort(to_sort, n, SortAscending());
#else
    // Values are known to be finite and non-negative, hence sorting as U64 is
    // equivalent.
    VQSort(reinterpret_cast<uint64_t*>(to_sort), n, SortAscending());
#endif
    if (n & 1) return to_sort[n / 2];
    // Even length: average of the two middle elements.
    return (to_sort[n / 2] + to_sort[n / 2 - 1]) * 0.5;
  }

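  // Median absolute deviation: a robust measure of spread with a ~50%
  // breakdown point, i.e. it remains meaningful even if nearly half of the
  // sample consists of outliers.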
  static double MAD(const double* values, size_t n, const double median) {
    double abs_dev[kMaxValues];
    for (size_t i = 0; i < n; ++i) {
      abs_dev[i] = ScalarAbs(values[i] - median);
    }
    return Median(abs_dev, n);
  }

  // If `num_values_` is large enough, sorts and discards outliers: either via
  // MAD, or if too many values are equal, by trimming according to skewness.
  void RemoveOutliers() {
    if (num_values_ < 3) return;  // Not enough to discard two.
    HWY_DASSERT(num_values_ <= kMaxValues);

    // Given the noise level in `auto_tune_test`, it can happen that 1/4 of the
    // sample is an outlier *in either direction*. Use median absolute
    // deviation, which is robust to almost half of the sample being outliers.
    const double median = Median(values_, num_values_);  // sorts in-place.
    const double mad = MAD(values_, num_values_, median);
    // At least half the sample is equal.
    if (mad == 0.0) {
      // Estimate skewness to decide which side to trim more.
      const double skewness =
          (values_[num_values_ - 1] - median) - (median - values_[0]);

      const size_t trim = HWY_MAX(num_values_ / 2, size_t{2});
      const size_t left =
          HWY_MAX(skewness < 0.0 ? trim * 3 / 4 : trim / 4, size_t{1});
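      // E.g. (hypothetical numbers) for the sorted sample {5,5,5,5,5,9}:
      // mad == 0, skewness == +4, trim == 3 and left == 1, so below we keep
      // the middle {5,5,5}.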
      num_values_ -= trim;
      HWY_DASSERT(num_values_ >= 1);
      memmove(values_, values_ + left, num_values_ * sizeof(values_[0]));
      return;
    }

    const double upper = median + 5.0 * mad;
    const double lower = median - 5.0 * mad;
    size_t right = num_values_ - 1;
    while (values_[right] > upper) --right;
    // Nonzero MAD implies no more than half are equal, so we did not advance
    // beyond the median.
    HWY_DASSERT(right >= num_values_ / 2);

    size_t left = 0;
    while (left < right && values_[left] < lower) ++left;
    HWY_DASSERT(left <= num_values_ / 2);
    num_values_ = right - left + 1;
    memmove(values_, values_ + left, num_values_ * sizeof(values_[0]));
  }

  double SampleMean() const {
    // Only called in non-online phase, but buffer might not be full.
    HWY_DASSERT(!IsOnline() && 0 != num_values_ && num_values_ <= kMaxValues);
    double sum = 0.0;
    for (size_t i = 0; i < num_values_; ++i) {
      sum += values_[i];
    }
    return sum / static_cast<double>(num_values_);
  }

  // Unbiased estimator for population variance even for small `num_values_`.
  double SampleVariance(double sample_mean) const {
    HWY_DASSERT(sample_mean >= 0.0);  // we checked costs are non-negative.
    // Only called in non-online phase, but buffer might not be full.
    HWY_DASSERT(!IsOnline() && 0 != num_values_ && num_values_ <= kMaxValues);
    if (HWY_UNLIKELY(num_values_ == 1)) return 0.0;  // prevent divide-by-zero.
    double sum2 = 0.0;
    for (size_t i = 0; i < num_values_; ++i) {
      const double d = values_[i] - sample_mean;
      sum2 += d * d;
    }
    return sum2 / static_cast<double>(num_values_ - 1);
  }

  bool IsOnline() const { return online_n_ > 0.0; }

  void OnlineNotify(double x) {
    // Winsorize.
    x = HWY_MIN(HWY_MAX(Lower(), x), Upper());

    // Welford's online variance estimator.
    // https://media.thinkbrg.com/wp-content/uploads/2020/06/19094655/720_720_McCrary_ImplementingAlgorithms_Whitepaper_20151119_WEB.pdf#page=7.09
    const double n_minus_1 = online_n_;
    online_n_ += 1.0;
    const double d = x - M1();
    const double d_div_n = d / online_n_;
    M1() += d_div_n;
    HWY_DASSERT(M1() >= Lower());
    M2() += d * n_minus_1 * d_div_n;  // d^2 * (N-1)/N
    // HWY_MAX avoids divide-by-zero.
    const double stddev = std::sqrt(M2() / HWY_MAX(1.0, n_minus_1));

    // Exponential smoothing.
    constexpr double kNew = 0.2;  // relatively fast update
    constexpr double kOld = 1.0 - kNew;
    Mean() = M1() * kNew + Mean() * kOld;
    Stddev() = stddev * kNew + Stddev() * kOld;

    // Update thresholds from the smoothed mean and stddev so that we can
    // recover from a too-narrow initial range caused by excessive trimming.
    Lower() = Mean() - 3.5 * Stddev();
    Upper() = Mean() + 3.5 * Stddev();
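    // E.g. (hypothetical numbers) with smoothed Mean() == 100 and
    // Stddev() == 10, subsequent samples are Winsorized to [65, 135].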
  }

  void WarmUpOnline() {
    RemoveOutliers();

    // Compute and copy before writing to `M1()`, which overwrites `values_`!
    const double sample_mean = SampleMean();
    const double sample_variance = SampleVariance(sample_mean);
    double copy[kMaxValues];
    hwy::CopyBytes(values_, copy, num_values_ * sizeof(values_[0]));

    M1() = M2() = 0.0;
    Mean() = sample_mean;
    Stddev() = std::sqrt(sample_variance);
    // For a single-value or all-equal sample, widen the range; otherwise we
    // would only accept that same value.
    if (Stddev() == 0.0) Stddev() = Mean() / 2;

    // High tolerance because the distribution is not actually Gaussian, we
    // trimmed up to *half* of the sample, and we do not want to reject too
    // many values in the online phase.
    Lower() = Mean() - 4.0 * Stddev();
    Upper() = Mean() + 4.0 * Stddev();
    // Feed the copied values into the online estimator.
    for (size_t i = 0; i < num_values_; ++i) {
      OnlineNotify(copy[i]);
    }
    HWY_DASSERT(IsOnline());

#if SIZE_MAX == 0xFFFFFFFFu
    (void)padding_;
#endif
  }

  size_t num_values_ = 0;  // valid entries in `values_`, <= `kMaxValues`
#if SIZE_MAX == 0xFFFFFFFFu
  uint32_t padding_ = 0;
#endif

  double online_n_ = 0.0;  // number of calls to `OnlineNotify`.

  double values_[kMaxValues];
};
static_assert(sizeof(CostDistribution) == 128, "");
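
// Example usage (a sketch for illustration; the cost values are hypothetical):
//
//   CostDistribution cd;
//   // Two genuine measurements plus one page-fault outlier.
//   cd.Notify(100.0);
//   cd.Notify(110.0);
//   cd.Notify(900.0);  // outlier
//   // Returns a robust estimate near 100-110; the outlier is trimmed or
//   // Winsorized rather than dominating the mean.
//   const double cost = cd.EstimateCost();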

// Implements a counter with wrap-around, plus the ability to skip values.
// O(1) time, O(N) space via doubly-linked list of indices.
class NextWithSkip {
 public:
  NextWithSkip() {}
  explicit NextWithSkip(size_t num) {
    links_.reserve(num);
    for (size_t i = 0; i < num; ++i) {
      links_.emplace_back(i, num);
    }
  }

  size_t Next(size_t pos) {
    HWY_DASSERT(pos < links_.size());
    HWY_DASSERT(!links_[pos].IsRemoved());
    return links_[pos].Next();
  }

  // Must not be called for an already skipped position. Ignores an attempt to
  // skip the last remaining position.
  void Skip(size_t pos) {
    HWY_DASSERT(!links_[pos].IsRemoved());  // not already skipped.
    const size_t prev = links_[pos].Prev();
    const size_t next = links_[pos].Next();
    if (prev == pos || next == pos) return;  // last remaining position.
    links_[next].SetPrev(prev);
    links_[prev].SetNext(next);
    links_[pos].Remove();
  }

 private:
  // Combine prev/next into one array to improve locality/reduce allocations.
  class Link {
    // Bit-shifts avoid potentially expensive 16-bit loads. Store `next` at the
    // top and `prev` at the bottom for extraction with a single shift/AND.
    // There may be hundreds of configurations, so 8 bits are not enough.
    static constexpr size_t kBits = 14;
    static constexpr size_t kShift = 32 - kBits;
    static constexpr uint32_t kMaxNum = 1u << kBits;
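    // Resulting 32-bit layout: bits [31:18] hold `next`, bit 14 is the
    // 'removed' flag, bits [13:0] hold `prev`; bits [17:15] are unused.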

   public:
    Link(size_t pos, size_t num) {
      HWY_DASSERT(num < kMaxNum);
      const size_t prev = pos == 0 ? num - 1 : pos - 1;
      const size_t next = pos == num - 1 ? 0 : pos + 1;
      bits_ = (static_cast<uint32_t>(next) << kShift) |
              static_cast<uint32_t>(prev);
      HWY_DASSERT(Next() == next && Prev() == prev);
      HWY_DASSERT(!IsRemoved());
    }

    bool IsRemoved() const { return (bits_ & kMaxNum) != 0; }
    void Remove() { bits_ |= kMaxNum; }

    size_t Next() const { return bits_ >> kShift; }
    size_t Prev() const { return bits_ & (kMaxNum - 1); }

    void SetNext(size_t next) {
      HWY_DASSERT(next < kMaxNum);
      bits_ &= (~0u >> kBits);  // clear old next
      bits_ |= static_cast<uint32_t>(next) << kShift;
      HWY_DASSERT(Next() == next);
      HWY_DASSERT(!IsRemoved());
    }
    void SetPrev(size_t prev) {
      HWY_DASSERT(prev < kMaxNum);
      bits_ &= ~(kMaxNum - 1);  // clear old prev
      bits_ |= static_cast<uint32_t>(prev);
      HWY_DASSERT(Prev() == prev);
      HWY_DASSERT(!IsRemoved());
    }

   private:
    uint32_t bits_;
  };
  std::vector<Link> links_;
};
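
// Example (a sketch for illustration, not part of any existing test):
//
//   NextWithSkip list(4);       // cycle 0 -> 1 -> 2 -> 3 -> 0
//   size_t pos = list.Next(0);  // == 1
//   list.Skip(2);               // cycle is now 0 -> 1 -> 3 -> 0
//   pos = list.Next(1);         // == 3
//   pos = list.Next(3);         // == 0 (wrap-around)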

// State machine for choosing at runtime the lowest-cost `Config`, which is
// typically a struct containing multiple parameters. For an introduction, see
// "Auto-Tuning and Performance Portability on Heterogeneous Hardware".
//
// **Which parameters**
// Simple parameters such as the L2 cache size can be queried directly via
// `hwy/contrib/thread_pool/topology.h`. Difficult-to-predict parameters such
// as task granularity are more appropriate for auto-tuning. We also suggest
// that at least some parameters be 'algorithm variants' such as parallel vs.
// serial, or 2D tiling vs. 1D striping.
//
// **Search strategy**
// To guarantee the optimal result, we use exhaustive search, which is suitable
// for around 10 parameters and a few hundred combinations of 'candidate'
// configurations.
//
// **How to generate candidates**
// To keep this framework simple and generic, applications enumerate the search
// space and pass the list of all feasible candidates to `SetCandidates` before
// the first call to `NextConfig`. Applications should prune the space as much
// as possible, e.g. by upper-bounding parameters based on the known cache
// sizes, and by applying constraints such as one parameter being a multiple of
// another.
//
// **Usage**
// Applications typically branch to the code implementing the configuration
// returned by `NextConfig`, measure the cost of running it, and pass that to
// `NotifyCost`; see the example below this comment. Branching avoids the
// complexity and opaqueness of a JIT. The number of branches can be reduced
// (at the cost of code size) by inlining low-level decisions into larger code
// regions, e.g. by hoisting them outside hot loops.
//
// **What is cost**
// Cost is an arbitrary `uint64_t`, with lower values being better. Most
// applications will use the elapsed time. If the tasks being tuned are short,
// it is important to use a high-resolution timer such as `hwy/timer.h`. Energy
// may also be useful [https://www.osti.gov/servlets/purl/1361296].
//
// **Online vs. offline**
// Although applications could auto-tune once, offline, it may be difficult to
// ensure the stored configuration still applies to the current circumstances.
// Thus we recommend online auto-tuning, re-discovering the configuration on
// each run. We assume the overhead of bookkeeping and measuring cost is
// negligible relative to the actual work. The cost of auto-tuning is then that
// of running sub-optimal configurations. Assuming the best configuration is
// better than the baseline, and the work is performed many thousands of times,
// this cost is outweighed by the benefits.
//
// **kMinSamples**
// To further reduce overhead, after `kMinSamples` rounds (= measurements of
// each configuration) we start excluding configurations from further
// measurements if they are sufficiently worse than the current best.
// `kMinSamples` can be several dozen when the tasks being tuned take a few
// microseconds. Even for longer tasks, it should be at least 2 for some noise
// tolerance. After this, there are another `kMinSamples / 2 + 1` rounds before
// declaring the winner.
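//
// Example (a minimal sketch; `MyConfig`, `EnumerateCandidates`, `RunWith` and
// `TimerTicks` are hypothetical application code, the latter standing in for
// a high-resolution timer such as `hwy/timer.h`):
//
//   AutoTune<MyConfig> tuner;
//   for (;;) {  // hot region, executed many times
//     if (const MyConfig* best = tuner.Best()) {
//       RunWith(*best);  // steady state: tuning finished, reuse the winner.
//       continue;
//     }
//     if (!tuner.HasCandidates()) tuner.SetCandidates(EnumerateCandidates());
//     const MyConfig& config = tuner.NextConfig();
//     const uint64_t t0 = TimerTicks();
//     RunWith(config);
//     tuner.NotifyCost(TimerTicks() - t0);
//   }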
template <typename Config, size_t kMinSamples = 2>
class AutoTune {
 public:
  // Returns non-null best configuration if auto-tuning has already finished.
  // Otherwise, callers continue calling `NextConfig` and `NotifyCost`.
  // Points into `Candidates()`.
  const Config* Best() const { return best_; }

  // If false, caller must call `SetCandidates` before `NextConfig`.
  bool HasCandidates() const {
    HWY_DASSERT(!Best());
    return !candidates_.empty();
  }
  // WARNING: invalidates `Best()`; do not call if that is non-null.
  void SetCandidates(std::vector<Config> candidates) {
    HWY_DASSERT(!Best() && !HasCandidates());
    candidates_.swap(candidates);
    HWY_DASSERT(HasCandidates());
    costs_.resize(candidates_.size());
    list_ = NextWithSkip(candidates_.size());
  }

  // Typically called after `Best()` is non-null to compare all candidates'
  // costs.
  Span<const Config> Candidates() const {
    HWY_DASSERT(HasCandidates());
    return Span<const Config>(candidates_.data(), candidates_.size());
  }
  Span<CostDistribution> Costs() {
    return Span<CostDistribution>(costs_.data(), costs_.size());
  }

  // Returns the current `Config` to measure.
  const Config& NextConfig() const {
    HWY_DASSERT(!Best() && HasCandidates());
    return candidates_[config_idx_];
  }

  // O(1) except at the end of each round, which is O(N).
  void NotifyCost(uint64_t cost) {
    HWY_DASSERT(!Best() && HasCandidates());

    costs_[config_idx_].Notify(static_cast<double>(cost));
    // Save now, before we update `config_idx_`.
    const size_t my_idx = config_idx_;
    // Only retrieve the estimate once we have enough samples; otherwise we
    // would switch to online variance before the buffer is populated.
    const double my_cost = rounds_complete_ >= kMinSamples
                               ? costs_[config_idx_].EstimateCost()
                               : 0.0;

    // Advance to the next non-skipped config, with wrap-around. This
    // decorrelates measurements by not immediately re-measuring the same
    // config.
    config_idx_ = list_.Next(config_idx_);
    // Might still equal `my_idx` if this is the only non-skipped config.

    // Disqualify from future `NextConfig` if the cost was too far beyond the
    // current best. This reduces the number of measurements, while tolerating
    // noise in the first few measurements. Must happen after advancing.
    if (my_cost > skip_if_above_) {
      list_.Skip(my_idx);
    }

    // Wrap-around indicates the round is complete.
    if (HWY_UNLIKELY(config_idx_ <= my_idx)) {
      ++rounds_complete_;

      // Enough samples for stable estimates: update the skip threshold
      // (25% above the current best).
      if (rounds_complete_ >= kMinSamples) {
        double best_cost = HighestValue<double>();
        size_t idx_min = 0;
        for (size_t i = 0; i < candidates_.size(); ++i) {
          const double estimate = costs_[i].EstimateCost();
          if (estimate < best_cost) {
            best_cost = estimate;
            idx_min = i;
          }
        }
        skip_if_above_ = best_cost * 1.25;

        // After sufficient rounds, declare the winner.
        if (HWY_UNLIKELY(rounds_complete_ == 3 * kMinSamples / 2 + 1)) {
          best_ = &candidates_[idx_min];
          HWY_DASSERT(Best());
        }
      }
    }
  }

  // Avoid printing during the first few rounds, because early measurements are
  // noisy and poor candidates have not yet been skipped.
  bool ShouldPrint() { return rounds_complete_ > kMinSamples; }

 private:
  const Config* best_ = nullptr;
  std::vector<Config> candidates_;
  std::vector<CostDistribution> costs_;  // one per candidate
  size_t config_idx_ = 0;                // [0, candidates_.size())
  NextWithSkip list_;
  size_t rounds_complete_ = 0;

  double skip_if_above_ = 0.0;
};

}  // namespace hwy

#endif  // HIGHWAY_HWY_AUTO_TUNE_H_