|  | // Copyright 2024 Google LLC | 
|  | // SPDX-License-Identifier: Apache-2.0 | 
|  | // | 
|  | // Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | // you may not use this file except in compliance with the License. | 
|  | // You may obtain a copy of the License at | 
|  | // | 
|  | //      http://www.apache.org/licenses/LICENSE-2.0 | 
|  | // | 
|  | // Unless required by applicable law or agreed to in writing, software | 
|  | // distributed under the License is distributed on an "AS IS" BASIS, | 
|  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | // See the License for the specific language governing permissions and | 
|  | // limitations under the License. | 
|  |  | 
|  | #ifndef HIGHWAY_HWY_PERF_COUNTERS_H_ | 
|  | #define HIGHWAY_HWY_PERF_COUNTERS_H_ | 
|  |  | 
|  | // Reads OS/CPU performance counters. | 
|  |  | 
|  | #include <stddef.h> | 
|  |  | 
|  | #include "third_party/highway/hwy/base.h"  // HWY_ABORT | 
|  | #include "third_party/highway/hwy/bit_set.h" | 
|  |  | 
|  | namespace hwy { | 
|  | namespace platform { | 
|  |  | 
|  | // Avoid padding in case callers such as profiler.h store many instances. | 
|  | #pragma pack(push, 1) | 
|  | // Provides access to CPU/OS performance counters. Each instance has space for | 
|  | // multiple counter values; which counters these are may change in future. | 
|  | // Although counters are per-CPU, Linux accesses them via a syscall, hence we | 
|  | // use the monostate pattern to avoid callers having to pass around a pointer. | 
|  | // Note that this is not thread-safe, so the static member functions should only | 
|  | // be called from the main thread. | 
|  | class PerfCounters { | 
|  | public: | 
|  | // Chosen such that this class occupies one or two cache lines. | 
|  | static constexpr size_t kCapacity = 14; | 
|  |  | 
|  | // Bit indices used to identify counters. The ordering is arbitrary. Some of | 
|  | // these counters may be 'removed' in the sense of not being visited by | 
|  | // `Foreach`, but their enumerators will remain. New counters may be appended. | 
|  | enum Counter { | 
|  | kRefCycles = 0, | 
|  | kInstructions, | 
|  | kBranches, | 
|  | kBranchMispredicts, | 
|  | kBusCycles, | 
|  | kCacheRefs, | 
|  | kCacheMisses, | 
|  | kL3Loads, | 
|  | kL3Stores, | 
|  | kPageFaults,  // SW | 
|  | kMigrations   // SW | 
|  | };  // BitSet64 requires these values to be less than 64. | 
|  |  | 
|  | // Strings for user-facing messages, not used in the implementation. | 
|  | static inline const char* Name(Counter c) { | 
|  | switch (c) { | 
|  | case kRefCycles: | 
|  | return "ref_cycles"; | 
|  | case kInstructions: | 
|  | return "instructions"; | 
|  | case kBranches: | 
|  | return "branches"; | 
|  | case kBranchMispredicts: | 
|  | return "branch_mispredicts"; | 
|  | case kBusCycles: | 
|  | return "bus_cycles"; | 
|  | case kCacheRefs: | 
|  | return "cache_refs"; | 
|  | case kCacheMisses: | 
|  | return "cache_misses"; | 
|  | case kL3Loads: | 
|  | return "l3_load"; | 
|  | case kL3Stores: | 
|  | return "l3_store"; | 
|  | case kPageFaults: | 
|  | return "page_fault"; | 
|  | case kMigrations: | 
|  | return "migration"; | 
|  | default: | 
|  | HWY_ABORT("Bug: unknown counter %d", c); | 
|  | } | 
|  | } | 
|  |  | 
|  | // Returns false if counters are unavailable. Must be called at least once | 
|  | // before `StartAll`; it is separate to reduce the overhead of repeatedly | 
|  | // stopping/starting counters. | 
|  | HWY_DLLEXPORT static bool Init(); | 
|  |  | 
|  | // Returns false if counters are unavailable, otherwise starts them. Note that | 
|  | // they default to stopped. Unless this is called, the values read may be 0. | 
|  | HWY_DLLEXPORT static bool StartAll(); | 
|  |  | 
|  | // Stops and zeros all counters. This is not necessary if users subtract the | 
|  | // previous counter values, but can increase precision because floating-point | 
|  | // has more precision near zero. | 
|  | HWY_DLLEXPORT static void StopAllAndReset(); | 
|  |  | 
|  | // Reads the current (extrapolated, in case of multiplexing) counter values. | 
|  | HWY_DLLEXPORT PerfCounters(); | 
|  |  | 
|  | // Returns whether any counters were successfully read. | 
|  | bool AnyValid() const { return valid_.Any(); } | 
|  |  | 
|  | // Returns whether the given counter was successfully read. | 
|  | bool IsValid(Counter c) const { | 
|  | const size_t bit_idx = static_cast<size_t>(c); | 
|  | return valid_.Get(bit_idx); | 
|  | } | 
|  |  | 
|  | // Returns the maximum extrapolation factor for any counter, which is the | 
|  | // total time between `StartAll` and now or the last `StopAllAndReset`, | 
|  | // divided by the time that the counter was actually running. This | 
|  | // approximates the number of counter groups that the CPU multiplexes onto the | 
|  | // actual counter hardware. It is only meaningful if AnyValid(). | 
|  | double MaxExtrapolate() const { return max_extrapolate_; } | 
|  |  | 
|  | // Returns the value of the given counter, or zero if it is not valid. | 
|  | double Get(Counter c) const { | 
|  | return IsValid(c) ? values_[IndexForCounter(c)] : 0.0; | 
|  | } | 
|  |  | 
|  | // For each valid counter in increasing numerical order, calls `visitor` with | 
|  | // the value and `Counter`. | 
|  | template <class Visitor> | 
|  | void Foreach(const Visitor& visitor) { | 
|  | valid_.Foreach([&](size_t bit_idx) { | 
|  | const Counter c = static_cast<Counter>(bit_idx); | 
|  | visitor(values_[IndexForCounter(c)], c); | 
|  | }); | 
|  | } | 
|  |  | 
|  | private: | 
|  | // Index within `values_` for a given counter. | 
|  | HWY_DLLEXPORT static size_t IndexForCounter(Counter c); | 
|  |  | 
|  | BitSet64 valid_; | 
|  | double max_extrapolate_; | 
|  | // Floating-point because these are extrapolated (multiplexing). It would be | 
|  | // nice for this to fit in one cache line to reduce the cost of reading | 
|  | // counters in profiler.h, but some of the values are too large for float and | 
|  | // we want more than 8 counters. Ensure all values are sums, not ratios, so | 
|  | // that profiler.h can add/subtract them. These are contiguous in memory, in | 
|  | // the order that counters were initialized. | 
|  | double values_[kCapacity]; | 
|  | }; | 
|  | #pragma pack(pop) | 
|  |  | 
|  | }  // namespace platform | 
|  | }  // namespace hwy | 
|  |  | 
|  | #endif  // HIGHWAY_HWY_PERF_COUNTERS_H_ |