// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_PROFILER_H_
#define HIGHWAY_HWY_PROFILER_H_

// High precision, low overhead time measurements. Returns exact call counts
// and total elapsed time for user-defined 'zones' (code regions, i.e. C++
// scopes).
//
// Uses RAII to capture begin/end timestamps, with user-specified zone names:
//   { PROFILER_ZONE("name"); /*code*/ } or
// the name of the current function:
//   void FuncToMeasure() { PROFILER_FUNC; /*code*/ }.
//
// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to
// print call counts and average durations [CPU cycles] to stdout, sorted in
// descending order of total duration.
//
// The binary MUST be built with --dynamic_mode=off because we rely on the
// data segments being nearby; if not, an assertion will likely fail.
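//
// Example usage (a minimal sketch; function names are illustrative, and the
// profiler must be compiled with PROFILER_ENABLED=1 for zones to be recorded):
//
//   #include "third_party/highway/hwy/profiler.h"
//
//   void LoadInput() {
//     PROFILER_FUNC;  // zone named after the enclosing function
//     // ... work ...
//   }
//
//   int main() {
//     LoadInput();
//     { PROFILER_ZONE("Compute"); /* ... work ... */ }
//     PROFILER_PRINT_RESULTS();  // after all threads have exited their zones
//     return 0;
//   }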

#include "third_party/highway/hwy/base.h"

// Configuration settings:

// If zero, this file has no effect and no measurements will be recorded.
#ifndef PROFILER_ENABLED
#define PROFILER_ENABLED 0
#endif

// How many mebibytes to allocate (if PROFILER_ENABLED) per thread that
// enters at least one zone. Once this buffer is full, the thread will analyze
// and discard packets, thus temporarily adding some observer overhead.
// Each zone occupies 16 bytes.
#ifndef PROFILER_THREAD_STORAGE
#define PROFILER_THREAD_STORAGE 200ULL
#endif
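
// Both settings can be overridden at build time (a sketch; the exact flag
// syntax depends on the compiler and build system). For example, enabling the
// profiler with 64 MiB of per-thread storage:
//   -DPROFILER_ENABLED=1 -DPROFILER_THREAD_STORAGE=64
// With the default 200 MiB and 16 bytes per zone, a thread can record roughly
// 13 million zone entry/exit pairs before analysis kicks in.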

#if PROFILER_ENABLED || HWY_IDE

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>  // strcmp

#include <atomic>

#include "third_party/highway/hwy/aligned_allocator.h"
#include "third_party/highway/hwy/cache_control.h"  // FlushStream
#include "third_party/highway/hwy/contrib/sort/vqsort.h"
#include "third_party/highway/hwy/robust_statistics.h"
#include "third_party/highway/hwy/timer.h"

#define PROFILER_PRINT_OVERHEAD 0

namespace hwy {

// Upper bounds for fixed-size data structures (guarded via HWY_DASSERT):

// How many threads can actually enter a zone (those that don't do not count).
// Memory use is about kMaxThreads * PROFILER_THREAD_STORAGE MiB.
// WARNING: a fiber library can spawn hundreds of threads.
static constexpr size_t kMaxThreads = 256;
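// For example, with the defaults (kMaxThreads = 256, PROFILER_THREAD_STORAGE
// = 200), the worst case is 256 * 200 MiB = 50 GiB; in practice, storage is
// only allocated for threads that actually enter a zone.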

static constexpr size_t kMaxDepth = 64;  // Maximum nesting of zones.

static constexpr size_t kMaxZones = 256;  // Total number of zones.

#pragma pack(push, 1)

// Represents zone entry/exit events. Stores a full-resolution timestamp plus
// an offset (representing zone name or identifying exit packets). POD.
class Packet {
 public:
  // If offsets do not fit, UpdateOrAdd will overrun our heap allocation
  // (governed by kMaxZones). We have seen multi-megabyte offsets.
  static constexpr size_t kOffsetBits = 25;
  static constexpr uint64_t kOffsetBias = 1ULL << (kOffsetBits - 1);

  // We need full-resolution timestamps; the remaining 39 bits at an effective
  // rate of 4 GHz permit zone durations of about two minutes (for longer
  // durations, split into multiple zones). Wraparound is handled by masking.
  static constexpr size_t kTimestampBits = 64 - kOffsetBits;
  static constexpr uint64_t kTimestampMask = (1ULL << kTimestampBits) - 1;

  static Packet Make(const size_t biased_offset, const uint64_t timestamp) {
    HWY_DASSERT(biased_offset != 0);
    HWY_DASSERT(biased_offset < (1ULL << kOffsetBits));

    Packet packet;
    packet.bits_ =
        (biased_offset << kTimestampBits) + (timestamp & kTimestampMask);

    HWY_DASSERT(packet.BiasedOffset() == biased_offset);
    HWY_DASSERT(packet.Timestamp() == (timestamp & kTimestampMask));
    return packet;
  }

  uint64_t Timestamp() const { return bits_ & kTimestampMask; }

  size_t BiasedOffset() const {
    const size_t biased_offset = (bits_ >> kTimestampBits);
    HWY_DASSERT(biased_offset != 0);
    HWY_DASSERT(biased_offset < (1ULL << kOffsetBits));
    return biased_offset;
  }

 private:
  uint64_t bits_;
};
static_assert(sizeof(Packet) == 8, "Wrong Packet size");

// All translation units must use the same string origin. A static member
// function ensures this without requiring a separate .cc file.
struct StringOrigin {
  // Returns the address of a string literal. Assuming zone names are also
  // literals and stored nearby, we can represent them as offsets from this,
  // which is faster to compute than hashes or even a static index.
  static const char* Get() {
    // Chosen such that no zone name is a prefix or suffix of this string,
    // to ensure they aren't merged. Note that zone exit packets use
    // `biased_offset == kOffsetBias`.
    static const char* string_origin = "__#__";
    return string_origin - Packet::kOffsetBias;
  }
};
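
// For example (a sketch; the addresses are purely illustrative): if a zone
// name literal "Render" happens to reside 1000 bytes after the "__#__"
// literal, its biased offset is
//   "Render" - StringOrigin::Get() == Packet::kOffsetBias + 1000,
// which is nonzero and distinct from the exit marker kOffsetBias, and fits in
// Packet::kOffsetBits as long as the literals are stored close together.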

// Representation of an active zone, stored in a stack. Used to deduct
// child duration from the parent's self time. POD.
struct Node {
  Packet packet;
  uint64_t child_total;
};
static_assert(sizeof(Node) == 16, "Wrong Node size");

// Holds statistics for all zones with the same name. POD.
struct Accumulator {
  static constexpr size_t kNumCallBits = 64 - Packet::kOffsetBits;

  uint64_t BiasedOffset() const {
    const size_t biased_offset = u128.lo >> kNumCallBits;
    HWY_DASSERT(biased_offset != 0);
    HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits));
    return biased_offset;
  }
  uint64_t NumCalls() const { return u128.lo & ((1ULL << kNumCallBits) - 1); }
  uint64_t Duration() const { return u128.hi; }

  void Set(uint64_t biased_offset, uint64_t num_calls, uint64_t duration) {
    HWY_DASSERT(biased_offset != 0);
    HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits));
    HWY_DASSERT(num_calls < (1ULL << kNumCallBits));

    u128.hi = duration;
    u128.lo = (biased_offset << kNumCallBits) + num_calls;

    HWY_DASSERT(BiasedOffset() == biased_offset);
    HWY_DASSERT(NumCalls() == num_calls);
    HWY_DASSERT(Duration() == duration);
  }

  void Add(uint64_t num_calls, uint64_t duration) {
    const uint64_t biased_offset = BiasedOffset();
    (void)biased_offset;

    u128.lo += num_calls;
    u128.hi += duration;

    HWY_DASSERT(biased_offset == BiasedOffset());
  }

  // For fast sorting by duration, which must therefore be the hi element.
  // lo holds BiasedOffset and NumCalls.
  uint128_t u128;
};
static_assert(sizeof(Accumulator) == 16, "Wrong Accumulator size");

template <typename T>
inline T ClampedSubtract(const T minuend, const T subtrahend) {
  if (subtrahend > minuend) {
    return 0;
  }
  return minuend - subtrahend;
}

// Per-thread call graph (stack) and Accumulator for each zone.
class Results {
 public:
  Results() {
    ZeroBytes(nodes_, sizeof(nodes_));
    ZeroBytes(zones_, sizeof(zones_));
  }

  // Used for computing overhead when this thread encounters its first Zone.
  // This has no observable effect apart from increasing "analyze_elapsed_".
  uint64_t ZoneDuration(const Packet* packets) {
    HWY_DASSERT(depth_ == 0);
    HWY_DASSERT(num_zones_ == 0);
    AnalyzePackets(packets, 2);
    const uint64_t duration = zones_[0].Duration();
    zones_[0].Set(1, 0, 0);  // avoids triggering biased_offset = 0 checks
    HWY_DASSERT(depth_ == 0);
    num_zones_ = 0;
    return duration;
  }

  void SetSelfOverhead(const uint64_t self_overhead) {
    self_overhead_ = self_overhead;
  }

  void SetChildOverhead(const uint64_t child_overhead) {
    child_overhead_ = child_overhead;
  }

  // Draw all required information from the packets, which can be discarded
  // afterwards. Called whenever this thread's storage is full.
  void AnalyzePackets(const Packet* packets, const size_t num_packets) {
    const uint64_t t0 = timer::Start();

    for (size_t i = 0; i < num_packets; ++i) {
      const Packet p = packets[i];
      // Entering a zone
      if (p.BiasedOffset() != Packet::kOffsetBias) {
        HWY_DASSERT(depth_ < kMaxDepth);
        nodes_[depth_].packet = p;
        HWY_DASSERT(p.BiasedOffset() != 0);
        nodes_[depth_].child_total = 0;
        ++depth_;
        continue;
      }

      HWY_DASSERT(depth_ != 0);
      const Node& node = nodes_[depth_ - 1];
      // Masking correctly handles unsigned wraparound.
      const uint64_t duration =
          (p.Timestamp() - node.packet.Timestamp()) & Packet::kTimestampMask;
      const uint64_t self_duration = ClampedSubtract(
          duration, self_overhead_ + child_overhead_ + node.child_total);

      UpdateOrAdd(node.packet.BiasedOffset(), 1, self_duration);
      --depth_;

      // Deduct this nested node's time from its parent's self_duration.
      if (depth_ != 0) {
        nodes_[depth_ - 1].child_total += duration + child_overhead_;
      }
    }

    const uint64_t t1 = timer::Stop();
    analyze_elapsed_ += t1 - t0;
  }

  // Incorporates results from another thread. Call after all threads have
  // exited any zones.
  void Assimilate(Results& other) {
    const uint64_t t0 = timer::Start();
    HWY_DASSERT(depth_ == 0);
    HWY_DASSERT(other.depth_ == 0);

    for (size_t i = 0; i < other.num_zones_; ++i) {
      const Accumulator& zone = other.zones_[i];
      UpdateOrAdd(zone.BiasedOffset(), zone.NumCalls(), zone.Duration());
    }
    other.num_zones_ = 0;
    const uint64_t t1 = timer::Stop();
    analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
  }

  // Single-threaded.
  void Print() {
    const uint64_t t0 = timer::Start();
    MergeDuplicates();

    // Sort by decreasing total (self) cost.
    VQSort(&zones_[0].u128, num_zones_, SortDescending());

    const double inv_freq = 1.0 / platform::InvariantTicksPerSecond();

    const char* string_origin = StringOrigin::Get();
    for (size_t i = 0; i < num_zones_; ++i) {
      const Accumulator& z = zones_[i];
      const size_t num_calls = z.NumCalls();
      const double duration = static_cast<double>(z.Duration());
      printf("%-40s: %10zu x %15.0f = %9.6f\n",
             string_origin + z.BiasedOffset(), num_calls, duration / num_calls,
             duration * inv_freq);
    }
    num_zones_ = 0;

    const uint64_t t1 = timer::Stop();
    analyze_elapsed_ += t1 - t0;
    printf("Total analysis [s]: %f\n",
           static_cast<double>(analyze_elapsed_) * inv_freq);
  }

 private:
  // Updates an existing Accumulator (uniquely identified by biased_offset) or
  // adds one if this is the first time this thread analyzed that zone.
  // Uses a self-organizing list data structure, which avoids dynamic memory
  // allocations and is far faster than unordered_map.
  void UpdateOrAdd(const size_t biased_offset, const uint64_t num_calls,
                   const uint64_t duration) {
    HWY_DASSERT(biased_offset != 0);
    HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits));

    // Special case for first zone: (maybe) update, without swapping.
    if (num_zones_ != 0 && zones_[0].BiasedOffset() == biased_offset) {
      zones_[0].Add(num_calls, duration);
      return;
    }

    // Look for a zone with the same offset.
    for (size_t i = 1; i < num_zones_; ++i) {
      if (zones_[i].BiasedOffset() == biased_offset) {
        zones_[i].Add(num_calls, duration);
        // Swap with predecessor (more conservative than move to front,
        // but at least as successful).
        const Accumulator prev = zones_[i - 1];
        zones_[i - 1] = zones_[i];
        zones_[i] = prev;
        return;
      }
    }

    // Not found; create a new Accumulator.
    HWY_DASSERT(num_zones_ < kMaxZones);
    zones_[num_zones_].Set(biased_offset, num_calls, duration);
    ++num_zones_;
  }

  // Each instantiation of a function template seems to get its own copy of
  // __func__ and GCC doesn't merge them. An N^2 search for duplicates is
  // acceptable because we only expect a few dozen zones.
  void MergeDuplicates() {
    const char* string_origin = StringOrigin::Get();
    for (size_t i = 0; i < num_zones_; ++i) {
      const size_t biased_offset = zones_[i].BiasedOffset();
      const char* name = string_origin + biased_offset;
      // Separate num_calls from biased_offset so we can add them together.
      uint64_t num_calls = zones_[i].NumCalls();

      // Add any subsequent duplicates to num_calls and total_duration.
      for (size_t j = i + 1; j < num_zones_;) {
        if (!strcmp(name, string_origin + zones_[j].BiasedOffset())) {
          num_calls += zones_[j].NumCalls();
          zones_[i].Add(0, zones_[j].Duration());
          // j was the last zone: drop it, and we are done. (Breaking without
          // shrinking num_zones_ would leave a duplicate that is counted
          // twice.)
          if (j == --num_zones_) break;
          // Replace current zone with the last one, and check it next.
          zones_[j] = zones_[num_zones_];
        } else {  // Name differed, try next Accumulator.
          ++j;
        }
      }

      // Re-pack regardless of whether any duplicates were found.
      zones_[i].Set(biased_offset, num_calls, zones_[i].Duration());
    }
  }

  uint64_t analyze_elapsed_ = 0;
  uint64_t self_overhead_ = 0;
  uint64_t child_overhead_ = 0;

  size_t depth_ = 0;      // Number of active zones.
  size_t num_zones_ = 0;  // Number of retired zones.

  alignas(HWY_ALIGNMENT) Node nodes_[kMaxDepth];         // Stack
  alignas(HWY_ALIGNMENT) Accumulator zones_[kMaxZones];  // Self-organizing list
};

// Per-thread packet storage, dynamically allocated.
class ThreadSpecific {
  static constexpr size_t kBufferCapacity = HWY_ALIGNMENT / sizeof(Packet);
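  // With the typical HWY_ALIGNMENT of 64 bytes and 8-byte packets, this is
  // 8 packets, i.e. one cache line per StreamCacheLine call.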

 public:
  // "name" is used to sanity-check offsets fit in kOffsetBits.
  explicit ThreadSpecific(const char* name)
      : max_packets_((PROFILER_THREAD_STORAGE << 20) / sizeof(Packet)),
        packets_(AllocateAligned<Packet>(max_packets_)),
        num_packets_(0),
        string_origin_(StringOrigin::Get()) {
    // Even in optimized builds, verify that this zone's name offset fits
    // within the allotted space. If not, UpdateOrAdd is likely to overrun
    // zones_[]. Checking here on the cold path (only reached once per thread)
    // is cheap, but it only covers one zone.
    const size_t biased_offset = name - string_origin_;
    HWY_ASSERT(biased_offset < (1ULL << Packet::kOffsetBits));
  }

  // Depends on Zone => defined below.
  void ComputeOverhead();

  void WriteEntry(const char* name, const uint64_t timestamp) {
    HWY_DASSERT(name >= string_origin_);
    const size_t biased_offset = static_cast<size_t>(name - string_origin_);
    Write(Packet::Make(biased_offset, timestamp));
  }

  void WriteExit(const uint64_t timestamp) {
    const size_t biased_offset = Packet::kOffsetBias;
    Write(Packet::Make(biased_offset, timestamp));
  }

  void AnalyzeRemainingPackets() {
    // Ensures prior weakly-ordered streaming stores are globally visible.
    FlushStream();

    // Storage full => empty it.
    if (num_packets_ + buffer_size_ > max_packets_) {
      results_.AnalyzePackets(packets_.get(), num_packets_);
      num_packets_ = 0;
    }
    CopyBytes(buffer_, packets_.get() + num_packets_,
              buffer_size_ * sizeof(Packet));
    num_packets_ += buffer_size_;

    results_.AnalyzePackets(packets_.get(), num_packets_);
    num_packets_ = 0;
  }

  Results& GetResults() { return results_; }

 private:
  // Overwrites "to" while attempting to bypass the cache, i.e. avoiding the
  // read-for-ownership a normal store would incur. Both pointers must be
  // aligned.
  static void StreamCacheLine(const uint64_t* HWY_RESTRICT from,
                              uint64_t* HWY_RESTRICT to) {
#if HWY_COMPILER_CLANG
    for (size_t i = 0; i < HWY_ALIGNMENT / sizeof(uint64_t); ++i) {
      __builtin_nontemporal_store(from[i], to + i);
    }
#else
    hwy::CopyBytes(from, to, HWY_ALIGNMENT);
#endif
  }

  // Write packet to buffer/storage, emptying them as needed.
  void Write(const Packet packet) {
    // Buffer full => copy to storage.
    if (buffer_size_ == kBufferCapacity) {
      // Storage full => empty it.
      if (num_packets_ + kBufferCapacity > max_packets_) {
        results_.AnalyzePackets(packets_.get(), num_packets_);
        num_packets_ = 0;
      }
      // This buffering halves observer overhead and decreases the overall
      // runtime by about 3%. Casting is safe because Packet's only member is
      // a uint64_t.
      StreamCacheLine(
          reinterpret_cast<const uint64_t*>(buffer_),
          reinterpret_cast<uint64_t*>(packets_.get() + num_packets_));
      num_packets_ += kBufferCapacity;
      buffer_size_ = 0;
    }
    buffer_[buffer_size_] = packet;
    ++buffer_size_;
  }

  // Write-combining buffer to avoid cache pollution. Must be the first
  // non-static member to ensure cache-line alignment.
  Packet buffer_[kBufferCapacity];
  size_t buffer_size_ = 0;

  const size_t max_packets_;
  // Contiguous storage for zone enter/exit packets.
  AlignedFreeUniquePtr<Packet[]> packets_;
  size_t num_packets_;
  // Cached here because we already read this cache line on zone entry/exit.
  const char* string_origin_;
  Results results_;
};

class ThreadList {
 public:
  // Called from any thread.
  ThreadSpecific* Add(const char* name) {
    const size_t index = num_threads_.fetch_add(1, std::memory_order_relaxed);
    HWY_DASSERT(index < kMaxThreads);

    ThreadSpecific* ts = MakeUniqueAligned<ThreadSpecific>(name).release();
    threads_[index].store(ts, std::memory_order_release);
    return ts;
  }

  // Single-threaded.
  void PrintResults() {
    const auto acq = std::memory_order_acquire;
    const size_t num_threads = num_threads_.load(acq);
    // Nothing to print (and threads_[0] is null) if no thread entered a zone.
    if (num_threads == 0) return;

    ThreadSpecific* main = threads_[0].load(acq);
    main->AnalyzeRemainingPackets();

    for (size_t i = 1; i < num_threads; ++i) {
      ThreadSpecific* ts = threads_[i].load(acq);
      ts->AnalyzeRemainingPackets();
      main->GetResults().Assimilate(ts->GetResults());
    }

    main->GetResults().Print();
  }

 private:
  // Owning pointers.
  alignas(64) std::atomic<ThreadSpecific*> threads_[kMaxThreads];
  std::atomic<size_t> num_threads_{0};
};

// RAII zone enter/exit recorder constructed by the PROFILER_ZONE/PROFILER_FUNC
// macros; also responsible for initializing ThreadSpecific.
class Zone {
 public:
  // "name" must be a string literal (see StringOrigin::Get).
  HWY_NOINLINE explicit Zone(const char* name) {
    HWY_FENCE;
    ThreadSpecific* HWY_RESTRICT thread_specific = StaticThreadSpecific();
    if (HWY_UNLIKELY(thread_specific == nullptr)) {
      // Ensure the CPU supports our timer.
      char cpu[100];
      if (!platform::HaveTimerStop(cpu)) {
        HWY_ABORT("CPU %s is too old for PROFILER_ENABLED=1, exiting", cpu);
      }

      thread_specific = StaticThreadSpecific() = Threads().Add(name);
      // Must happen after setting StaticThreadSpecific, because
      // ComputeOverhead also calls Zone().
      thread_specific->ComputeOverhead();
    }

    // (Capture timestamp ASAP, not inside WriteEntry.)
    HWY_FENCE;
    const uint64_t timestamp = timer::Start();
    thread_specific->WriteEntry(name, timestamp);
  }

  HWY_NOINLINE ~Zone() {
    HWY_FENCE;
    const uint64_t timestamp = timer::Stop();
    StaticThreadSpecific()->WriteExit(timestamp);
    HWY_FENCE;
  }

  // Call exactly once after all threads have exited all zones.
  static void PrintResults() { Threads().PrintResults(); }

 private:
  // Returns reference to the thread's ThreadSpecific pointer (initially null).
  // Function-local static avoids needing a separate definition.
  static ThreadSpecific*& StaticThreadSpecific() {
    static thread_local ThreadSpecific* thread_specific;
    return thread_specific;
  }

  // Returns the singleton ThreadList. Non time-critical.
  static ThreadList& Threads() {
    static ThreadList threads_;
    return threads_;
  }
};

// Creates a zone starting from here until the end of the current scope.
// Timestamps will be recorded when entering and exiting the zone.
// "name" must be a string literal, which is ensured by concatenating with "".
#define PROFILER_ZONE(name)      \
  HWY_FENCE;                     \
  const hwy::Zone zone("" name); \
  HWY_FENCE

// Creates a zone for an entire function (when placed at its beginning).
// Shorter/more convenient than PROFILER_ZONE.
#define PROFILER_FUNC             \
  HWY_FENCE;                      \
  const hwy::Zone zone(__func__); \
  HWY_FENCE

#define PROFILER_PRINT_RESULTS hwy::Zone::PrintResults
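
// For example (a minimal sketch; function names are illustrative), nested
// zones are supported: a child's duration is deducted from its parent's self
// time, so "Parent" below reports only the time spent outside "Child":
//
//   void Child() { PROFILER_FUNC; /* ... */ }
//   void Parent() {
//     PROFILER_FUNC;
//     Child();
//     // ... work attributed to Parent's self time ...
//   }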

inline void ThreadSpecific::ComputeOverhead() {
  // Delay after capturing timestamps before/after the actual zone runs. Even
  // with frequency throttling disabled, this has a multimodal distribution,
  // including 32, 34, 48, 52, 59, 62.
  uint64_t self_overhead;
  {
    const size_t kNumSamples = 32;
    uint32_t samples[kNumSamples];
    for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
      const size_t kNumDurations = 1024;
      uint32_t durations[kNumDurations];

      for (size_t idx_duration = 0; idx_duration < kNumDurations;
           ++idx_duration) {
        {
          PROFILER_ZONE("Dummy Zone (never shown)");
        }
        const uint64_t duration = results_.ZoneDuration(buffer_);
        buffer_size_ = 0;
        durations[idx_duration] = static_cast<uint32_t>(duration);
        HWY_DASSERT(num_packets_ == 0);
      }
      robust_statistics::CountingSort(durations, kNumDurations);
      samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
    }
    // Median.
    robust_statistics::CountingSort(samples, kNumSamples);
    self_overhead = samples[kNumSamples / 2];
    if (PROFILER_PRINT_OVERHEAD) {
      printf("Overhead: %.0f\n", static_cast<double>(self_overhead));
    }
    results_.SetSelfOverhead(self_overhead);
  }

  // Delay before capturing start timestamp / after end timestamp.
  const size_t kNumSamples = 32;
  uint32_t samples[kNumSamples];
  for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
    const size_t kNumDurations = 16;
    uint32_t durations[kNumDurations];
    for (size_t idx_duration = 0; idx_duration < kNumDurations;
         ++idx_duration) {
      const size_t kReps = 10000;
      // Analysis time should not be included => all packets must fit in
      // storage without triggering AnalyzePackets during the timed loop.
      HWY_DASSERT(kReps * 2 < max_packets_);
      std::atomic_thread_fence(std::memory_order_seq_cst);
      const uint64_t t0 = timer::Start();
      for (size_t i = 0; i < kReps; ++i) {
        PROFILER_ZONE("Dummy");
      }
      FlushStream();
      const uint64_t t1 = timer::Stop();
      HWY_DASSERT(num_packets_ + buffer_size_ == kReps * 2);
      buffer_size_ = 0;
      num_packets_ = 0;
      const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
      durations[idx_duration] =
          static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
    }
    robust_statistics::CountingSort(durations, kNumDurations);
    samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
  }
  robust_statistics::CountingSort(samples, kNumSamples);
  const uint64_t child_overhead = samples[9 * kNumSamples / 10];
  if (PROFILER_PRINT_OVERHEAD) {
    printf("Child overhead: %.0f\n", static_cast<double>(child_overhead));
  }
  results_.SetChildOverhead(child_overhead);
}

#pragma pack(pop)

}  // namespace hwy

#endif  // PROFILER_ENABLED || HWY_IDE

#if !PROFILER_ENABLED && !HWY_IDE
#define PROFILER_ZONE(name)
#define PROFILER_FUNC
#define PROFILER_PRINT_RESULTS()
#endif

#endif  // HIGHWAY_HWY_PROFILER_H_