| // Copyright 2017 Google Inc. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #ifndef HIGHWAY_HWY_PROFILER_H_ |
| #define HIGHWAY_HWY_PROFILER_H_ |
| |
| // High precision, low overhead time measurements. Reports exact call counts |
| // and total elapsed time for user-defined 'zones' (code regions, i.e. C++ |
| // scopes). |
| // |
| // Uses RAII to capture begin/end timestamps, with user-specified zone names: |
| // { PROFILER_ZONE("name"); /*code*/ } or |
| // the name of the current function: |
| // void FuncToMeasure() { PROFILER_FUNC; /*code*/ }. |
| // |
| // After all threads have exited all zones, invoke PROFILER_PRINT_RESULTS() to |
| // print each zone's call count, average self-duration [CPU cycles] and total |
| // self-duration [seconds] to stdout, sorted in descending order of total |
| // duration. |
| // |
| // The binary MUST be built with --dynamic_mode=off because we rely on the |
| // string literals (zone names) residing in nearby data segments; if not, an |
| // assertion will likely fail. |
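| // |
| // Illustrative end-to-end sketch; the function and zone names below are |
| // placeholders, not part of this header: |
| // |
| //   void DoWork() { |
| //     PROFILER_FUNC;                // zone named "DoWork" |
| //     { |
| //       PROFILER_ZONE("Checksum");  // nested zone; its time is deducted |
| //       /* compute checksum */      // from DoWork's self time. |
| //     } |
| //   } |
| // |
| //   int main() { |
| //     DoWork(); |
| //     PROFILER_PRINT_RESULTS();  // once, after all threads exited all zones |
| //     return 0; |
| //   } |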
| |
| #include "third_party/highway/hwy/base.h" |
| |
| // Configuration settings: |
| |
| // If zero, this file has no effect and no measurements will be recorded. |
| #ifndef PROFILER_ENABLED |
| #define PROFILER_ENABLED 0 |
| #endif |
| |
| // How many mebibytes to allocate (if PROFILER_ENABLED) per thread that |
| // enters at least one zone. Once this buffer is full, the thread will analyze |
| // and discard the packets, temporarily adding some observer overhead. |
| // Each zone enter/exit pair occupies 16 bytes (two 8-byte packets). |
| #ifndef PROFILER_THREAD_STORAGE |
| #define PROFILER_THREAD_STORAGE 200ULL |
| #endif |
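| // For reference, a rough capacity estimate under the default setting: 200 MiB |
| // per thread holds 200 * 2^20 / 8 = ~26 million 8-byte packets, i.e. about |
| // 13 million zone enter/exit pairs, before analysis is triggered. |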
| |
| #if PROFILER_ENABLED || HWY_IDE |
| |
| #include <stddef.h> |
| #include <stdint.h> |
| #include <stdio.h> |
| #include <string.h> // strcmp |
| |
| #include <atomic> |
| |
| #include "third_party/highway/hwy/aligned_allocator.h" |
| #include "third_party/highway/hwy/cache_control.h" // FlushStream |
| #include "third_party/highway/hwy/contrib/sort/vqsort.h" |
| #include "third_party/highway/hwy/robust_statistics.h" |
| #include "third_party/highway/hwy/timer.h" |
| |
| #define PROFILER_PRINT_OVERHEAD 0 |
| |
| namespace hwy { |
| |
| // Upper bounds for fixed-size data structures (guarded via HWY_DASSERT): |
| |
| // Upper bound on the number of threads that enter at least one zone (threads |
| // that never enter a zone do not count toward this limit). |
| // Memory use is about kMaxThreads * PROFILER_THREAD_STORAGE MiB. |
| // WARNING: a fiber library can spawn hundreds of threads. |
| static constexpr size_t kMaxThreads = 256; |
| |
| static constexpr size_t kMaxDepth = 64; // Maximum nesting of zones. |
| |
| static constexpr size_t kMaxZones = 256; // Total number of zones. |
| |
| #pragma pack(push, 1) |
| |
| // Represents zone entry/exit events. Stores a full-resolution timestamp plus |
| // an offset (representing zone name or identifying exit packets). POD. |
| class Packet { |
| public: |
| // If offsets do not fit, UpdateOrAdd will overrun our heap allocation |
| // (governed by kMaxZones). We have seen multi-megabyte offsets. |
| static constexpr size_t kOffsetBits = 25; |
| static constexpr uint64_t kOffsetBias = 1ULL << (kOffsetBits - 1); |
| |
|   // We need full-resolution timestamps; at an effective rate of 4 GHz, the |
|   // 39 timestamp bits permit zone durations of up to about two minutes (for |
|   // longer durations, split into multiple zones). Wraparound is handled by |
|   // masking. |
| static constexpr size_t kTimestampBits = 64 - kOffsetBits; |
| static constexpr uint64_t kTimestampMask = (1ULL << kTimestampBits) - 1; |
| |
| static Packet Make(const size_t biased_offset, const uint64_t timestamp) { |
| HWY_DASSERT(biased_offset != 0); |
| HWY_DASSERT(biased_offset < (1ULL << kOffsetBits)); |
| |
| Packet packet; |
| packet.bits_ = |
| (biased_offset << kTimestampBits) + (timestamp & kTimestampMask); |
| |
| HWY_DASSERT(packet.BiasedOffset() == biased_offset); |
| HWY_DASSERT(packet.Timestamp() == (timestamp & kTimestampMask)); |
| return packet; |
| } |
| |
| uint64_t Timestamp() const { return bits_ & kTimestampMask; } |
| |
| size_t BiasedOffset() const { |
| const size_t biased_offset = (bits_ >> kTimestampBits); |
| HWY_DASSERT(biased_offset != 0); |
| HWY_DASSERT(biased_offset < (1ULL << kOffsetBits)); |
| return biased_offset; |
| } |
| |
| private: |
| uint64_t bits_; |
| }; |
| static_assert(sizeof(Packet) == 8, "Wrong Packet size"); |
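|  |
| // Resulting bit layout, given the constants above: |
| //   bits_[63:39] = biased_offset (kOffsetBits = 25 bits, nonzero) |
| //   bits_[38: 0] = timestamp & kTimestampMask (kTimestampBits = 39 bits) |
| // For example, Packet::Make(3, t) satisfies BiasedOffset() == 3 and |
| // Timestamp() == (t & Packet::kTimestampMask). |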
| |
| // All translation units must use the same string origin. A static member |
| // function ensures this without requiring a separate .cc file. |
| struct StringOrigin { |
|   // Returns the address of a string literal, biased downward by kOffsetBias |
|   // so that offsets of nearby literals are nonzero. Assuming zone names are |
|   // also literals and stored nearby, we can represent them as offsets from |
|   // this, which is faster to compute than hashes or even a static index. |
| static const char* Get() { |
|     // Chosen such that no zone name is a prefix nor suffix of this string, |
|     // to ensure the compiler/linker does not merge them (a merged name would |
|     // collide with the exit marker: zone exit packets use |
|     // `biased_offset == kOffsetBias`). |
| static const char* string_origin = "__#__"; |
| return string_origin - Packet::kOffsetBias; |
| } |
| }; |
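|  |
| // Sketch of the name <-> offset mapping used by ThreadSpecific below; kName |
| // stands for a hypothetical zone-name string literal (not part of this file): |
| //   const char* origin = StringOrigin::Get();   // "__#__" minus kOffsetBias |
| //   const size_t biased_offset = static_cast<size_t>(kName - origin); |
| //   const char* recovered = origin + biased_offset;  // == kName |
| // Zone exit packets reserve biased_offset == Packet::kOffsetBias. |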
| |
| // Representation of an active zone, stored in a stack. Used to deduct |
| // child duration from the parent's self time. POD. |
| struct Node { |
| Packet packet; |
| uint64_t child_total; |
| }; |
| static_assert(sizeof(Node) == 16, "Wrong Node size"); |
| |
| // Holds statistics for all zones with the same name. POD. |
| struct Accumulator { |
| static constexpr size_t kNumCallBits = 64 - Packet::kOffsetBits; |
| |
| uint64_t BiasedOffset() const { |
| const size_t biased_offset = u128.lo >> kNumCallBits; |
| HWY_DASSERT(biased_offset != 0); |
| HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits)); |
| return biased_offset; |
| } |
| uint64_t NumCalls() const { return u128.lo & ((1ULL << kNumCallBits) - 1); } |
| uint64_t Duration() const { return u128.hi; } |
| |
| void Set(uint64_t biased_offset, uint64_t num_calls, uint64_t duration) { |
| HWY_DASSERT(biased_offset != 0); |
| HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits)); |
| HWY_DASSERT(num_calls < (1ULL << kNumCallBits)); |
| |
| u128.hi = duration; |
| u128.lo = (biased_offset << kNumCallBits) + num_calls; |
| |
| HWY_DASSERT(BiasedOffset() == biased_offset); |
| HWY_DASSERT(NumCalls() == num_calls); |
| HWY_DASSERT(Duration() == duration); |
| } |
| |
| void Add(uint64_t num_calls, uint64_t duration) { |
| const uint64_t biased_offset = BiasedOffset(); |
| (void)biased_offset; |
| |
| u128.lo += num_calls; |
| u128.hi += duration; |
| |
| HWY_DASSERT(biased_offset == BiasedOffset()); |
| } |
| |
| // For fast sorting by duration, which must therefore be the hi element. |
| // lo holds BiasedOffset and NumCalls. |
| uint128_t u128; |
| }; |
| static_assert(sizeof(Accumulator) == 16, "Wrong Accumulator size"); |
| |
| template <typename T> |
| inline T ClampedSubtract(const T minuend, const T subtrahend) { |
| if (subtrahend > minuend) { |
| return 0; |
| } |
| return minuend - subtrahend; |
| } |
| |
| // Per-thread call graph (stack) and Accumulator for each zone. |
| class Results { |
| public: |
| Results() { |
| ZeroBytes(nodes_, sizeof(nodes_)); |
| ZeroBytes(zones_, sizeof(zones_)); |
| } |
| |
| // Used for computing overhead when this thread encounters its first Zone. |
| // This has no observable effect apart from increasing "analyze_elapsed_". |
| uint64_t ZoneDuration(const Packet* packets) { |
| HWY_DASSERT(depth_ == 0); |
| HWY_DASSERT(num_zones_ == 0); |
| AnalyzePackets(packets, 2); |
| const uint64_t duration = zones_[0].Duration(); |
| zones_[0].Set(1, 0, 0); // avoids triggering biased_offset = 0 checks |
| HWY_DASSERT(depth_ == 0); |
| num_zones_ = 0; |
| return duration; |
| } |
| |
| void SetSelfOverhead(const uint64_t self_overhead) { |
| self_overhead_ = self_overhead; |
| } |
| |
| void SetChildOverhead(const uint64_t child_overhead) { |
| child_overhead_ = child_overhead; |
| } |
| |
| // Draw all required information from the packets, which can be discarded |
| // afterwards. Called whenever this thread's storage is full. |
| void AnalyzePackets(const Packet* packets, const size_t num_packets) { |
| const uint64_t t0 = timer::Start(); |
| |
| for (size_t i = 0; i < num_packets; ++i) { |
| const Packet p = packets[i]; |
| // Entering a zone |
| if (p.BiasedOffset() != Packet::kOffsetBias) { |
| HWY_DASSERT(depth_ < kMaxDepth); |
| nodes_[depth_].packet = p; |
| HWY_DASSERT(p.BiasedOffset() != 0); |
| nodes_[depth_].child_total = 0; |
| ++depth_; |
| continue; |
| } |
| |
| HWY_DASSERT(depth_ != 0); |
| const Node& node = nodes_[depth_ - 1]; |
| // Masking correctly handles unsigned wraparound. |
| const uint64_t duration = |
| (p.Timestamp() - node.packet.Timestamp()) & Packet::kTimestampMask; |
| const uint64_t self_duration = ClampedSubtract( |
| duration, self_overhead_ + child_overhead_ + node.child_total); |
| |
| UpdateOrAdd(node.packet.BiasedOffset(), 1, self_duration); |
| --depth_; |
| |
| // Deduct this nested node's time from its parent's self_duration. |
| if (depth_ != 0) { |
| nodes_[depth_ - 1].child_total += duration + child_overhead_; |
| } |
| } |
| |
| const uint64_t t1 = timer::Stop(); |
| analyze_elapsed_ += t1 - t0; |
| } |
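|  |
|   // Worked example of the stack logic above (overheads assumed zero): the |
|   // packet sequence Enter A@0, Enter B@10, Exit@30, Exit@100 yields zone B |
|   // with self time 30 - 10 = 20, and zone A with duration 100 and |
|   // child_total 20, hence self time 80. Nested time is therefore not |
|   // double-counted in the parent. |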
| |
|   // Incorporates results from another thread. Call after all threads have |
|   // exited all zones. |
| void Assimilate(Results& other) { |
| const uint64_t t0 = timer::Start(); |
| HWY_DASSERT(depth_ == 0); |
| HWY_DASSERT(other.depth_ == 0); |
| |
| for (size_t i = 0; i < other.num_zones_; ++i) { |
| const Accumulator& zone = other.zones_[i]; |
| UpdateOrAdd(zone.BiasedOffset(), zone.NumCalls(), zone.Duration()); |
| } |
| other.num_zones_ = 0; |
| const uint64_t t1 = timer::Stop(); |
| analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_; |
| } |
| |
| // Single-threaded. |
| void Print() { |
| const uint64_t t0 = timer::Start(); |
| MergeDuplicates(); |
| |
| // Sort by decreasing total (self) cost. |
| VQSort(&zones_[0].u128, num_zones_, SortDescending()); |
| |
| const double inv_freq = 1.0 / platform::InvariantTicksPerSecond(); |
| |
| const char* string_origin = StringOrigin::Get(); |
| for (size_t i = 0; i < num_zones_; ++i) { |
| const Accumulator& z = zones_[i]; |
| const size_t num_calls = z.NumCalls(); |
| const double duration = static_cast<double>(z.Duration()); |
| printf("%-40s: %10zu x %15.0f = %9.6f\n", |
| string_origin + z.BiasedOffset(), num_calls, duration / num_calls, |
| duration * inv_freq); |
| } |
| num_zones_ = 0; |
| |
| const uint64_t t1 = timer::Stop(); |
| analyze_elapsed_ += t1 - t0; |
| printf("Total analysis [s]: %f\n", |
| static_cast<double>(analyze_elapsed_) * inv_freq); |
| } |
| |
| private: |
| // Updates an existing Accumulator (uniquely identified by biased_offset) or |
| // adds one if this is the first time this thread analyzed that zone. |
| // Uses a self-organizing list data structure, which avoids dynamic memory |
| // allocations and is far faster than unordered_map. |
| void UpdateOrAdd(const size_t biased_offset, const uint64_t num_calls, |
| const uint64_t duration) { |
| HWY_DASSERT(biased_offset != 0); |
| HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits)); |
| |
| // Special case for first zone: (maybe) update, without swapping. |
| if (num_zones_ != 0 && zones_[0].BiasedOffset() == biased_offset) { |
| zones_[0].Add(num_calls, duration); |
| return; |
| } |
| |
| // Look for a zone with the same offset. |
| for (size_t i = 1; i < num_zones_; ++i) { |
| if (zones_[i].BiasedOffset() == biased_offset) { |
| zones_[i].Add(num_calls, duration); |
| // Swap with predecessor (more conservative than move to front, |
| // but at least as successful). |
| const Accumulator prev = zones_[i - 1]; |
| zones_[i - 1] = zones_[i]; |
| zones_[i] = prev; |
| return; |
| } |
| } |
| |
| // Not found; create a new Accumulator. |
| HWY_DASSERT(num_zones_ < kMaxZones); |
| zones_[num_zones_].Set(biased_offset, num_calls, duration); |
| ++num_zones_; |
| } |
| |
| // Each instantiation of a function template seems to get its own copy of |
| // __func__ and GCC doesn't merge them. An N^2 search for duplicates is |
| // acceptable because we only expect a few dozen zones. |
| void MergeDuplicates() { |
| const char* string_origin = StringOrigin::Get(); |
| for (size_t i = 0; i < num_zones_; ++i) { |
| const size_t biased_offset = zones_[i].BiasedOffset(); |
| const char* name = string_origin + biased_offset; |
|       // Keep num_calls in a separate variable so the call counts of |
|       // duplicates can be summed and re-packed (via Set below) without |
|       // disturbing this zone's biased_offset. |
| uint64_t num_calls = zones_[i].NumCalls(); |
| |
| // Add any subsequent duplicates to num_calls and total_duration. |
| for (size_t j = i + 1; j < num_zones_;) { |
| if (!strcmp(name, string_origin + zones_[j].BiasedOffset())) { |
| num_calls += zones_[j].NumCalls(); |
| zones_[i].Add(0, zones_[j].Duration()); |
|           // j was the last zone: drop it (already merged above), done. |
|           if (j == num_zones_ - 1) { |
|             --num_zones_; |
|             break; |
|           } |
| // Replace current zone with the last one, and check it next. |
| zones_[j] = zones_[--num_zones_]; |
| } else { // Name differed, try next Accumulator. |
| ++j; |
| } |
| } |
| |
| // Re-pack regardless of whether any duplicates were found. |
| zones_[i].Set(biased_offset, num_calls, zones_[i].Duration()); |
| } |
| } |
| |
| uint64_t analyze_elapsed_ = 0; |
| uint64_t self_overhead_ = 0; |
| uint64_t child_overhead_ = 0; |
| |
| size_t depth_ = 0; // Number of active zones. |
| size_t num_zones_ = 0; // Number of retired zones. |
| |
| alignas(HWY_ALIGNMENT) Node nodes_[kMaxDepth]; // Stack |
| alignas(HWY_ALIGNMENT) Accumulator zones_[kMaxZones]; // Self-organizing list |
| }; |
| |
| // Per-thread packet storage, dynamically allocated. |
| class ThreadSpecific { |
| static constexpr size_t kBufferCapacity = HWY_ALIGNMENT / sizeof(Packet); |
| |
| public: |
| // "name" is used to sanity-check offsets fit in kOffsetBits. |
| explicit ThreadSpecific(const char* name) |
| : max_packets_((PROFILER_THREAD_STORAGE << 20) / sizeof(Packet)), |
| packets_(AllocateAligned<Packet>(max_packets_)), |
| num_packets_(0), |
| string_origin_(StringOrigin::Get()) { |
| // Even in optimized builds, verify that this zone's name offset fits |
| // within the allotted space. If not, UpdateOrAdd is likely to overrun |
| // zones_[]. Checking here on the cold path (only reached once per thread) |
| // is cheap, but it only covers one zone. |
| const size_t biased_offset = name - string_origin_; |
| HWY_ASSERT(biased_offset < (1ULL << Packet::kOffsetBits)); |
| } |
| |
| // Depends on Zone => defined below. |
| void ComputeOverhead(); |
| |
| void WriteEntry(const char* name, const uint64_t timestamp) { |
| HWY_DASSERT(name >= string_origin_); |
| const size_t biased_offset = static_cast<size_t>(name - string_origin_); |
| Write(Packet::Make(biased_offset, timestamp)); |
| } |
| |
| void WriteExit(const uint64_t timestamp) { |
| const size_t biased_offset = Packet::kOffsetBias; |
| Write(Packet::Make(biased_offset, timestamp)); |
| } |
| |
| void AnalyzeRemainingPackets() { |
| // Ensures prior weakly-ordered streaming stores are globally visible. |
| FlushStream(); |
| |
| // Storage full => empty it. |
| if (num_packets_ + buffer_size_ > max_packets_) { |
| results_.AnalyzePackets(packets_.get(), num_packets_); |
| num_packets_ = 0; |
| } |
| CopyBytes(buffer_, packets_.get() + num_packets_, |
| buffer_size_ * sizeof(Packet)); |
| num_packets_ += buffer_size_; |
| |
| results_.AnalyzePackets(packets_.get(), num_packets_); |
| num_packets_ = 0; |
| } |
| |
| Results& GetResults() { return results_; } |
| |
| private: |
|   // Overwrites "to" with streaming (non-temporal) stores, which bypass the |
|   // cache and avoid a read-for-ownership of the destination line. Both |
|   // pointers must be HWY_ALIGNMENT-aligned. |
| static void StreamCacheLine(const uint64_t* HWY_RESTRICT from, |
| uint64_t* HWY_RESTRICT to) { |
| #if HWY_COMPILER_CLANG |
| for (size_t i = 0; i < HWY_ALIGNMENT / sizeof(uint64_t); ++i) { |
| __builtin_nontemporal_store(from[i], to + i); |
| } |
| #else |
| hwy::CopyBytes(from, to, HWY_ALIGNMENT); |
| #endif |
| } |
| |
| // Write packet to buffer/storage, emptying them as needed. |
| void Write(const Packet packet) { |
| // Buffer full => copy to storage. |
| if (buffer_size_ == kBufferCapacity) { |
| // Storage full => empty it. |
| if (num_packets_ + kBufferCapacity > max_packets_) { |
| results_.AnalyzePackets(packets_.get(), num_packets_); |
| num_packets_ = 0; |
| } |
|         // This buffering halves observer overhead and decreases the overall |
|         // runtime by about 3%. Casting is safe because Packet's only data |
|         // member is a uint64_t. |
| StreamCacheLine( |
| reinterpret_cast<const uint64_t*>(buffer_), |
| reinterpret_cast<uint64_t*>(packets_.get() + num_packets_)); |
| num_packets_ += kBufferCapacity; |
| buffer_size_ = 0; |
| } |
| buffer_[buffer_size_] = packet; |
| ++buffer_size_; |
| } |
| |
| // Write-combining buffer to avoid cache pollution. Must be the first |
| // non-static member to ensure cache-line alignment. |
| Packet buffer_[kBufferCapacity]; |
| size_t buffer_size_ = 0; |
| |
| const size_t max_packets_; |
| // Contiguous storage for zone enter/exit packets. |
| AlignedFreeUniquePtr<Packet[]> packets_; |
| size_t num_packets_; |
| // Cached here because we already read this cache line on zone entry/exit. |
| const char* string_origin_; |
| Results results_; |
| }; |
| |
| class ThreadList { |
| public: |
| // Called from any thread. |
| ThreadSpecific* Add(const char* name) { |
| const size_t index = num_threads_.fetch_add(1, std::memory_order_relaxed); |
| HWY_DASSERT(index < kMaxThreads); |
| |
| ThreadSpecific* ts = MakeUniqueAligned<ThreadSpecific>(name).release(); |
| threads_[index].store(ts, std::memory_order_release); |
| return ts; |
| } |
| |
| // Single-threaded. |
| void PrintResults() { |
| const auto acq = std::memory_order_acquire; |
| const size_t num_threads = num_threads_.load(acq); |
| |
|     if (num_threads == 0) return;  // No thread ever entered a zone. |
|  |
|     ThreadSpecific* main = threads_[0].load(acq); |
|     main->AnalyzeRemainingPackets(); |
|  |
|     // Merge the other threads' results into the main thread's. |
|     for (size_t i = 1; i < num_threads; ++i) { |
|       ThreadSpecific* ts = threads_[i].load(acq); |
|       ts->AnalyzeRemainingPackets(); |
|       main->GetResults().Assimilate(ts->GetResults()); |
|     } |
|  |
|     main->GetResults().Print(); |
| } |
| |
| private: |
| // Owning pointers. |
| alignas(64) std::atomic<ThreadSpecific*> threads_[kMaxThreads]; |
| std::atomic<size_t> num_threads_{0}; |
| }; |
| |
| // RAII zone enter/exit recorder constructed by the PROFILER_ZONE and |
| // PROFILER_FUNC macros; also responsible for initializing ThreadSpecific. |
| class Zone { |
| public: |
| // "name" must be a string literal (see StringOrigin::Get). |
| HWY_NOINLINE explicit Zone(const char* name) { |
| HWY_FENCE; |
| ThreadSpecific* HWY_RESTRICT thread_specific = StaticThreadSpecific(); |
| if (HWY_UNLIKELY(thread_specific == nullptr)) { |
| // Ensure the CPU supports our timer. |
| char cpu[100]; |
| if (!platform::HaveTimerStop(cpu)) { |
| HWY_ABORT("CPU %s is too old for PROFILER_ENABLED=1, exiting", cpu); |
| } |
| |
| thread_specific = StaticThreadSpecific() = Threads().Add(name); |
| // Must happen after setting StaticThreadSpecific, because ComputeOverhead |
| // also calls Zone(). |
| thread_specific->ComputeOverhead(); |
| } |
| |
| // (Capture timestamp ASAP, not inside WriteEntry.) |
| HWY_FENCE; |
| const uint64_t timestamp = timer::Start(); |
| thread_specific->WriteEntry(name, timestamp); |
| } |
| |
| HWY_NOINLINE ~Zone() { |
| HWY_FENCE; |
| const uint64_t timestamp = timer::Stop(); |
| StaticThreadSpecific()->WriteExit(timestamp); |
| HWY_FENCE; |
| } |
| |
| // Call exactly once after all threads have exited all zones. |
| static void PrintResults() { Threads().PrintResults(); } |
| |
| private: |
| // Returns reference to the thread's ThreadSpecific pointer (initially null). |
| // Function-local static avoids needing a separate definition. |
| static ThreadSpecific*& StaticThreadSpecific() { |
| static thread_local ThreadSpecific* thread_specific; |
| return thread_specific; |
| } |
| |
| // Returns the singleton ThreadList. Non time-critical. |
| static ThreadList& Threads() { |
| static ThreadList threads_; |
| return threads_; |
| } |
| }; |
| |
| // Creates a zone starting from here until the end of the current scope. |
| // Timestamps will be recorded when entering and exiting the zone. |
| // "name" must be a string literal, which is ensured by merging with "". |
| #define PROFILER_ZONE(name) \ |
| HWY_FENCE; \ |
| const hwy::Zone zone("" name); \ |
| HWY_FENCE |
| |
| // Creates a zone for an entire function (when placed at its beginning). |
| // Shorter/more convenient than PROFILER_ZONE. |
| #define PROFILER_FUNC \ |
| HWY_FENCE; \ |
| const hwy::Zone zone(__func__); \ |
| HWY_FENCE |
| |
| #define PROFILER_PRINT_RESULTS hwy::Zone::PrintResults |
| |
| inline void ThreadSpecific::ComputeOverhead() { |
|   // Self-overhead: cycles elapsed between a zone's start and end timestamps |
|   // when the zone body is empty. Even with frequency throttling disabled, |
|   // this has a multimodal distribution, with modes including 32, 34, 48, 52, |
|   // 59 and 62 cycles. |
| uint64_t self_overhead; |
| { |
| const size_t kNumSamples = 32; |
| uint32_t samples[kNumSamples]; |
| for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { |
| const size_t kNumDurations = 1024; |
| uint32_t durations[kNumDurations]; |
| |
| for (size_t idx_duration = 0; idx_duration < kNumDurations; |
| ++idx_duration) { |
| { |
| PROFILER_ZONE("Dummy Zone (never shown)"); |
| } |
| const uint64_t duration = results_.ZoneDuration(buffer_); |
| buffer_size_ = 0; |
| durations[idx_duration] = static_cast<uint32_t>(duration); |
| HWY_DASSERT(num_packets_ == 0); |
| } |
| robust_statistics::CountingSort(durations, kNumDurations); |
| samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations); |
| } |
| // Median. |
| robust_statistics::CountingSort(samples, kNumSamples); |
| self_overhead = samples[kNumSamples / 2]; |
| if (PROFILER_PRINT_OVERHEAD) { |
| printf("Overhead: %.0f\n", static_cast<double>(self_overhead)); |
| } |
| results_.SetSelfOverhead(self_overhead); |
| } |
| |
|   // Child overhead: cycles spent in zone machinery before the start timestamp |
|   // is captured and after the end timestamp; this time shows up in the |
|   // enclosing (parent) zone and is deducted during analysis. |
| const size_t kNumSamples = 32; |
| uint32_t samples[kNumSamples]; |
| for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { |
| const size_t kNumDurations = 16; |
| uint32_t durations[kNumDurations]; |
| for (size_t idx_duration = 0; idx_duration < kNumDurations; |
| ++idx_duration) { |
| const size_t kReps = 10000; |
| // Analysis time should not be included => must fit within buffer. |
| HWY_DASSERT(kReps * 2 < max_packets_); |
| std::atomic_thread_fence(std::memory_order_seq_cst); |
| const uint64_t t0 = timer::Start(); |
| for (size_t i = 0; i < kReps; ++i) { |
| PROFILER_ZONE("Dummy"); |
| } |
| FlushStream(); |
| const uint64_t t1 = timer::Stop(); |
| HWY_DASSERT(num_packets_ + buffer_size_ == kReps * 2); |
| buffer_size_ = 0; |
| num_packets_ = 0; |
| const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps; |
| durations[idx_duration] = |
| static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead)); |
| } |
| robust_statistics::CountingSort(durations, kNumDurations); |
| samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations); |
| } |
| robust_statistics::CountingSort(samples, kNumSamples); |
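|   // 90th percentile of the sorted samples. |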
| const uint64_t child_overhead = samples[9 * kNumSamples / 10]; |
| if (PROFILER_PRINT_OVERHEAD) { |
| printf("Child overhead: %.0f\n", static_cast<double>(child_overhead)); |
| } |
| results_.SetChildOverhead(child_overhead); |
| } |
| |
| #pragma pack(pop) |
| |
| } // namespace hwy |
| |
| #endif // PROFILER_ENABLED || HWY_IDE |
| |
| #if !PROFILER_ENABLED && !HWY_IDE |
| #define PROFILER_ZONE(name) |
| #define PROFILER_FUNC |
| #define PROFILER_PRINT_RESULTS() |
| #endif |
| |
| #endif // HIGHWAY_HWY_PROFILER_H_ |