| // Copyright 2017 Google Inc. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #ifndef HIGHWAY_HWY_PROFILER_H_ |
| #define HIGHWAY_HWY_PROFILER_H_ |
| |
| // High precision, low overhead time measurements. Reports exact call counts |
| // and total elapsed time for user-defined 'zones' (code regions, i.e. C++ |
| // scopes). |
| // |
| // Uses RAII to capture begin/end timestamps, with user-specified zone names: |
| // { PROFILER_ZONE("name"); /*code*/ } or |
| // the name of the current function: |
| // void FuncToMeasure() { PROFILER_FUNC; /*code*/ }. |
| // |
| // After all threads have exited all zones, invoke PROFILER_PRINT_RESULTS() to |
| // print each zone's call count, average self-duration [CPU cycles] and total |
| // self-duration [seconds] to stdout, sorted in descending order of total |
| // duration. |
| // |
| // The binary MUST be built with --dynamic_mode=off because we rely on the |
| // string literals (zone names) residing in nearby data segments; if not, an |
| // assertion will likely fail. |
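| // |
| // Illustrative end-to-end sketch; the function and zone names below are |
| // placeholders, not part of this header: |
| // |
| //   void DoWork() { |
| //     PROFILER_FUNC;                // zone named "DoWork" |
| //     { |
| //       PROFILER_ZONE("Checksum");  // nested zone; its time is deducted |
| //       /* compute checksum */      // from DoWork's self time. |
| //     } |
| //   } |
| // |
| //   int main() { |
| //     DoWork(); |
| //     PROFILER_PRINT_RESULTS();  // once, after all threads exited all zones |
| //     return 0; |
| //   } |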
| |
| #include "third_party/highway/hwy/base.h" |
| |
| // Configuration settings: |
| |
| // If zero, this file has no effect and no measurements will be recorded. |
| #ifndef PROFILER_ENABLED |
| #define PROFILER_ENABLED 0 |
| #endif |
| |
| // How many mebibytes to allocate (if PROFILER_ENABLED) per thread that |
| // enters at least one zone. Once this buffer is full, the thread will analyze |
| // and discard the packets, temporarily adding some observer overhead. |
| // Each zone enter/exit pair occupies 16 bytes (two 8-byte packets). |
| #ifndef PROFILER_THREAD_STORAGE |
| #define PROFILER_THREAD_STORAGE 200ULL |
| #endif |
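| // For reference, a rough capacity estimate under the default setting: 200 MiB |
| // per thread holds 200 * 2^20 / 8 = ~26 million 8-byte packets, i.e. about |
| // 13 million zone enter/exit pairs, before analysis is triggered. |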
| |
| #if PROFILER_ENABLED || HWY_IDE |
| |
| #include <stddef.h> |
| #include <stdint.h> |
| #include <stdio.h> |
| #include <string.h> // strcmp |
| |
| #include <atomic> |
| |
| #include "third_party/highway/hwy/aligned_allocator.h" |
| #include "third_party/highway/hwy/cache_control.h" // FlushStream |
| #include "third_party/highway/hwy/contrib/sort/vqsort.h" |
| #include "third_party/highway/hwy/robust_statistics.h" |
| #include "third_party/highway/hwy/timer.h" |
| |
| #define PROFILER_PRINT_OVERHEAD 0 |
| |
| namespace hwy { |
| |
| // Upper bounds for fixed-size data structures (guarded via HWY_DASSERT): |
| |
| // Upper bound on the number of threads that enter at least one zone (threads |
| // that never enter a zone do not count toward this limit). |
| // Memory use is about kMaxThreads * PROFILER_THREAD_STORAGE MiB. |
| // WARNING: a fiber library can spawn hundreds of threads. |
| static constexpr size_t kMaxThreads = 256; |
| |
| static constexpr size_t kMaxDepth = 64; // Maximum nesting of zones. |
| |
| static constexpr size_t kMaxZones = 256; // Total number of zones. |
| |
| #pragma pack(push, 1) |
| |
| // Represents zone entry/exit events. Stores a full-resolution timestamp plus |
| // an offset (representing zone name or identifying exit packets). POD. |
| class Packet { |
| public: |
| // If offsets do not fit, UpdateOrAdd will overrun our heap allocation |
| // (governed by kMaxZones). We have seen multi-megabyte offsets. |
| static constexpr size_t kOffsetBits = 25; |
| static constexpr uint64_t kOffsetBias = 1ULL << (kOffsetBits - 1); |
| |
|   // We need full-resolution timestamps; at an effective rate of 4 GHz, the |
|   // 39 timestamp bits permit zone durations of up to about two minutes (for |
|   // longer durations, split into multiple zones). Wraparound is handled by |
|   // masking. |
| static constexpr size_t kTimestampBits = 64 - kOffsetBits; |
| static constexpr uint64_t kTimestampMask = (1ULL << kTimestampBits) - 1; |
| |
| static Packet Make(const size_t biased_offset, const uint64_t timestamp) { |
| HWY_DASSERT(biased_offset != 0); |
| HWY_DASSERT(biased_offset < (1ULL << kOffsetBits)); |
| |
| Packet packet; |
| packet.bits_ = |
| (biased_offset << kTimestampBits) + (timestamp & kTimestampMask); |
| |
| HWY_DASSERT(packet.BiasedOffset() == biased_offset); |
| HWY_DASSERT(packet.Timestamp() == (timestamp & kTimestampMask)); |
| return packet; |
| } |
| |
| uint64_t Timestamp() const { return bits_ & kTimestampMask; } |
| |
| size_t BiasedOffset() const { |
| const size_t biased_offset = (bits_ >> kTimestampBits); |
| HWY_DASSERT(biased_offset != 0); |
| HWY_DASSERT(biased_offset < (1ULL << kOffsetBits)); |
| return biased_offset; |
| } |
| |
| private: |
| uint64_t bits_; |
| }; |
| static_assert(sizeof(Packet) == 8, "Wrong Packet size"); |
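|  |
| // Resulting bit layout, given the constants above: |
| //   bits_[63:39] = biased_offset (kOffsetBits = 25 bits, nonzero) |
| //   bits_[38: 0] = timestamp & kTimestampMask (kTimestampBits = 39 bits) |
| // For example, Packet::Make(3, t) satisfies BiasedOffset() == 3 and |
| // Timestamp() == (t & Packet::kTimestampMask). |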
| |
| // All translation units must use the same string origin. A static member |
| // function ensures this without requiring a separate .cc file. |
| struct StringOrigin { |
|   // Returns the address of a string literal, biased downward by kOffsetBias |
|   // so that offsets of nearby literals are nonzero. Assuming zone names are |
|   // also literals and stored nearby, we can represent them as offsets from |
|   // this, which is faster to compute than hashes or even a static index. |
| static const char* Get() { |
|     // Chosen such that no zone name is a prefix nor suffix of this string, |
|     // to ensure the compiler/linker does not merge them (a merged name would |
|     // collide with the exit marker: zone exit packets use |
|     // `biased_offset == kOffsetBias`). |
| static const char* string_origin = "__#__"; |
| return string_origin - Packet::kOffsetBias; |
| } |
| }; |
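|  |
| // Sketch of the name <-> offset mapping used by ThreadSpecific below; kName |
| // stands for a hypothetical zone-name string literal (not part of this file): |
| //   const char* origin = StringOrigin::Get();   // "__#__" minus kOffsetBias |
| //   const size_t biased_offset = static_cast<size_t>(kName - origin); |
| //   const char* recovered = origin + biased_offset;  // == kName |
| // Zone exit packets reserve biased_offset == Packet::kOffsetBias. |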
| |
| // Representation of an active zone, stored in a stack. Used to deduct |
| // child duration from the parent's self time. POD. |
| struct Node { |
| Packet packet; |
| uint64_t child_total; |
| }; |
| static_assert(sizeof(Node) == 16, "Wrong Node size"); |
| |
| // Holds statistics for all zones with the same name. POD. |
| struct Accumulator { |
| static constexpr size_t kNumCallBits = 64 - Packet::kOffsetBits; |
| |
| uint64_t BiasedOffset() const { |
| const size_t biased_offset = u128.lo >> kNumCallBits; |
| HWY_DASSERT(biased_offset != 0); |
| HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits)); |
| return biased_offset; |
| } |
| uint64_t NumCalls() const { return u128.lo & ((1ULL << kNumCallBits) - 1); } |
| uint64_t Duration() const { return u128.hi; } |
| |
| void Set(uint64_t biased_offset, uint64_t num_calls, uint64_t duration) { |
| HWY_DASSERT(biased_offset != 0); |
| HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits)); |
| HWY_DASSERT(num_calls < (1ULL << kNumCallBits)); |
| |
| u128.hi = duration; |
| u128.lo = (biased_offset << kNumCallBits) + num_calls; |
| |
| HWY_DASSERT(BiasedOffset() == biased_offset); |
| HWY_DASSERT(NumCalls() == num_calls); |
| HWY_DASSERT(Duration() == duration); |
| } |
| |
| void Add(uint64_t num_calls, uint64_t duration) { |
| const uint64_t biased_offset = BiasedOffset(); |
| (void)biased_offset; |
| |
| u128.lo += num_calls; |
| u128.hi += duration; |
| |
| HWY_DASSERT(biased_offset == BiasedOffset()); |
| } |
| |
| // For fast sorting by duration, which must therefore be the hi element. |
| // lo holds BiasedOffset and NumCalls. |
| uint128_t u128; |
| }; |
| static_assert(sizeof(Accumulator) == 16, "Wrong Accumulator size"); |
| |
| template <typename T> |
| inline T ClampedSubtract(const T minuend, const T subtrahend) { |
| if (subtrahend > minuend) { |
| return 0; |
| } |
| return minuend - subtrahend; |
| } |
| |
| // Per-thread call graph (stack) and Accumulator for each zone. |
| class Results { |
| public: |
| Results() { |
| ZeroBytes(nodes_, sizeof(nodes_)); |
| ZeroBytes(zones_, sizeof(zones_)); |
| } |
| |
| // Used for computing overhead when this thread encounters its first Zone. |
| // This has no observable effect apart from increasing "analyze_elapsed_". |
| uint64_t ZoneDuration(const Packet* packets) { |
| HWY_DASSERT(depth_ == 0); |
| HWY_DASSERT(num_zones_ == 0); |
| AnalyzePackets(packets, 2); |
| const uint64_t duration = zones_[0].Duration(); |
| zones_[0].Set(1, 0, 0); // avoids triggering biased_offset = 0 checks |
| HWY_DASSERT(depth_ == 0); |
| num_zones_ = 0; |
| return duration; |
| } |
| |
| void SetSelfOverhead(const uint64_t self_overhead) { |
| self_overhead_ = self_overhead; |
| } |
| |
| void SetChildOverhead(const uint64_t child_overhead) { |
| child_overhead_ = child_overhead; |
| } |
| |
| // Draw all required information from the packets, which can be discarded |
| // afterwards. Called whenever this thread's storage is full. |
| void AnalyzePackets(const Packet* packets, const size_t num_packets) { |
| const uint64_t t0 = timer::Start(); |
| |
| for (size_t i = 0; i < num_packets; ++i) { |
| const Packet p = packets[i]; |
| // Entering a zone |
| if (p.BiasedOffset() != Packet::kOffsetBias) { |
| HWY_DASSERT(depth_ < kMaxDepth); |
| nodes_[depth_].packet = p; |
| HWY_DASSERT(p.BiasedOffset() != 0); |
| nodes_[depth_].child_total = 0; |
| ++depth_; |
| continue; |
| } |
| |
| HWY_DASSERT(depth_ != 0); |
| const Node& node = nodes_[depth_ - 1]; |
| // Masking correctly handles unsigned wraparound. |
| const uint64_t duration = |
| (p.Timestamp() - node.packet.Timestamp()) & Packet::kTimestampMask; |
| const uint64_t self_duration = ClampedSubtract( |
| duration, self_overhead_ + child_overhead_ + node.child_total); |
| |
| UpdateOrAdd(node.packet.BiasedOffset(), 1, self_duration); |
| --depth_; |
| |
| // Deduct this nested node's time from its parent's self_duration. |
| if (depth_ != 0) { |
| nodes_[depth_ - 1].child_total += duration + child_overhead_; |
| } |
| } |
| |
| const uint64_t t1 = timer::Stop(); |
| analyze_elapsed_ += t1 - t0; |
| } |
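|  |
|   // Worked example of the stack logic above (overheads assumed zero): the |
|   // packet sequence Enter A@0, Enter B@10, Exit@30, Exit@100 yields zone B |
|   // with self time 30 - 10 = 20, and zone A with duration 100 and |
|   // child_total 20, hence self time 80. Nested time is therefore not |
|   // double-counted in the parent. |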
| |
|   // Incorporates results from another thread. Call after all threads have |
|   // exited all zones. |
| void Assimilate(Results& other) { |
| const uint64_t t0 = timer::Start(); |
| HWY_DASSERT(depth_ == 0); |
| HWY_DASSERT(other.depth_ == 0); |
| |
| for (size_t i = 0; i < other.num_zones_; ++i) { |
| const Accumulator& zone = other.zones_[i]; |
| UpdateOrAdd(zone.BiasedOffset(), zone.NumCalls(), zone.Duration()); |
| } |
| other.num_zones_ = 0; |
| const uint64_t t1 = timer::Stop(); |
| analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_; |
| } |
| |
| // Single-threaded. |
| void Print() { |
| const uint64_t t0 = timer::Start(); |
| MergeDuplicates(); |
| |
| // Sort by decreasing total (self) cost. |
| VQSort(&zones_[0].u128, num_zones_, SortDescending()); |
| |
| const double inv_freq = 1.0 / platform::InvariantTicksPerSecond(); |
| |
| const char* string_origin = StringOrigin::Get(); |
| for (size_t i = 0; i < num_zones_; ++i) { |
| const Accumulator& z = zones_[i]; |
| const size_t num_calls = z.NumCalls(); |
| const double duration = static_cast<double>(z.Duration()); |
| printf("%-40s: %10zu x %15.0f = %9.6f\n", |
| string_origin + z.BiasedOffset(), num_calls, duration / num_calls, |
| duration * inv_freq); |
| } |
| num_zones_ = 0; |
| |
| const uint64_t t1 = timer::Stop(); |
| analyze_elapsed_ += t1 - t0; |
| printf("Total analysis [s]: %f\n", |
| static_cast<double>(analyze_elapsed_) * inv_freq); |
| } |
| |
| private: |
| // Updates an existing Accumulator (uniquely identified by biased_offset) or |
| // adds one if this is the first time this thread analyzed that zone. |
| // Uses a self-organizing list data structure, which avoids dynamic memory |
| // allocations and is far faster than unordered_map. |
| void UpdateOrAdd(const size_t biased_offset, const uint64_t num_calls, |
| const uint64_t duration) { |
| HWY_DASSERT(biased_offset != 0); |
| HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits)); |
| |
| // Special case for first zone: (maybe) update, without swapping. |
| if (num_zones_ != 0 && zones_[0].BiasedOffset() == biased_offset) { |
| zones_[0].Add(num_calls, duration); |
| return; |
| } |
| |
| // Look for a zone with the same offset. |
| for (size_t i = 1; i < num_zones_; ++i) { |
| if (zones_[i].BiasedOffset() == biased_offset) { |
| zones_[i].Add(num_calls, duration); |
| // Swap with predecessor (more conservative than move to front, |
| // but at least as successful). |
| const Accumulator prev = zones_[i - 1]; |
| zones_[i - 1] = zones_[i]; |
| zones_[i] = prev; |
| return; |
| } |
| } |
| |
| // Not found; create a new Accumulator. |
| HWY_DASSERT(num_zones_ < kMaxZones); |
| zones_[num_zones_].Set(biased_offset, num_calls, duration); |
| ++num_zones_; |
| } |
| |
| // Each instantiation of a function template seems to get its own copy of |
| // __func__ and GCC doesn't merge them. An N^2 search for duplicates is |
| // acceptable because we only expect a few dozen zones. |
| void MergeDuplicates() { |
| const char* string_origin = StringOrigin::Get(); |
| for (size_t i = 0; i < num_zones_; ++i) { |
| const size_t biased_offset = zones_[i].BiasedOffset(); |
| const char* name = string_origin + biased_offset; |
|       // Keep num_calls in a separate variable so the call counts of |
|       // duplicates can be summed and re-packed (via Set below) without |
|       // disturbing this zone's biased_offset. |
| uint64_t num_calls = zones_[i].NumCalls(); |
| |
| // Add any subsequent duplicates to num_calls and total_duration. |
| for (size_t j = i + 1; j < num_zones_;) { |
| if (!strcmp(name, string_origin + zones_[j].BiasedOffset())) { |
| num_calls += zones_[j].NumCalls(); |
| zones_[i].Add(0, zones_[j].Duration()); |
|           // j was the last zone: drop it (already merged above), done. |
|           if (j == num_zones_ - 1) { |
|             --num_zones_; |
|             break; |
|           } |
| // Replace current zone with the last one, and check it next. |
| zones_[j] = zones_[--num_zones_]; |
| } else { // Name differed, try next Accumulator. |
| ++j; |
| } |
| } |
| |
| // Re-pack regardless of whether any duplicates were found. |
| zones_[i].Set(biased_offset, num_calls, zones_[i].Duration()); |
| } |
| } |
| |
| uint64_t analyze_elapsed_ = 0; |
| uint64_t self_overhead_ = 0; |
| uint64_t child_overhead_ = 0; |
| |
| size_t depth_ = 0; // Number of active zones. |
| size_t num_zones_ = 0; // Number of retired zones. |
| |
| alignas(HWY_ALIGNMENT) Node nodes_[kMaxDepth]; // Stack |
| alignas(HWY_ALIGNMENT) Accumulator zones_[kMaxZones]; // Self-organizing list |
| }; |
| |
| // Per-thread packet storage, dynamically allocated. |
| class ThreadSpecific { |
| static constexpr size_t kBufferCapacity = HWY_ALIGNMENT / sizeof(Packet); |
| |
| public: |
| // "name" is used to sanity-check offsets fit in kOffsetBits. |
| explicit ThreadSpecific(const char* name) |
| : max_packets_((PROFILER_THREAD_STORAGE << 20) / sizeof(Packet)), |
| packets_(AllocateAligned<Packet>(max_packets_)), |
| num_packets_(0), |
| string_origin_(StringOrigin::Get()) { |
| // Even in optimized builds, verify that this zone's name offset fits |
| // within the allotted space. If not, UpdateOrAdd is likely to overrun |
| // zones_[]. Checking here on the cold path (only reached once per thread) |
| // is cheap, but it only covers one zone. |
| const size_t biased_offset = name - string_origin_; |
| HWY_ASSERT(biased_offset < (1ULL << Packet::kOffsetBits)); |
| } |
| |
| // Depends on Zone => defined below. |
| void ComputeOverhead(); |
| |
| void WriteEntry(const char* name, const uint64_t timestamp) { |
| HWY_DASSERT(name >= string_origin_); |
| const size_t biased_offset = static_cast<size_t>(name - string_origin_); |
| Write(Packet::Make(biased_offset, timestamp)); |
| } |
| |
| void WriteExit(const uint64_t timestamp) { |
| const size_t biased_offset = Packet::kOffsetBias; |
| Write(Packet::Make(biased_offset, timestamp)); |
| } |
| |
| void AnalyzeRemainingPackets() { |
| // Ensures prior weakly-ordered streaming stores are globally visible. |
| FlushStream(); |
| |
| // Storage full => empty it. |
| if (num_packets_ + buffer_size_ > max_packets_) { |
| results_.AnalyzePackets(packets_.get(), num_packets_); |
| num_packets_ = 0; |
| } |
| CopyBytes(buffer_, packets_.get() + num_packets_, |
| buffer_size_ * sizeof(Packet)); |
| num_packets_ += buffer_size_; |
| |
| results_.AnalyzePackets(packets_.get(), num_packets_); |
| num_packets_ = 0; |
| } |
| |
| Results& GetResults() { return results_; } |
| |
| private: |
|   // Overwrites "to" with streaming (non-temporal) stores, which bypass the |
|   // cache and avoid a read-for-ownership of the destination line. Both |
|   // pointers must be HWY_ALIGNMENT-aligned. |
| static void StreamCacheLine(const uint64_t* HWY_RESTRICT from, |
| uint64_t* HWY_RESTRICT to) { |
| #if HWY_COMPILER_CLANG |
| for (size_t i = 0; i < HWY_ALIGNMENT / sizeof(uint64_t); ++i) { |
| __builtin_nontemporal_store(from[i], to + i); |
| } |
| #else |
| hwy::CopyBytes(from, to, HWY_ALIGNMENT); |
| #endif |
| } |
| |
| // Write packet to buffer/storage, emptying them as needed. |
| void Write(const Packet packet) { |
| // Buffer full => copy to storage. |
| if (buffer_size_ == kBufferCapacity) { |
| // Storage full => empty it. |
| if (num_packets_ + kBufferCapacity > max_packets_) { |
| results_.AnalyzePackets(packets_.get(), num_packets_); |
| num_packets_ = 0; |
| } |
|         // This buffering halves observer overhead and decreases the overall |
|         // runtime by about 3%. Casting is safe because Packet's only data |
|         // member is a uint64_t. |
| StreamCacheLine( |
| reinterpret_cast<const uint64_t*>(buffer_), |
| reinterpret_cast<uint64_t*>(packets_.get() + num_packets_)); |
| num_packets_ += kBufferCapacity; |
| buffer_size_ = 0; |
| } |
| buffer_[buffer_size_] = packet; |
| ++buffer_size_; |
| } |
| |
| // Write-combining buffer to avoid cache pollution. Must be the first |
| // non-static member to ensure cache-line alignment. |
| Packet buffer_[kBufferCapacity]; |
| size_t buffer_size_ = 0; |
| |
| const size_t max_packets_; |
| // Contiguous storage for zone enter/exit packets. |
| AlignedFreeUniquePtr<Packet[]> packets_; |
| size_t num_packets_; |
| // Cached here because we already read this cache line on zone entry/exit. |
| const char* string_origin_; |
| Results results_; |
| }; |
| |
| class ThreadList { |
| public: |
| // Called from any thread. |
| ThreadSpecific* Add(const char* name) { |
| const size_t index = num_threads_.fetch_add(1, std::memory_order_relaxed); |
| HWY_DASSERT(index < kMaxThreads); |
| |
| ThreadSpecific* ts = MakeUniqueAligned<ThreadSpecific>(name).release(); |
| threads_[index].store(ts, std::memory_order_release); |
| return ts; |
| } |
| |
| // Single-threaded. |
| void PrintResults() { |
| const auto acq = std::memory_order_acquire; |
| const size_t num_threads = num_threads_.load(acq); |
| |
|     if (num_threads == 0) return;  // No thread ever entered a zone. |
|  |
|     ThreadSpecific* main = threads_[0].load(acq); |
|     main->AnalyzeRemainingPackets(); |
|  |
|     // Merge the other threads' results into the main thread's. |
|     for (size_t i = 1; i < num_threads; ++i) { |
|       ThreadSpecific* ts = threads_[i].load(acq); |
|       ts->AnalyzeRemainingPackets(); |
|       main->GetResults().Assimilate(ts->GetResults()); |
|     } |
|  |
|     main->GetResults().Print(); |
| } |
| |
| private: |
| // Owning pointers. |
| alignas(64) std::atomic<ThreadSpecific*> threads_[kMaxThreads]; |
| std::atomic<size_t> num_threads_{0}; |
| }; |
| |
| // RAII zone enter/exit recorder constructed by the PROFILER_ZONE and |
| // PROFILER_FUNC macros; also responsible for initializing ThreadSpecific. |
| class Zone { |
| public: |
| // "name" must be a string literal (see StringOrigin::Get). |
| HWY_NOINLINE explicit Zone(const char* name) { |
| HWY_FENCE; |
| ThreadSpecific* HWY_RESTRICT thread_specific = StaticThreadSpecific(); |
| if (HWY_UNLIKELY(thread_specific == nullptr)) { |
| // Ensure the CPU supports our timer. |
| char cpu[100]; |
| if (!platform::HaveTimerStop(cpu)) { |
| HWY_ABORT("CPU %s is too old for PROFILER_ENABLED=1, exiting", cpu); |
| } |
| |
| thread_specific = StaticThreadSpecific() = Threads().Add(name); |
| // Must happen after setting StaticThreadSpecific, because ComputeOverhead |
| // also calls Zone(). |
| thread_specific->ComputeOverhead(); |
| } |
| |
| // (Capture timestamp ASAP, not inside WriteEntry.) |
| HWY_FENCE; |
| const uint64_t timestamp = timer::Start(); |
| thread_specific->WriteEntry(name, timestamp); |
| } |
| |
| HWY_NOINLINE ~Zone() { |
| HWY_FENCE; |
| const uint64_t timestamp = timer::Stop(); |
| StaticThreadSpecific()->WriteExit(timestamp); |
| HWY_FENCE; |
| } |
| |
| // Call exactly once after all threads have exited all zones. |
| static void PrintResults() { Threads().PrintResults(); } |
| |
| private: |
| // Returns reference to the thread's ThreadSpecific pointer (initially null). |
| // Function-local static avoids needing a separate definition. |
| static ThreadSpecific*& StaticThreadSpecific() { |
| static thread_local ThreadSpecific* thread_specific; |
| return thread_specific; |
| } |
| |
| // Returns the singleton ThreadList. Non time-critical. |
| static ThreadList& Threads() { |
| static ThreadList threads_; |
| return threads_; |
| } |
| }; |
| |
| // Creates a zone starting from here until the end of the current scope. |
| // Timestamps will be recorded when entering and exiting the zone. |
| // "name" must be a string literal, which is ensured by merging with "". |
| #define PROFILER_ZONE(name) \ |
| HWY_FENCE; \ |
| const hwy::Zone zone("" name); \ |
| HWY_FENCE |
| |
| // Creates a zone for an entire function (when placed at its beginning). |
| // Shorter/more convenient than PROFILER_ZONE. |
| #define PROFILER_FUNC \ |
| HWY_FENCE; \ |
| const hwy::Zone zone(__func__); \ |
| HWY_FENCE |
| |
| #define PROFILER_PRINT_RESULTS hwy::Zone::PrintResults |
| |
| inline void ThreadSpecific::ComputeOverhead() { |
|   // Self-overhead: cycles elapsed between a zone's start and end timestamps |
|   // when the zone body is empty. Even with frequency throttling disabled, |
|   // this has a multimodal distribution, with modes including 32, 34, 48, 52, |
|   // 59 and 62 cycles. |
| uint64_t self_overhead; |
| { |
| const size_t kNumSamples = 32; |
| uint32_t samples[kNumSamples]; |
| for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { |
| const size_t kNumDurations = 1024; |
| uint32_t durations[kNumDurations]; |
| |
| for (size_t idx_duration = 0; idx_duration < kNumDurations; |
| ++idx_duration) { |
| { |
| PROFILER_ZONE("Dummy Zone (never shown)"); |
| } |
| const uint64_t duration = results_.ZoneDuration(buffer_); |
| buffer_size_ = 0; |
| durations[idx_duration] = static_cast<uint32_t>(duration); |
| HWY_DASSERT(num_packets_ == 0); |
| } |
| robust_statistics::CountingSort(durations, kNumDurations); |
| samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations); |
| } |
| // Median. |
| robust_statistics::CountingSort(samples, kNumSamples); |
| self_overhead = samples[kNumSamples / 2]; |
| if (PROFILER_PRINT_OVERHEAD) { |
| printf("Overhead: %.0f\n", static_cast<double>(self_overhead)); |
| } |
| results_.SetSelfOverhead(self_overhead); |
| } |
| |
|   // Child overhead: cycles spent in zone machinery before the start timestamp |
|   // is captured and after the end timestamp; this time shows up in the |
|   // enclosing (parent) zone and is deducted during analysis. |
| const size_t kNumSamples = 32; |
| uint32_t samples[kNumSamples]; |
| for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { |
| const size_t kNumDurations = 16; |
| uint32_t durations[kNumDurations]; |
| for (size_t idx_duration = 0; idx_duration < kNumDurations; |
| ++idx_duration) { |
| const size_t kReps = 10000; |
| // Analysis time should not be included => must fit within buffer. |
| HWY_DASSERT(kReps * 2 < max_packets_); |
| std::atomic_thread_fence(std::memory_order_seq_cst); |
| const uint64_t t0 = timer::Start(); |
| for (size_t i = 0; i < kReps; ++i) { |
| PROFILER_ZONE("Dummy"); |
| } |
| FlushStream(); |
| const uint64_t t1 = timer::Stop(); |
| HWY_DASSERT(num_packets_ + buffer_size_ == kReps * 2); |
| buffer_size_ = 0; |
| num_packets_ = 0; |
| const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps; |
| durations[idx_duration] = |
| static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead)); |
| } |
| robust_statistics::CountingSort(durations, kNumDurations); |
| samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations); |
| } |
| robust_statistics::CountingSort(samples, kNumSamples); |
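|   // 90th percentile of the sorted samples. |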
| const uint64_t child_overhead = samples[9 * kNumSamples / 10]; |
| if (PROFILER_PRINT_OVERHEAD) { |
| printf("Child overhead: %.0f\n", static_cast<double>(child_overhead)); |
| } |
| results_.SetChildOverhead(child_overhead); |
| } |
| |
| #pragma pack(pop) |
| |
| } // namespace hwy |
| |
| #endif // PROFILER_ENABLED || HWY_IDE |
| |
| #if !PROFILER_ENABLED && !HWY_IDE |
| #define PROFILER_ZONE(name) |
| #define PROFILER_FUNC |
| #define PROFILER_PRINT_RESULTS() |
| #endif |
| |
| #endif // HIGHWAY_HWY_PROFILER_H_ |