| // Copyright 2025 Google LLC |
| // SPDX-License-Identifier: Apache-2.0 |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #ifndef HIGHWAY_HWY_CONTRIB_THREAD_POOL_SPIN_H_ |
| #define HIGHWAY_HWY_CONTRIB_THREAD_POOL_SPIN_H_ |
| |
| // Relatively power-efficient spin lock for low-latency synchronization. |
| |
| #include <stdint.h> |
| |
| #include <atomic> |
| |
| #include "third_party/highway/hwy/base.h" |
| #include "third_party/highway/hwy/cache_control.h" // Pause |
| |
| #ifndef HWY_ENABLE_MONITORX // allow override |
| // Clang 3.9 suffices for mwaitx, but the target pragma requires 9.0. |
| #if HWY_ARCH_X86 && ((HWY_COMPILER_CLANG >= 900) || \ |
| (HWY_COMPILER_GCC_ACTUAL >= 502) || defined(__MWAITX__)) |
| #define HWY_ENABLE_MONITORX 1 |
| #else |
| #define HWY_ENABLE_MONITORX 0 |
| #endif |
| #endif // HWY_ENABLE_MONITORX |
| |
| #ifndef HWY_ENABLE_UMONITOR // allow override |
| #if HWY_ARCH_X86 && ((HWY_COMPILER_CLANG >= 900) || \ |
| (HWY_COMPILER_GCC_ACTUAL >= 901) || defined(__WAITPKG__)) |
| #define HWY_ENABLE_UMONITOR 1 |
| #else |
| #define HWY_ENABLE_UMONITOR 0 |
| #endif |
| #endif // HWY_ENABLE_UMONITOR |
| |
| // Inline assembly is preferred because it allows inlining of `UntilDifferent` |
| // etc, but we also support intrinsics for MSVC. |
| #ifndef HWY_ENABLE_SPIN_ASM // allow override |
| #if (HWY_COMPILER_CLANG || HWY_COMPILER_GCC) && HWY_ARCH_X86_64 |
| #define HWY_ENABLE_SPIN_ASM 1 |
| #else |
| #define HWY_ENABLE_SPIN_ASM 0 |
| #endif |
| #endif // HWY_ENABLE_SPIN_ASM |
| |
| #if HWY_ENABLE_MONITORX || HWY_ENABLE_UMONITOR |
| #if HWY_ENABLE_SPIN_ASM |
| #define HWY_INLINE_SPIN HWY_INLINE // can inline functions with inline assembly |
| #else |
| // Intrinsics require attributes, which prevent inlining. |
| #define HWY_INLINE_SPIN |
| #include <x86intrin.h> |
| #endif // HWY_ENABLE_SPIN_ASM |
| |
| #include "third_party/highway/hwy/x86_cpuid.h" |
| #endif // HWY_ENABLE_MONITORX || HWY_ENABLE_UMONITOR |
| |
| namespace hwy { |
| |
| // Returned by `UntilDifferent` in a single register. |
| struct SpinResult { |
| // We also use u32 because that is all that futex.h supports. |
| uint32_t current; |
| // Number of retries before returning, useful for checking that the |
| // monitor/wait did not just return immediately. |
| uint32_t reps; |
| }; |
| |
| // User-space monitor/wait are supported on Zen2+ AMD and SPR+ Intel. Spin waits |
| // are rarely called from SIMD code, hence we do not integrate this into |
| // `HWY_TARGET` and its runtime dispatch mechanism. Returned by `Type()`, also |
| // used by callers to set the `disabled` argument for `DetectSpin`. |
| enum class SpinType : uint8_t { |
| kMonitorX = 1, // AMD |
| kUMonitor, // Intel |
| kPause, |
| kSentinel // for iterating over all enumerators. Must be last. |
| }; |
| |
| // For printing which is in use. |
| static inline const char* ToString(SpinType type) { |
| switch (type) { |
| case SpinType::kMonitorX: |
| return "MonitorX_C1"; |
| case SpinType::kUMonitor: |
| return "UMonitor_C0.2"; |
| case SpinType::kPause: |
| return "Pause"; |
| case SpinType::kSentinel: |
| return nullptr; |
| default: |
| HWY_UNREACHABLE; |
| } |
| } |
| |
| // Indirect function calls turn out to be too expensive because this is called |
| // multiple times per ThreadPool barrier. We will instead inline the spin and |
| // barrier using policy classes. This one is always available; use it as a |
| // reference for the interface. Note that Pause varies across CPUs: it can be |
| // a no-op, or wait 140 cycles. |
| struct SpinPause { |
| SpinType Type() const { return SpinType::kPause; } |
| |
| // Spins until `watched != prev` and returns the new value, similar to |
| // `BlockUntilDifferent` in `futex.h`. |
| HWY_INLINE SpinResult UntilDifferent( |
| const uint32_t prev, const std::atomic<uint32_t>& watched) const { |
| for (uint32_t reps = 0;; ++reps) { |
| const uint32_t current = watched.load(std::memory_order_acquire); |
| if (current != prev) return SpinResult{current, reps}; |
| hwy::Pause(); |
| } |
| } |
| |
| // Returns number of retries until `watched == expected`. |
| HWY_INLINE size_t UntilEqual(const uint32_t expected, |
| const std::atomic<uint32_t>& watched) const { |
| for (size_t reps = 0;; ++reps) { |
| const uint32_t current = watched.load(std::memory_order_acquire); |
| if (current == expected) return reps; |
| hwy::Pause(); |
| } |
| } |
| }; |
| |
| #if HWY_ENABLE_MONITORX || HWY_IDE |
| #if !HWY_ENABLE_SPIN_ASM |
| HWY_PUSH_ATTRIBUTES("mwaitx") |
| #endif |
| |
| // AMD's user-mode monitor/wait (Zen2+). |
| class SpinMonitorX { |
| public: |
| SpinType Type() const { return SpinType::kMonitorX; } |
| |
| HWY_INLINE_SPIN SpinResult UntilDifferent( |
| const uint32_t prev, const std::atomic<uint32_t>& watched) const { |
| for (uint32_t reps = 0;; ++reps) { |
| uint32_t current = watched.load(std::memory_order_acquire); |
| if (current != prev) return SpinResult{current, reps}; |
| Monitor(&watched); |
| // Double-checked 'lock' to avoid missed events: |
| current = watched.load(std::memory_order_acquire); |
| if (current != prev) return SpinResult{current, reps}; |
| Wait(); |
| } |
| } |
| |
| HWY_INLINE_SPIN size_t UntilEqual( |
| const uint32_t expected, const std::atomic<uint32_t>& watched) const { |
| for (size_t reps = 0;; ++reps) { |
| uint32_t current = watched.load(std::memory_order_acquire); |
| if (current == expected) return reps; |
| Monitor(&watched); |
| // Double-checked 'lock' to avoid missed events: |
| current = watched.load(std::memory_order_acquire); |
| if (current == expected) return reps; |
| Wait(); |
| } |
| } |
| |
| private: |
| static HWY_INLINE void Monitor(const void* addr) { |
| // No extensions/hints currently defined. |
| #if HWY_ENABLE_SPIN_ASM |
| asm volatile("monitorx" ::"a"(addr), "c"(0), "d"(0)); |
| #else |
| _mm_monitorx(const_cast<void*>(addr), 0, 0); |
| #endif |
| } |
| |
| static HWY_INLINE void Wait() { |
| #if HWY_ENABLE_SPIN_ASM |
| // EBX=0 cycles means no timeout/infinite. |
| asm volatile("mwaitx" ::"a"(kHints), "b"(0), "c"(kExtensions)); |
| #else |
| _mm_mwaitx(kExtensions, kHints, /*cycles=*/0); |
| #endif |
| } |
| |
| // 0xF would be C0. Its wakeup latency is less than 0.1 us shorter, and |
| // package power is sometimes actually higher than with Pause. The |
| // difference in spurious wakeups is minor. |
| static constexpr unsigned kHints = 0x0; // C1: a bit deeper than C0 |
| // No timeout required, we assume the mwaitx does not miss stores, see |
| // https://www.usenix.org/system/files/usenixsecurity23-zhang-ruiyi.pdf.] |
| static constexpr unsigned kExtensions = 0; |
| }; |
| |
| #if !HWY_ENABLE_SPIN_ASM |
| HWY_POP_ATTRIBUTES |
| #endif |
| #endif // HWY_ENABLE_MONITORX |
| |
| #if HWY_ENABLE_UMONITOR || HWY_IDE |
| #if !HWY_ENABLE_SPIN_ASM |
| HWY_PUSH_ATTRIBUTES("waitpkg") |
| #endif |
| |
| // Intel's user-mode monitor/wait (SPR+). |
| class SpinUMonitor { |
| public: |
| SpinType Type() const { return SpinType::kUMonitor; } |
| |
| HWY_INLINE_SPIN SpinResult UntilDifferent( |
| const uint32_t prev, const std::atomic<uint32_t>& watched) const { |
| for (uint32_t reps = 0;; ++reps) { |
| uint32_t current = watched.load(std::memory_order_acquire); |
| if (current != prev) return SpinResult{current, reps}; |
| Monitor(&watched); |
| // Double-checked 'lock' to avoid missed events: |
| current = watched.load(std::memory_order_acquire); |
| if (current != prev) return SpinResult{current, reps}; |
| Wait(); |
| } |
| } |
| |
| HWY_INLINE_SPIN size_t UntilEqual( |
| const uint32_t expected, const std::atomic<uint32_t>& watched) const { |
| for (size_t reps = 0;; ++reps) { |
| uint32_t current = watched.load(std::memory_order_acquire); |
| if (current == expected) return reps; |
| Monitor(&watched); |
| // Double-checked 'lock' to avoid missed events: |
| current = watched.load(std::memory_order_acquire); |
| if (current == expected) return reps; |
| Wait(); |
| } |
| } |
| |
| private: |
| static HWY_INLINE void Monitor(const void* addr) { |
| #if HWY_ENABLE_SPIN_ASM |
| asm volatile("umonitor %%rcx" ::"c"(addr)); |
| #else |
| _umonitor(const_cast<void*>(addr)); |
| #endif |
| } |
| |
| static HWY_INLINE void Wait() { |
| #if HWY_ENABLE_SPIN_ASM |
| asm volatile("umwait %%ecx" ::"c"(kControl), "d"(kDeadline >> 32), |
| "a"(kDeadline & 0xFFFFFFFFu)); |
| #else |
| _umwait(kControl, kDeadline); |
| #endif |
| } |
| |
| // 1 would be C0.1. C0.2 has 20x fewer spurious wakeups and additional 4% |
| // package power savings vs Pause on SPR. It comes at the cost of |
| // 0.4-0.6us higher wake latency, but the total is comparable to Zen4. |
| static constexpr unsigned kControl = 0; // C0.2 for deeper sleep |
| static constexpr uint64_t kDeadline = ~uint64_t{0}; // no timeout, see above |
| }; |
| |
| #if !HWY_ENABLE_SPIN_ASM |
| HWY_POP_ATTRIBUTES |
| #endif |
| #endif // HWY_ENABLE_UMONITOR |
| |
| // TODO(janwas): add WFE on Arm. May wake at 10 kHz, but still worthwhile. |
| |
| // Returns the best-available type whose bit in `disabled` is not set. Example: |
| // to disable kUMonitor, pass `1 << static_cast<int>(SpinType::kUMonitor)`. |
| // Ignores `disabled` for `kPause` if it is the only supported and enabled type. |
| // Somewhat expensive, typically called during initialization. |
| static inline SpinType DetectSpin(int disabled = 0) { |
| const auto HWY_MAYBE_UNUSED enabled = [disabled](SpinType type) { |
| return (disabled & (1 << static_cast<int>(type))) == 0; |
| }; |
| |
| #if HWY_ENABLE_MONITORX |
| if (enabled(SpinType::kMonitorX) && x86::IsAMD()) { |
| uint32_t abcd[4]; |
| x86::Cpuid(0x80000001U, 0, abcd); |
| if (x86::IsBitSet(abcd[2], 29)) return SpinType::kMonitorX; |
| } |
| #endif // HWY_ENABLE_MONITORX |
| |
| #if HWY_ENABLE_UMONITOR |
| if (enabled(SpinType::kUMonitor) && x86::MaxLevel() >= 7) { |
| uint32_t abcd[4]; |
| x86::Cpuid(7, 0, abcd); |
| if (x86::IsBitSet(abcd[2], 5)) return SpinType::kUMonitor; |
| } |
| #endif // HWY_ENABLE_UMONITOR |
| |
| if (!enabled(SpinType::kPause)) { |
| HWY_WARN("Ignoring attempt to disable Pause, it is the only option left."); |
| } |
| return SpinType::kPause; |
| } |
| |
| // Calls `func(spin)` for the given `spin_type`. |
| template <class Func> |
| HWY_INLINE void CallWithSpin(SpinType spin_type, Func&& func) { |
| switch (spin_type) { |
| #if HWY_ENABLE_MONITORX |
| case SpinType::kMonitorX: |
| func(SpinMonitorX()); |
| break; |
| #endif |
| #if HWY_ENABLE_UMONITOR |
| case SpinType::kUMonitor: |
| func(SpinUMonitor()); |
| break; |
| #endif |
| case SpinType::kPause: |
| default: |
| func(SpinPause()); |
| break; |
| } |
| } |
| |
| } // namespace hwy |
| |
| #endif // HIGHWAY_HWY_CONTRIB_THREAD_POOL_SPIN_H_ |