| // Copyright 2024 Google LLC |
| // SPDX-License-Identifier: Apache-2.0 |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #ifndef HIGHWAY_HWY_CONTRIB_THREAD_POOL_FUTEX_H_ |
| #define HIGHWAY_HWY_CONTRIB_THREAD_POOL_FUTEX_H_ |
| |
| // Keyed event (futex): kernel queue of blocked threads, identified by the |
| // address of an atomic u32 called `current` within the same process (do NOT |
| // use with shared-memory mappings). |
| // |
| // Futex equivalents: https://outerproduct.net/futex-dictionary.html; we |
| // support Linux/Emscripten/Apple/Windows and C++20 std::atomic::wait, plus a |
| // NanoSleep fallback. |
| |
| #include <time.h> |
| |
| #include <atomic> |
| #include <climits> // INT_MAX |
| |
| #include "third_party/highway/hwy/base.h" |
| |
| #if HWY_ARCH_WASM |
| #include <emscripten/threading.h> |
| #include <math.h> // INFINITY |
| |
| #elif HWY_OS_LINUX |
| #include <errno.h> // IWYU pragma: keep |
| #include <linux/futex.h> // FUTEX_* |
| #include <pthread.h> |
| #include <sys/syscall.h> // SYS_* |
| #include <unistd.h> |
| // Android may not declare these: |
| #ifndef SYS_futex |
| #ifdef SYS_futex_time64 // 32-bit with 64-bit time_t |
| #define SYS_futex SYS_futex_time64 |
| #else |
| #define SYS_futex __NR_futex |
| #endif // SYS_futex_time64 |
| #endif // SYS_futex |
| #ifndef FUTEX_WAIT_PRIVATE |
| #define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | 128) |
| #endif |
| #ifndef FUTEX_WAKE_PRIVATE |
| #define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | 128) |
| #endif |
| |
| #elif HWY_OS_APPLE && !defined(HWY_DISABLE_FUTEX) |
| // These are private APIs, so add an opt-out. |
| extern "C" { |
| int __ulock_wait(uint32_t op, void* address, uint64_t val, uint32_t max_us); |
| int __ulock_wake(uint32_t op, void* address, uint64_t zero); |
| } // extern "C" |
| #define UL_COMPARE_AND_WAIT 1 |
| #define ULF_WAKE_ALL 0x00000100 |
| |
| #elif HWY_OS_WIN && !defined(HWY_DISABLE_FUTEX) |
| // WakeByAddressAll requires Windows 8, so add an opt-out. |
| #ifndef NOMINMAX |
| #define NOMINMAX |
| #endif // NOMINMAX |
| #ifndef WIN32_LEAN_AND_MEAN |
| #define WIN32_LEAN_AND_MEAN |
| #endif // WIN32_LEAN_AND_MEAN |
| #include <windows.h> |
| #if HWY_COMPILER_MSVC || HWY_COMPILER_CLANGCL |
| #pragma comment(lib, "synchronization.lib") |
| #endif |
| |
| #elif HWY_CXX_LANG < 202002L // NOT C++20, which has native support |
| #define HWY_FUTEX_SLEEP |
| #endif |
| |
| namespace hwy { |
| |
| // Attempts to pause for the specified nanoseconds, though the resolution is |
| // closer to 0.1 microseconds. Returns false if no wait happened. Thread-safe. |
| static inline bool NanoSleep(uint64_t ns) { |
| #if HWY_OS_WIN |
| static thread_local HANDLE hTimer = nullptr; |
| if (HWY_UNLIKELY(hTimer == nullptr)) { |
| // Must be manual reset: auto-reset would immediately signal after the next |
| // SetWaitableTimer. |
| hTimer = CreateWaitableTimer(nullptr, TRUE, nullptr); |
| if (hTimer == nullptr) return false; |
| } |
| |
| // Negative means relative, in units of 100 ns. |
| LARGE_INTEGER time; |
| time.QuadPart = -static_cast<LONGLONG>(ns / 100); |
| const LONG period = 0; // signal once |
| if (!SetWaitableTimer(hTimer, &time, period, nullptr, nullptr, FALSE)) { |
| return false; |
| } |
| |
| (void)WaitForSingleObject(hTimer, INFINITE); |
| return true; |
| #else |
| timespec duration; |
| duration.tv_sec = static_cast<time_t>(ns / 1000000000); |
| duration.tv_nsec = static_cast<decltype(duration.tv_nsec)>(ns % 1000000000); |
| timespec remainder; |
| // Repeat if interrupted by a signal. Note that the remainder may be rounded |
| // up, which could cause an infinite loop if continually interrupted. Using |
| // clock_nanosleep would work, but we'd have to get the current time. We |
| // assume durations are short, and instead just cap the number of retries. |
| for (int rep = 0; rep < 3; ++rep) { |
| if (nanosleep(&duration, &remainder) == 0 || errno != EINTR) break; |
| duration = remainder; |
| } |
| return true; |
| #endif |
| } |
| |
| // Waits until `current != prev` and returns the new value. May return |
| // immediately if `current` already changed, or after blocking and waking. |
| static inline uint32_t BlockUntilDifferent( |
| const uint32_t prev, const std::atomic<uint32_t>& current) { |
| const auto acq = std::memory_order_acquire; |
| |
| #if HWY_ARCH_WASM |
| // It is always safe to cast to void. |
| volatile void* address = |
| const_cast<volatile void*>(static_cast<const volatile void*>(¤t)); |
| const double max_ms = INFINITY; |
| for (;;) { |
| const uint32_t next = current.load(acq); |
| if (next != prev) return next; |
| const int ret = emscripten_futex_wait(address, prev, max_ms); |
| HWY_DASSERT(ret >= 0); |
| (void)ret; |
| } |
| |
| #elif HWY_OS_LINUX |
| // Safe to cast because std::atomic is a standard layout type. |
| const uint32_t* address = reinterpret_cast<const uint32_t*>(¤t); |
| // _PRIVATE requires this only be used in the same process, and avoids |
| // virtual->physical lookups and atomic reference counting. |
| const int op = FUTEX_WAIT_PRIVATE; |
| for (;;) { |
| const uint32_t next = current.load(acq); |
| if (next != prev) return next; |
| // timeout=null may prevent interrupts via signal. No lvalue because |
| // the timespec type is only standardized since C++17 or C11. |
| const auto ret = syscall(SYS_futex, address, op, prev, nullptr, nullptr, 0); |
| if (ret == -1) { |
| HWY_DASSERT(errno == EAGAIN); // otherwise an actual error |
| } |
| } |
| |
| #elif HWY_OS_WIN && !defined(HWY_DISABLE_FUTEX) |
| // It is always safe to cast to void. |
| volatile void* address = |
| const_cast<volatile void*>(static_cast<const volatile void*>(¤t)); |
| // API is not const-correct, but only loads from the pointer. |
| PVOID pprev = const_cast<void*>(static_cast<const void*>(&prev)); |
| const DWORD max_ms = INFINITE; |
| for (;;) { |
| const uint32_t next = current.load(acq); |
| if (next != prev) return next; |
| const BOOL ok = WaitOnAddress(address, pprev, sizeof(prev), max_ms); |
| HWY_DASSERT(ok); |
| (void)ok; |
| } |
| |
| #elif HWY_OS_APPLE && !defined(HWY_DISABLE_FUTEX) |
| // It is always safe to cast to void. |
| void* address = const_cast<void*>(static_cast<const void*>(¤t)); |
| for (;;) { |
| const uint32_t next = current.load(acq); |
| if (next != prev) return next; |
| __ulock_wait(UL_COMPARE_AND_WAIT, address, prev, 0); |
| } |
| |
| #elif defined(HWY_FUTEX_SLEEP) |
| for (;;) { |
| const uint32_t next = current.load(acq); |
| if (next != prev) return next; |
| NanoSleep(2000); |
| } |
| |
| #elif HWY_CXX_LANG >= 202002L |
| current.wait(prev, acq); // No spurious wakeup. |
| const uint32_t next = current.load(acq); |
| HWY_DASSERT(next != prev); |
| return next; |
| |
| #else |
| #error "Logic error, should have reached HWY_FUTEX_SLEEP" |
| #endif // HWY_OS_* |
| } // BlockUntilDifferent |
| |
| // Wakes all threads, if any, that are waiting because they called |
| // `BlockUntilDifferent` with the same `current`. |
| static inline void WakeAll(std::atomic<uint32_t>& current) { |
| #if HWY_ARCH_WASM |
| // It is always safe to cast to void. |
| volatile void* address = static_cast<volatile void*>(¤t); |
| const int max_to_wake = INT_MAX; // actually signed |
| const int ret = emscripten_futex_wake(address, max_to_wake); |
| HWY_DASSERT(ret >= 0); |
| (void)ret; |
| |
| #elif HWY_OS_LINUX |
| // Safe to cast because std::atomic is a standard layout type. |
| uint32_t* address = reinterpret_cast<uint32_t*>(¤t); |
| const int max_to_wake = INT_MAX; // actually signed |
| const auto ret = syscall(SYS_futex, address, FUTEX_WAKE_PRIVATE, max_to_wake, |
| nullptr, nullptr, 0); |
| HWY_DASSERT(ret >= 0); // number woken |
| (void)ret; |
| |
| #elif HWY_OS_WIN && !defined(HWY_DISABLE_FUTEX) |
| // It is always safe to cast to void. |
| void* address = static_cast<void*>(¤t); |
| WakeByAddressAll(address); |
| |
| #elif HWY_OS_APPLE && !defined(HWY_DISABLE_FUTEX) |
| // It is always safe to cast to void. |
| void* address = static_cast<void*>(¤t); |
| __ulock_wake(UL_COMPARE_AND_WAIT | ULF_WAKE_ALL, address, 0); |
| |
| #elif defined(HWY_FUTEX_SLEEP) |
| // NanoSleep loop does not require wakeup. |
| (void)current; |
| #elif HWY_CXX_LANG >= 202002L |
| current.notify_all(); |
| |
| #else |
| #error "Logic error, should have reached HWY_FUTEX_SLEEP" |
| #endif |
| } // WakeAll |
| |
| } // namespace hwy |
| |
| #endif // HIGHWAY_HWY_CONTRIB_THREAD_POOL_FUTEX_H_ |