|  | // Copyright 2024 Google LLC | 
|  | // SPDX-License-Identifier: Apache-2.0 | 
|  | // | 
|  | // Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | // you may not use this file except in compliance with the License. | 
|  | // You may obtain a copy of the License at | 
|  | // | 
|  | //      http://www.apache.org/licenses/LICENSE-2.0 | 
|  | // | 
|  | // Unless required by applicable law or agreed to in writing, software | 
|  | // distributed under the License is distributed on an "AS IS" BASIS, | 
|  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | // See the License for the specific language governing permissions and | 
|  | // limitations under the License. | 
|  |  | 
|  | #ifndef HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_ | 
|  | #define HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_ | 
|  |  | 
|  | // OS-specific functions for processor topology and thread affinity. | 
|  |  | 
|  | #include <stddef.h> | 
|  |  | 
|  | #include <vector> | 
|  |  | 
|  | #include "third_party/highway/hwy/base.h" | 
|  | #include "third_party/highway/hwy/bit_set.h" | 
|  |  | 
|  | namespace hwy { | 
|  |  | 
|  | // Returns false if std::thread should not be used. | 
|  | HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport(); | 
|  |  | 
// Largest number of logical processors (hyperthreads count individually) that
// this API can describe. The value matches glibc.
static constexpr size_t kMaxLogicalProcessors = 1024;
|  |  | 
|  | // Set used by Get/SetThreadAffinity. | 
|  | using LogicalProcessorSet = BitSet4096<kMaxLogicalProcessors>; | 
|  |  | 
|  | // Returns false, or sets `lps` to all logical processors which are online and | 
|  | // available to the current thread. | 
|  | HWY_CONTRIB_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps); | 
|  |  | 
|  | // Ensures the current thread can only run on the logical processors in `lps`. | 
|  | // Returns false if not supported (in particular on Apple), or if the | 
|  | // intersection between `lps` and `GetThreadAffinity` is the empty set. | 
|  | HWY_CONTRIB_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps); | 
|  |  | 
|  | // Returns false, or ensures the current thread will only run on `lp`, which | 
|  | // must not exceed `TotalLogicalProcessors`. Note that this merely calls | 
|  | // `SetThreadAffinity`, see the comment there. | 
|  | static inline bool PinThreadToLogicalProcessor(size_t lp) { | 
|  | LogicalProcessorSet lps; | 
|  | lps.Set(lp); | 
|  | return SetThreadAffinity(lps); | 
|  | } | 
|  |  | 
|  | // Returns 1 if unknown, otherwise the total number of logical processors | 
|  | // provided by the hardware clamped to `kMaxLogicalProcessors`. | 
|  | // These processors are not necessarily all usable; you can determine which are | 
|  | // via GetThreadAffinity(). | 
|  | HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors(); | 
|  |  | 
|  | struct Topology { | 
|  | // Caller must check packages.empty(); if so, do not use any fields. | 
|  | HWY_CONTRIB_DLLEXPORT Topology(); | 
|  |  | 
|  | // Clique of cores with lower latency to each other. On Apple M1 these are | 
|  | // four cores sharing an L2. On Zen4 these 'CCX' are up to eight cores sharing | 
|  | // an L3 and a memory controller, or for Zen4c up to 16 and half the L3 size. | 
|  | struct Cluster { | 
|  | LogicalProcessorSet lps; | 
|  | uint64_t private_kib = 0;  // 0 if unknown | 
|  | uint64_t shared_kib = 0;   // 0 if unknown | 
|  | uint64_t reserved1 = 0; | 
|  | uint64_t reserved2 = 0; | 
|  | uint64_t reserved3 = 0; | 
|  | }; | 
|  |  | 
|  | struct Core { | 
|  | LogicalProcessorSet lps; | 
|  | uint64_t reserved = 0; | 
|  | }; | 
|  |  | 
|  | struct Package { | 
|  | std::vector<Cluster> clusters; | 
|  | std::vector<Core> cores; | 
|  | }; | 
|  |  | 
|  | std::vector<Package> packages; | 
|  |  | 
|  | // Several hundred instances, so prefer a compact representation. | 
|  | #pragma pack(push, 1) | 
|  | struct LP { | 
|  | uint16_t cluster = 0;  // < packages[package].clusters.size() | 
|  | uint16_t core = 0;     // < packages[package].cores.size() | 
|  | uint8_t package = 0;   // < packages.size() | 
|  | uint8_t smt = 0;       // < packages[package].cores[core].lps.Count() | 
|  | uint8_t node = 0; | 
|  |  | 
|  | uint8_t reserved = 0; | 
|  | }; | 
|  | #pragma pack(pop) | 
|  | std::vector<LP> lps;  // size() == TotalLogicalProcessors(). | 
|  | }; | 
|  |  | 
// Parameters of a single cache level. There is some overlap with
// `HWY_ALIGNMENT`, which is intended (but not guaranteed) to be an upper bound
// for L1/L2 line sizes, and with
// `Topology::Cluster::private_kib/shared_kib`, which are intended (but not
// guaranteed) to be the L2/L3 sizes. Knowing the exact parameters, including
// the ways of associativity, can be useful for modeling cache conflicts.
//
// The fields are packed so that the array of `Cache` fits in a typical cache
// line.
#pragma pack(push, 1)
struct Cache {
  // Arbitrary upper bound, used for sanity checking.
  static constexpr uint16_t kMaxAssociativity = 128;

  // Size in KiB; zero if this level does not exist. For shared caches, this is
  // the *per-core* portion.
  uint32_t size_kib = 0;
  // Number of lines / associativity; likewise the per-core portion.
  uint32_t sets = 0;
  uint16_t bytes_per_line = 0;
  // Number of ways.
  uint16_t associativity = 0;
  // Usually 1 for L1.
  uint16_t cores_sharing = 0;
  uint16_t reserved = 0;
};
static_assert(sizeof(Cache) == 16, "Unexpected size");
#pragma pack(pop)
|  |  | 
|  | // Returns null if unknown, otherwise pointer to an array of `Cache` instances, | 
|  | // where entry 0 is reserved, entry 1 describes the L1 data cache, entry 2 | 
|  | // describes the (possibly unified or shared) L2, and entry 3 describes the L3 | 
|  | // if its `size_kib != 0`. | 
|  | // | 
|  | // Initializes on-demand, which has some overhead for thread safety, hence | 
|  | // callers should cache the result. | 
|  | HWY_CONTRIB_DLLEXPORT const Cache* DataCaches(); | 
|  |  | 
|  | }  // namespace hwy | 
|  |  | 
|  | #endif  // HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_ |