// third_party/highway/hwy/contrib/thread_pool/topology.h - aom - Git at Google

 // Copyright 2024 Google LLC
 // SPDX-License-Identifier: Apache-2.0
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #ifndef HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_
 #define HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_

 // OS-specific functions for processor topology and thread affinity.

 #include <stddef.h>

 #include <vector>

 #include "third_party/highway/hwy/base.h"
 #include "third_party/highway/hwy/bit_set.h"

 namespace hwy {

 // Returns false if std::thread should not be used.
 // NOTE(review): the conditions under which this is false are decided by the
 // implementation, which is not visible in this header.
 HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport();

 // Upper bound on logical processors, including hyperthreads. Also the
 // template argument of `LogicalProcessorSet` and the limit to which
 // `TotalLogicalProcessors()` is clamped.
 static constexpr size_t kMaxLogicalProcessors = 1024;  // matches glibc

 // Set of logical processors used by Get/SetThreadAffinity and by the
 // `Topology` structures; sized by `kMaxLogicalProcessors`.
 using LogicalProcessorSet = BitSet4096<kMaxLogicalProcessors>;

 // Returns false, or sets `lps` to all logical processors which are online and
 // available to the current thread. `lps` is purely an output argument.
 // NOTE(review): the contents of `lps` after a false return are not specified
 // here - confirm against the implementation before relying on them.
 HWY_CONTRIB_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps);

 // Ensures the current thread can only run on the logical processors in `lps`.
 // Returns false if not supported (in particular on Apple), or if the
 // intersection between `lps` and `GetThreadAffinity` is the empty set.
 HWY_CONTRIB_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps);

 // Returns false, or ensures the current thread will only run on `lp`, which
 // must not exceed `TotalLogicalProcessors`. Note that this merely calls
 // `SetThreadAffinity`, see the comment there.
 static inline bool PinThreadToLogicalProcessor(size_t lp) {
   LogicalProcessorSet lps;
   lps.Set(lp);
   return SetThreadAffinity(lps);
 }

 // Returns 1 if unknown, otherwise the total number of logical processors
 // provided by the hardware clamped to `kMaxLogicalProcessors`.
 // These processors are not necessarily all usable; you can determine which are
 // via GetThreadAffinity().
 // This value is also the documented length of `Topology::lps`.
 HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors();

 // Describes the processor hierarchy: `packages` hold clusters and cores, and
 // `lps` records, for every logical processor, where it sits in that hierarchy.
 struct Topology {
   // Caller must check packages.empty(); if so, do not use any fields.
   HWY_CONTRIB_DLLEXPORT Topology();

   // Clique of cores with lower latency to each other. On Apple M1 these are
   // four cores sharing an L2. On Zen4 these 'CCX' are up to eight cores sharing
   // an L3 and a memory controller, or for Zen4c up to 16 and half the L3 size.
   struct Cluster {
     LogicalProcessorSet lps;  // presumably the members of this cluster
     // Sizes in KiB; intended, but not guaranteed, to be the L2/L3 sizes.
     uint64_t private_kib = 0;  // 0 if unknown
     uint64_t shared_kib = 0;   // 0 if unknown
     // Placeholders, zero-initialized.
     uint64_t reserved1 = 0;
     uint64_t reserved2 = 0;
     uint64_t reserved3 = 0;
   };

   struct Core {
     // Logical processors of this core; `LP::smt` is bounded by its Count().
     LogicalProcessorSet lps;
     uint64_t reserved = 0;  // placeholder, zero-initialized
   };

   // One package with its clusters and cores; `LP::cluster` and `LP::core`
   // index into these vectors.
   struct Package {
     std::vector<Cluster> clusters;
     std::vector<Core> cores;
   };

   // Indexed by `LP::package`. Empty if the topology is unusable, see the
   // constructor comment.
   std::vector<Package> packages;

   // Several hundred instances, so prefer a compact representation: byte-packed,
   // eight bytes per logical processor.
 #pragma pack(push, 1)
   struct LP {
     uint16_t cluster = 0;  // < packages[package].clusters.size()
     uint16_t core = 0;     // < packages[package].cores.size()
     uint8_t package = 0;   // < packages.size()
     uint8_t smt = 0;       // < packages[package].cores[core].lps.Count()
     uint8_t node = 0;      // NOTE(review): presumably the NUMA node - confirm
     uint8_t reserved = 0;  // placeholder; brings the record to eight bytes
   };
 #pragma pack(pop)
   // Pins the packed layout at compile time so accidental growth of the
   // per-processor record is caught immediately.
   static_assert(sizeof(LP) == 8, "Unexpected size");

   std::vector<LP> lps;  // size() == TotalLogicalProcessors().
 };

 #pragma pack(push, 1)
 // Parameters of one data cache level. This partially duplicates other
 // information: `HWY_ALIGNMENT` is intended, but not guaranteed, to be an upper
 // bound for L1/L2 line sizes, and `Topology::Cluster::private_kib/shared_kib`
 // are intended, but not guaranteed, to be the L2/L3 sizes. The exact values
 // here, including the ways of associativity, can be useful for modeling cache
 // conflicts.
 //
 // Fields are packed so that the array of `Cache` fits in a typical cache line.
 struct Cache {
   // Capacity in KiB. Zero if the level does not exist; for shared caches this
   // is the *per-core* portion.
   uint32_t size_kib{0};
   // Number of sets (lines / associativity), likewise the per-core portion.
   uint32_t sets{0};
   uint16_t bytes_per_line{0};
   uint16_t associativity{0};  // number of ways
   uint16_t cores_sharing{0};  // usually 1 for L1
   uint16_t reserved{0};

   // Arbitrary upper bound for sanity checking.
   static constexpr uint16_t kMaxAssociativity = 128;
 };
 static_assert(16 == sizeof(Cache), "Unexpected size");
 #pragma pack(pop)

 // Returns null if unknown, otherwise pointer to an array of `Cache` instances,
 // where entry 0 is reserved, entry 1 describes the L1 data cache, entry 2
 // describes the (possibly unified or shared) L2, and entry 3 describes the L3
 // if its `size_kib != 0`.
 //
 // Callers must therefore null-check the result before indexing, and check
 // `size_kib` of entry 3 before treating it as an L3 description.
 //
 // Initializes on-demand, which has some overhead for thread safety, hence
 // callers should cache the result.
 HWY_CONTRIB_DLLEXPORT const Cache* DataCaches();

 }  // namespace hwy

 #endif  // HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_
	// Copyright 2024 Google LLC
	// SPDX-License-Identifier: Apache-2.0
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#ifndef HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_
	#define HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_

	// OS-specific functions for processor topology and thread affinity.

	#include <stddef.h>

	#include <vector>

	#include "third_party/highway/hwy/base.h"
	#include "third_party/highway/hwy/bit_set.h"

	namespace hwy {

	// Returns false if std::thread should not be used.
	// NOTE(review): the conditions under which this is false are decided by the
	// implementation, which is not visible in this header.
	HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport();

	// Upper bound on logical processors, including hyperthreads. Also the
	// template argument of `LogicalProcessorSet` and the limit to which
	// `TotalLogicalProcessors()` is clamped.
	static constexpr size_t kMaxLogicalProcessors = 1024; // matches glibc

	// Set of logical processors used by Get/SetThreadAffinity and by the
	// `Topology` structures; sized by `kMaxLogicalProcessors`.
	using LogicalProcessorSet = BitSet4096<kMaxLogicalProcessors>;

	// Returns false, or sets `lps` to all logical processors which are online and
	// available to the current thread. `lps` is purely an output argument.
	// NOTE(review): the contents of `lps` after a false return are not specified
	// here - confirm against the implementation before relying on them.
	HWY_CONTRIB_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps);

	// Ensures the current thread can only run on the logical processors in `lps`.
	// Returns false if not supported (in particular on Apple), or if the
	// intersection between `lps` and `GetThreadAffinity` is the empty set.
	HWY_CONTRIB_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps);

	// Returns false, or ensures the current thread will only run on `lp`, which
	// must not exceed `TotalLogicalProcessors`. Note that this merely calls
	// `SetThreadAffinity`, see the comment there.
	static inline bool PinThreadToLogicalProcessor(size_t lp) {
	LogicalProcessorSet lps;
	lps.Set(lp);
	return SetThreadAffinity(lps);
	}

	// Returns 1 if unknown, otherwise the total number of logical processors
	// provided by the hardware clamped to `kMaxLogicalProcessors`.
	// These processors are not necessarily all usable; you can determine which are
	// via GetThreadAffinity().
	// This value is also the documented length of `Topology::lps`.
	HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors();

	// Describes the processor hierarchy: `packages` hold clusters and cores, and
	// `lps` records, for every logical processor, where it sits in that hierarchy.
	struct Topology {
	  // Caller must check packages.empty(); if so, do not use any fields.
	  HWY_CONTRIB_DLLEXPORT Topology();

	  // Clique of cores with lower latency to each other. On Apple M1 these are
	  // four cores sharing an L2. On Zen4 these 'CCX' are up to eight cores sharing
	  // an L3 and a memory controller, or for Zen4c up to 16 and half the L3 size.
	  struct Cluster {
	    LogicalProcessorSet lps;  // presumably the members of this cluster
	    // Sizes in KiB; intended, but not guaranteed, to be the L2/L3 sizes.
	    uint64_t private_kib = 0;  // 0 if unknown
	    uint64_t shared_kib = 0;   // 0 if unknown
	    // Placeholders, zero-initialized.
	    uint64_t reserved1 = 0;
	    uint64_t reserved2 = 0;
	    uint64_t reserved3 = 0;
	  };

	  struct Core {
	    // Logical processors of this core; `LP::smt` is bounded by its Count().
	    LogicalProcessorSet lps;
	    uint64_t reserved = 0;  // placeholder, zero-initialized
	  };

	  // One package with its clusters and cores; `LP::cluster` and `LP::core`
	  // index into these vectors.
	  struct Package {
	    std::vector<Cluster> clusters;
	    std::vector<Core> cores;
	  };

	  // Indexed by `LP::package`. Empty if the topology is unusable, see the
	  // constructor comment.
	  std::vector<Package> packages;

	  // Several hundred instances, so prefer a compact representation: byte-packed,
	  // eight bytes per logical processor.
	#pragma pack(push, 1)
	  struct LP {
	    uint16_t cluster = 0;  // < packages[package].clusters.size()
	    uint16_t core = 0;     // < packages[package].cores.size()
	    uint8_t package = 0;   // < packages.size()
	    uint8_t smt = 0;       // < packages[package].cores[core].lps.Count()
	    uint8_t node = 0;      // NOTE(review): presumably the NUMA node - confirm
	    uint8_t reserved = 0;  // placeholder; brings the record to eight bytes
	  };
	#pragma pack(pop)
	  // Pins the packed layout at compile time so accidental growth of the
	  // per-processor record is caught immediately.
	  static_assert(sizeof(LP) == 8, "Unexpected size");

	  std::vector<LP> lps;  // size() == TotalLogicalProcessors().
	};

	#pragma pack(push, 1)
	// Parameters of one data cache level. This partially duplicates other
	// information: `HWY_ALIGNMENT` is intended, but not guaranteed, to be an upper
	// bound for L1/L2 line sizes, and `Topology::Cluster::private_kib/shared_kib`
	// are intended, but not guaranteed, to be the L2/L3 sizes. The exact values
	// here, including the ways of associativity, can be useful for modeling cache
	// conflicts.
	//
	// Fields are packed so that the array of `Cache` fits in a typical cache line.
	struct Cache {
	  // Capacity in KiB. Zero if the level does not exist; for shared caches this
	  // is the per-core portion.
	  uint32_t size_kib{0};
	  // Number of sets (lines / associativity), likewise the per-core portion.
	  uint32_t sets{0};
	  uint16_t bytes_per_line{0};
	  uint16_t associativity{0};  // number of ways
	  uint16_t cores_sharing{0};  // usually 1 for L1
	  uint16_t reserved{0};

	  // Arbitrary upper bound for sanity checking.
	  static constexpr uint16_t kMaxAssociativity = 128;
	};
	static_assert(16 == sizeof(Cache), "Unexpected size");
	#pragma pack(pop)

	// Returns null if unknown, otherwise pointer to an array of `Cache` instances,
	// where entry 0 is reserved, entry 1 describes the L1 data cache, entry 2
	// describes the (possibly unified or shared) L2, and entry 3 describes the L3
	// if its `size_kib != 0`.
	//
	// Callers must therefore null-check the result before indexing, and check
	// `size_kib` of entry 3 before treating it as an L3 description.
	//
	// Initializes on-demand, which has some overhead for thread safety, hence
	// callers should cache the result.
	HWY_CONTRIB_DLLEXPORT const Cache* DataCaches();

	} // namespace hwy

	#endif // HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_