|  | // Copyright 2020 Google LLC | 
|  | // SPDX-License-Identifier: Apache-2.0 | 
|  | // | 
|  | // Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | // you may not use this file except in compliance with the License. | 
|  | // You may obtain a copy of the License at | 
|  | // | 
|  | //      http://www.apache.org/licenses/LICENSE-2.0 | 
|  | // | 
|  | // Unless required by applicable law or agreed to in writing, software | 
|  | // distributed under the License is distributed on an "AS IS" BASIS, | 
|  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | // See the License for the specific language governing permissions and | 
|  | // limitations under the License. | 
|  |  | 
|  | #ifndef HIGHWAY_HWY_TARGETS_H_ | 
|  | #define HIGHWAY_HWY_TARGETS_H_ | 
|  |  | 
|  | // Allows opting out of C++ standard library usage, which is not available in | 
|  | // some Compiler Explorer environments. | 
|  | #ifndef HWY_NO_LIBCXX | 
|  | #include <vector> | 
|  | #endif | 
|  |  | 
|  | // For SIMD module implementations and their callers. Defines which targets to | 
|  | // generate and call. | 
|  |  | 
|  | #include "third_party/highway/hwy/base.h" | 
|  | #include "third_party/highway/hwy/detect_targets.h" | 
|  | #include "third_party/highway/hwy/highway_export.h" | 
|  |  | 
|  | #if !defined(HWY_NO_LIBCXX) | 
|  | #include <atomic> | 
|  | #endif | 
|  |  | 
|  | namespace hwy { | 
|  |  | 
|  | // Returns bitfield of enabled targets that are supported on this CPU; there is | 
|  | // always at least one such target, hence the return value is never 0. The | 
|  | // targets returned may change after calling DisableTargets. This function is | 
|  | // always defined, but the HWY_SUPPORTED_TARGETS wrapper may allow eliding | 
|  | // calls to it if there is only a single target enabled. | 
|  | HWY_DLLEXPORT int64_t SupportedTargets(); | 
|  |  | 
// HWY_SUPPORTED_TARGETS evaluates to a call to hwy::SupportedTargets() only if
// more than one target bit is set in HWY_TARGETS; otherwise it is the literal
// HWY_TARGETS. `x & (x - 1)` clears the lowest set bit of x, so a nonzero
// result means at least two bits are set.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
#else
// Zero or one target bit: the compile-time constant suffices.
#define HWY_SUPPORTED_TARGETS HWY_TARGETS
#endif
|  |  | 
|  | // Subsequent SupportedTargets will not return targets whose bit(s) are set in | 
|  | // `disabled_targets`. Exception: if SupportedTargets would return 0, it will | 
|  | // instead return HWY_STATIC_TARGET (there must always be one target to call). | 
|  | // | 
|  | // This function is useful for disabling targets known to be buggy, or if the | 
|  | // best available target is undesirable (perhaps due to throttling or memory | 
|  | // bandwidth limitations). Use SetSupportedTargetsForTest instead of this | 
|  | // function for iteratively enabling specific targets for testing. | 
|  | HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets); | 
|  |  | 
|  | // Subsequent SupportedTargets will return the given set of targets, except | 
|  | // those disabled via DisableTargets. Call with a mask of 0 to disable the mock | 
|  | // and return to the normal SupportedTargets behavior. Used to run tests for | 
|  | // all targets. | 
|  | HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets); | 
|  |  | 
|  | #ifndef HWY_NO_LIBCXX | 
|  |  | 
|  | // Return the list of targets in HWY_TARGETS supported by the CPU as a list of | 
|  | // individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list | 
|  | // is affected by the current SetSupportedTargetsForTest() mock if any. | 
|  | HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() { | 
|  | std::vector<int64_t> ret; | 
|  | for (int64_t targets = SupportedTargets() & HWY_TARGETS; targets != 0; | 
|  | targets = targets & (targets - 1)) { | 
|  | int64_t current_target = targets & ~(targets - 1); | 
|  | ret.push_back(current_target); | 
|  | } | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | #endif  // HWY_NO_LIBCXX | 
|  |  | 
|  | static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) { | 
|  | switch (target) { | 
|  | #if HWY_ARCH_X86 | 
|  | case HWY_SSE2: | 
|  | return "SSE2"; | 
|  | case HWY_SSSE3: | 
|  | return "SSSE3"; | 
|  | case HWY_SSE4: | 
|  | return "SSE4"; | 
|  | case HWY_AVX2: | 
|  | return "AVX2"; | 
|  | case HWY_AVX3: | 
|  | return "AVX3"; | 
|  | case HWY_AVX3_DL: | 
|  | return "AVX3_DL"; | 
|  | case HWY_AVX3_ZEN4: | 
|  | return "AVX3_ZEN4"; | 
|  | case HWY_AVX10_2: | 
|  | return "AVX10_2"; | 
|  | case HWY_AVX3_SPR: | 
|  | return "AVX3_SPR"; | 
|  | case HWY_AVX10_2_512: | 
|  | return "AVX10_2_512"; | 
|  | #endif | 
|  |  | 
|  | #if HWY_ARCH_ARM | 
|  | case HWY_SVE2_128: | 
|  | return "SVE2_128"; | 
|  | case HWY_SVE_256: | 
|  | return "SVE_256"; | 
|  | case HWY_SVE2: | 
|  | return "SVE2"; | 
|  | case HWY_SVE: | 
|  | return "SVE"; | 
|  | case HWY_NEON_BF16: | 
|  | return "NEON_BF16"; | 
|  | case HWY_NEON: | 
|  | return "NEON"; | 
|  | case HWY_NEON_WITHOUT_AES: | 
|  | return "NEON_WITHOUT_AES"; | 
|  | #endif | 
|  |  | 
|  | #if HWY_ARCH_PPC | 
|  | case HWY_PPC8: | 
|  | return "PPC8"; | 
|  | case HWY_PPC9: | 
|  | return "PPC9"; | 
|  | case HWY_PPC10: | 
|  | return "PPC10"; | 
|  | #endif | 
|  |  | 
|  | #if HWY_ARCH_S390X | 
|  | case HWY_Z14: | 
|  | return "Z14"; | 
|  | case HWY_Z15: | 
|  | return "Z15"; | 
|  | #endif | 
|  |  | 
|  | #if HWY_ARCH_WASM | 
|  | case HWY_WASM: | 
|  | return "WASM"; | 
|  | case HWY_WASM_EMU256: | 
|  | return "WASM_EMU256"; | 
|  | #endif | 
|  |  | 
|  | #if HWY_ARCH_RISCV | 
|  | case HWY_RVV: | 
|  | return "RVV"; | 
|  | #endif | 
|  |  | 
|  | #if HWY_ARCH_LOONGARCH | 
|  | case HWY_LSX: | 
|  | return "LSX"; | 
|  | case HWY_LASX: | 
|  | return "LASX"; | 
|  | #endif | 
|  |  | 
|  | case HWY_EMU128: | 
|  | return "EMU128"; | 
|  | case HWY_SCALAR: | 
|  | return "SCALAR"; | 
|  |  | 
|  | default: | 
|  | return "Unknown";  // must satisfy gtest IsValidParamName() | 
|  | } | 
|  | } | 
|  |  | 
|  | // The maximum number of dynamic targets on any architecture is defined by | 
|  | // HWY_MAX_DYNAMIC_TARGETS and depends on the arch. | 
|  |  | 
|  | // For the ChosenTarget mask and index we use a different bit arrangement than | 
|  | // in the HWY_TARGETS mask. Only the targets involved in the current | 
|  | // architecture are used in this mask, and therefore only the least significant | 
|  | // (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the int64_t mask are used. The least | 
|  | // significant bit is set when the mask is not initialized, the next | 
|  | // HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the | 
|  | // HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to | 
|  | // that position and the next more significant bit is used for HWY_SCALAR (if | 
|  | // HWY_COMPILE_ONLY_SCALAR is defined) or HWY_EMU128. Because of this we need to | 
|  | // define equivalent values for HWY_TARGETS in this representation. | 
|  | // This mask representation allows using ctz() on the mask to obtain a small | 
|  | // number that is used as an index into the table for dynamic dispatch. In this | 
|  | // way the first entry is used when the mask is uninitialized, the following | 
|  | // HWY_MAX_DYNAMIC_TARGETS are for dynamic dispatch and the last one is for | 
|  | // scalar. | 
|  |  | 
|  | // The HWY_SCALAR/HWY_EMU128 (fallback) bit: the highest bit used in a ChosenTarget mask. | 
|  | #define HWY_CHOSEN_TARGET_MASK_SCALAR (1LL << (HWY_MAX_DYNAMIC_TARGETS + 1)) | 
|  |  | 
|  | // Converts a HWY_TARGETS-format mask X to the ChosenTarget format: the HWY_MAX_DYNAMIC_TARGETS | 
|  | // bits of X ending at HWY_HIGHEST_TARGET_BIT are moved to bits 1..HWY_MAX_DYNAMIC_TARGETS. | 
|  | #define HWY_CHOSEN_TARGET_SHIFT(X)                                    \ | 
|  | ((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \ | 
|  | ((1LL << HWY_MAX_DYNAMIC_TARGETS) - 1))                           \ | 
|  | << 1) | 
|  |  | 
|  | // HWY_TARGETS in the ChosenTarget format, plus the scalar bit and bit 0 (the "uninitialized" bit). | 
|  | #define HWY_CHOSEN_TARGET_MASK_TARGETS \ | 
|  | (HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1LL) | 
|  |  | 
|  | #if HWY_ARCH_X86 | 
|  | // Maximum number of dynamic targets. Changing this value is an ABI-incompatible | 
|  | // change. | 
|  | #define HWY_MAX_DYNAMIC_TARGETS 15 | 
|  | #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86 | 
|  | // These must match the order in which the HWY_TARGETS are defined, | 
|  | // starting at the least significant (HWY_HIGHEST_TARGET_BIT + 1 - | 
|  | // HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly | 
|  | // HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry | 
|  | // corresponds to the best target. Don't include a "," at the end of the list. | 
|  | #define HWY_CHOOSE_TARGET_LIST(func_name)                       \ | 
|  | nullptr,                               /* reserved */         \ | 
|  | nullptr,                           /* reserved */         \ | 
|  | nullptr,                           /* reserved */         \ | 
|  | HWY_CHOOSE_AVX10_2_512(func_name), /* AVX10_2_512 */      \ | 
|  | HWY_CHOOSE_AVX3_SPR(func_name),    /* AVX3_SPR */         \ | 
|  | HWY_CHOOSE_AVX10_2(func_name),     /* AVX10_2 */          \ | 
|  | HWY_CHOOSE_AVX3_ZEN4(func_name),   /* AVX3_ZEN4 */        \ | 
|  | HWY_CHOOSE_AVX3_DL(func_name),     /* AVX3_DL */          \ | 
|  | HWY_CHOOSE_AVX3(func_name),        /* AVX3 */             \ | 
|  | HWY_CHOOSE_AVX2(func_name),        /* AVX2 */             \ | 
|  | nullptr,                           /* AVX */              \ | 
|  | HWY_CHOOSE_SSE4(func_name),        /* SSE4 */             \ | 
|  | HWY_CHOOSE_SSSE3(func_name),       /* SSSE3 */            \ | 
|  | nullptr,                           /* reserved - SSE3? */ \ | 
|  | HWY_CHOOSE_SSE2(func_name)         /* SSE2 */ | 
|  |  | 
|  | #elif HWY_ARCH_ARM | 
|  | // See HWY_ARCH_X86 above for details. | 
|  | #define HWY_MAX_DYNAMIC_TARGETS 15 | 
|  | #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM | 
|  | #define HWY_CHOOSE_TARGET_LIST(func_name)                              \ | 
|  | nullptr,                                   /* reserved */            \ | 
|  | nullptr,                               /* reserved */            \ | 
|  | nullptr,                               /* reserved */            \ | 
|  | HWY_CHOOSE_SVE2_128(func_name),        /* SVE2 128-bit */        \ | 
|  | HWY_CHOOSE_SVE_256(func_name),         /* SVE 256-bit */         \ | 
|  | nullptr,                               /* reserved */            \ | 
|  | nullptr,                               /* reserved */            \ | 
|  | nullptr,                               /* reserved */            \ | 
|  | HWY_CHOOSE_SVE2(func_name),            /* SVE2 */                \ | 
|  | HWY_CHOOSE_SVE(func_name),             /* SVE */                 \ | 
|  | nullptr,                               /* reserved */            \ | 
|  | HWY_CHOOSE_NEON_BF16(func_name),       /* NEON + f16/dot/bf16 */ \ | 
|  | nullptr,                               /* reserved */            \ | 
|  | HWY_CHOOSE_NEON(func_name),            /* NEON */                \ | 
|  | HWY_CHOOSE_NEON_WITHOUT_AES(func_name) /* NEON without AES */ | 
|  |  | 
|  | #elif HWY_ARCH_RISCV | 
|  | // See HWY_ARCH_X86 above for details. | 
|  | #define HWY_MAX_DYNAMIC_TARGETS 9 | 
|  | #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV | 
|  | #define HWY_CHOOSE_TARGET_LIST(func_name)       \ | 
|  | nullptr,                       /* reserved */ \ | 
|  | nullptr,                   /* reserved */ \ | 
|  | nullptr,                   /* reserved */ \ | 
|  | nullptr,                   /* reserved */ \ | 
|  | nullptr,                   /* reserved */ \ | 
|  | nullptr,                   /* reserved */ \ | 
|  | nullptr,                   /* reserved */ \ | 
|  | HWY_CHOOSE_RVV(func_name), /* RVV */      \ | 
|  | nullptr                    /* reserved */ | 
|  |  | 
|  | #elif HWY_ARCH_PPC || HWY_ARCH_S390X | 
|  | // See HWY_ARCH_X86 above for details. | 
|  | #define HWY_MAX_DYNAMIC_TARGETS 9 | 
|  | #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC | 
|  | #define HWY_CHOOSE_TARGET_LIST(func_name)         \ | 
|  | nullptr,                         /* reserved */ \ | 
|  | nullptr,                     /* reserved */ \ | 
|  | nullptr,                     /* reserved */ \ | 
|  | nullptr,                     /* reserved */ \ | 
|  | HWY_CHOOSE_PPC10(func_name), /* PPC10 */    \ | 
|  | HWY_CHOOSE_PPC9(func_name),  /* PPC9 */     \ | 
|  | HWY_CHOOSE_PPC8(func_name),  /* PPC8 */     \ | 
|  | HWY_CHOOSE_Z15(func_name),   /* Z15 */      \ | 
|  | HWY_CHOOSE_Z14(func_name)    /* Z14 */ | 
|  |  | 
|  | #elif HWY_ARCH_WASM | 
|  | // See HWY_ARCH_X86 above for details. | 
|  | #define HWY_MAX_DYNAMIC_TARGETS 9 | 
|  | #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM | 
|  | #define HWY_CHOOSE_TARGET_LIST(func_name)                  \ | 
|  | nullptr,                               /* reserved */    \ | 
|  | nullptr,                           /* reserved */    \ | 
|  | nullptr,                           /* reserved */    \ | 
|  | nullptr,                           /* reserved */    \ | 
|  | nullptr,                           /* reserved */    \ | 
|  | nullptr,                           /* reserved */    \ | 
|  | HWY_CHOOSE_WASM_EMU256(func_name), /* WASM_EMU256 */ \ | 
|  | HWY_CHOOSE_WASM(func_name),        /* WASM */        \ | 
|  | nullptr                            /* reserved */ | 
|  |  | 
|  | #elif HWY_ARCH_LOONGARCH | 
|  | #define HWY_MAX_DYNAMIC_TARGETS 3 | 
|  | #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_LOONGARCH | 
|  | #define HWY_CHOOSE_TARGET_LIST(func_name)        \ | 
|  | nullptr,                        /* reserved */ \ | 
|  | HWY_CHOOSE_LASX(func_name), /* LASX */     \ | 
|  | HWY_CHOOSE_LSX(func_name)   /* LSX */ | 
|  |  | 
|  | #else | 
|  | // Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though | 
|  | // still creating single-entry tables in HWY_EXPORT to ensure portability. | 
|  | #define HWY_MAX_DYNAMIC_TARGETS 1 | 
|  | #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR | 
|  | #endif | 
|  |  | 
|  | // Bitfield of supported and enabled targets. The format differs from that of | 
|  | // HWY_TARGETS; the lowest bit governs the first function pointer (which is | 
|  | // special in that it calls FunctionCache, then Update, then dispatches to the | 
|  | // actual implementation) in the tables created by HWY_EXPORT. Monostate (see | 
|  | // GetChosenTarget), thread-safe except on RVV. | 
|  | struct ChosenTarget { | 
|  | public: | 
|  | // Reset bits according to `targets` (typically the return value of | 
|  | // SupportedTargets()). Postcondition: IsInitialized() == true. | 
|  | void Update(int64_t targets) { | 
|  | // These are `targets` shifted downwards, see above. Also include SCALAR | 
|  | // (corresponds to the last entry in the function table) as fallback. | 
|  | StoreMask(HWY_CHOSEN_TARGET_SHIFT(targets) | HWY_CHOSEN_TARGET_MASK_SCALAR); | 
|  | } | 
|  |  | 
|  | // Reset to the uninitialized state, so that FunctionCache will call Update | 
|  | // during the next HWY_DYNAMIC_DISPATCH, and IsInitialized returns false. | 
|  | void DeInit() { StoreMask(1); } | 
|  |  | 
|  | // Whether Update was called. This indicates whether any HWY_DYNAMIC_DISPATCH | 
|  | // function was called, which we check in tests. | 
|  | bool IsInitialized() const { return LoadMask() != 1; } | 
|  |  | 
|  | // Return the index in the dynamic dispatch table to be used by the current | 
|  | // CPU. Note that this method must be in the header file so it uses the value | 
|  | // of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that | 
|  | // calls it, which may be different from others. This means we only enable | 
|  | // those targets that were actually compiled in this module. | 
|  | size_t HWY_INLINE GetIndex() const { | 
|  | return hwy::Num0BitsBelowLS1Bit_Nonzero64( | 
|  | static_cast<uint64_t>(LoadMask() & HWY_CHOSEN_TARGET_MASK_TARGETS)); | 
|  | } | 
|  |  | 
|  | private: | 
|  | #if defined(HWY_NO_LIBCXX) | 
|  | int64_t LoadMask() const { return mask_; } | 
|  | void StoreMask(int64_t mask) { mask_ = mask; } | 
|  |  | 
|  | int64_t mask_{1};  // Initialized to 1 so GetIndex() returns 0. | 
|  | #else | 
|  | int64_t LoadMask() const { return mask_.load(); } | 
|  | void StoreMask(int64_t mask) { mask_.store(mask); } | 
|  |  | 
|  | std::atomic<int64_t> mask_{1};  // Initialized to 1 so GetIndex() returns 0. | 
|  | #endif  // HWY_ARCH_RISCV | 
|  | }; | 
|  |  | 
|  | // For internal use (e.g. by FunctionCache and DisableTargets). | 
|  | HWY_DLLEXPORT ChosenTarget& GetChosenTarget(); | 
|  |  | 
|  | }  // namespace hwy | 
|  |  | 
|  | #endif  // HIGHWAY_HWY_TARGETS_H_ |