diff options
Diffstat (limited to 'third_party/highway/hwy/targets.cc')
-rw-r--r-- | third_party/highway/hwy/targets.cc | 433 |
1 files changed, 433 insertions, 0 deletions
diff --git a/third_party/highway/hwy/targets.cc b/third_party/highway/hwy/targets.cc new file mode 100644 index 0000000000..dc4217c8fe --- /dev/null +++ b/third_party/highway/hwy/targets.cc @@ -0,0 +1,433 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/targets.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS // before inttypes.h +#endif +#include <inttypes.h> // PRIx64 +#include <stdarg.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> + +#include <atomic> + +#include "hwy/per_target.h" // VectorBytes + +#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN +#include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace +#endif + +#include <stdlib.h> // abort / exit + +#if HWY_ARCH_X86 +#include <xmmintrin.h> +#if HWY_COMPILER_MSVC +#include <intrin.h> +#else // !HWY_COMPILER_MSVC +#include <cpuid.h> +#endif // HWY_COMPILER_MSVC + +#elif HWY_ARCH_ARM && HWY_OS_LINUX && !defined(TOOLCHAIN_MISS_SYS_AUXV_H) +#include <sys/auxv.h> +#endif // HWY_ARCH_* + +namespace hwy { +namespace { + +#if HWY_ARCH_X86 + +HWY_INLINE bool IsBitSet(const uint32_t reg, const int index) { + return (reg & (1U << index)) != 0; +} + +// Calls CPUID instruction with eax=level and ecx=count and returns the result +// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd). +HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count, + uint32_t* HWY_RESTRICT abcd) { +#if HWY_COMPILER_MSVC + int regs[4]; + __cpuidex(regs, level, count); + for (int i = 0; i < 4; ++i) { + abcd[i] = regs[i]; + } +#else // HWY_COMPILER_MSVC + uint32_t a; + uint32_t b; + uint32_t c; + uint32_t d; + __cpuid_count(level, count, a, b, c, d); + abcd[0] = a; + abcd[1] = b; + abcd[2] = c; + abcd[3] = d; +#endif // HWY_COMPILER_MSVC +} + +// Returns the lower 32 bits of extended control register 0. +// Requires CPU support for "OSXSAVE" (see below). +uint32_t ReadXCR0() { +#if HWY_COMPILER_MSVC + return static_cast<uint32_t>(_xgetbv(0)); +#else // HWY_COMPILER_MSVC + uint32_t xcr0, xcr0_high; + const uint32_t index = 0; + asm volatile(".byte 0x0F, 0x01, 0xD0" + : "=a"(xcr0), "=d"(xcr0_high) + : "c"(index)); + return xcr0; +#endif // HWY_COMPILER_MSVC +} + +#endif // HWY_ARCH_X86 + +// When running tests, this value can be set to the mocked supported targets +// mask. Only written to from a single thread before the test starts. +int64_t supported_targets_for_test_ = 0; + +// Mask of targets disabled at runtime with DisableTargets. +int64_t supported_mask_ = LimitsMax<int64_t>(); + +#if HWY_ARCH_X86 +// Arbitrary bit indices indicating which instruction set extensions are +// supported. Use enum to ensure values are distinct. +enum class FeatureIndex : uint32_t { + kSSE = 0, + kSSE2, + kSSE3, + kSSSE3, + + kSSE41, + kSSE42, + kCLMUL, + kAES, + + kAVX, + kAVX2, + kF16C, + kFMA, + kLZCNT, + kBMI, + kBMI2, + + kAVX512F, + kAVX512VL, + kAVX512DQ, + kAVX512BW, + + kVNNI, + kVPCLMULQDQ, + kVBMI, + kVBMI2, + kVAES, + kPOPCNTDQ, + kBITALG, + + kSentinel +}; +static_assert(static_cast<size_t>(FeatureIndex::kSentinel) < 64, + "Too many bits for u64"); + +HWY_INLINE constexpr uint64_t Bit(FeatureIndex index) { + return 1ull << static_cast<size_t>(index); +} + +constexpr uint64_t kGroupSSSE3 = + Bit(FeatureIndex::kSSE) | Bit(FeatureIndex::kSSE2) | + Bit(FeatureIndex::kSSE3) | Bit(FeatureIndex::kSSSE3); + +constexpr uint64_t kGroupSSE4 = + Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) | + Bit(FeatureIndex::kCLMUL) | Bit(FeatureIndex::kAES) | kGroupSSSE3; + +// We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to +// use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them +// [https://www.virtualbox.org/ticket/15471]. Thus we provide the option of +// avoiding using and requiring these so AVX2 can still be used. +#ifdef HWY_DISABLE_BMI2_FMA +constexpr uint64_t kGroupBMI2_FMA = 0; +#else +constexpr uint64_t kGroupBMI2_FMA = Bit(FeatureIndex::kBMI) | + Bit(FeatureIndex::kBMI2) | + Bit(FeatureIndex::kFMA); +#endif + +#ifdef HWY_DISABLE_F16C +constexpr uint64_t kGroupF16C = 0; +#else +constexpr uint64_t kGroupF16C = Bit(FeatureIndex::kF16C); +#endif + +constexpr uint64_t kGroupAVX2 = + Bit(FeatureIndex::kAVX) | Bit(FeatureIndex::kAVX2) | + Bit(FeatureIndex::kLZCNT) | kGroupBMI2_FMA | kGroupF16C | kGroupSSE4; + +constexpr uint64_t kGroupAVX3 = + Bit(FeatureIndex::kAVX512F) | Bit(FeatureIndex::kAVX512VL) | + Bit(FeatureIndex::kAVX512DQ) | Bit(FeatureIndex::kAVX512BW) | kGroupAVX2; + +constexpr uint64_t kGroupAVX3_DL = + Bit(FeatureIndex::kVNNI) | Bit(FeatureIndex::kVPCLMULQDQ) | + Bit(FeatureIndex::kVBMI) | Bit(FeatureIndex::kVBMI2) | + Bit(FeatureIndex::kVAES) | Bit(FeatureIndex::kPOPCNTDQ) | + Bit(FeatureIndex::kBITALG) | kGroupAVX3; + +#endif // HWY_ARCH_X86 + +// Returns targets supported by the CPU, independently of DisableTargets. +// Factored out of SupportedTargets to make its structure more obvious. Note +// that x86 CPUID may take several hundred cycles. +int64_t DetectTargets() { + // Apps will use only one of these (the default is EMU128), but compile flags + // for this TU may differ from that of the app, so allow both. + int64_t bits = HWY_SCALAR | HWY_EMU128; + +#if HWY_ARCH_X86 + bool has_osxsave = false; + { // ensures we do not accidentally use flags outside this block + uint64_t flags = 0; + uint32_t abcd[4]; + + Cpuid(0, 0, abcd); + const uint32_t max_level = abcd[0]; + + // Standard feature flags + Cpuid(1, 0, abcd); + flags |= IsBitSet(abcd[3], 25) ? Bit(FeatureIndex::kSSE) : 0; + flags |= IsBitSet(abcd[3], 26) ? Bit(FeatureIndex::kSSE2) : 0; + flags |= IsBitSet(abcd[2], 0) ? Bit(FeatureIndex::kSSE3) : 0; + flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kCLMUL) : 0; + flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kSSSE3) : 0; + flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kFMA) : 0; + flags |= IsBitSet(abcd[2], 19) ? Bit(FeatureIndex::kSSE41) : 0; + flags |= IsBitSet(abcd[2], 20) ? Bit(FeatureIndex::kSSE42) : 0; + flags |= IsBitSet(abcd[2], 25) ? Bit(FeatureIndex::kAES) : 0; + flags |= IsBitSet(abcd[2], 28) ? Bit(FeatureIndex::kAVX) : 0; + flags |= IsBitSet(abcd[2], 29) ? Bit(FeatureIndex::kF16C) : 0; + has_osxsave = IsBitSet(abcd[2], 27); + + // Extended feature flags + Cpuid(0x80000001U, 0, abcd); + flags |= IsBitSet(abcd[2], 5) ? Bit(FeatureIndex::kLZCNT) : 0; + + // Extended features + if (max_level >= 7) { + Cpuid(7, 0, abcd); + flags |= IsBitSet(abcd[1], 3) ? Bit(FeatureIndex::kBMI) : 0; + flags |= IsBitSet(abcd[1], 5) ? Bit(FeatureIndex::kAVX2) : 0; + flags |= IsBitSet(abcd[1], 8) ? Bit(FeatureIndex::kBMI2) : 0; + + flags |= IsBitSet(abcd[1], 16) ? Bit(FeatureIndex::kAVX512F) : 0; + flags |= IsBitSet(abcd[1], 17) ? Bit(FeatureIndex::kAVX512DQ) : 0; + flags |= IsBitSet(abcd[1], 30) ? Bit(FeatureIndex::kAVX512BW) : 0; + flags |= IsBitSet(abcd[1], 31) ? Bit(FeatureIndex::kAVX512VL) : 0; + + flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kVBMI) : 0; + flags |= IsBitSet(abcd[2], 6) ? Bit(FeatureIndex::kVBMI2) : 0; + flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kVAES) : 0; + flags |= IsBitSet(abcd[2], 10) ? Bit(FeatureIndex::kVPCLMULQDQ) : 0; + flags |= IsBitSet(abcd[2], 11) ? Bit(FeatureIndex::kVNNI) : 0; + flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kBITALG) : 0; + flags |= IsBitSet(abcd[2], 14) ? Bit(FeatureIndex::kPOPCNTDQ) : 0; + } + + // Set target bit(s) if all their group's flags are all set. + if ((flags & kGroupAVX3_DL) == kGroupAVX3_DL) { + bits |= HWY_AVX3_DL; + } + if ((flags & kGroupAVX3) == kGroupAVX3) { + bits |= HWY_AVX3; + } + if ((flags & kGroupAVX2) == kGroupAVX2) { + bits |= HWY_AVX2; + } + if ((flags & kGroupSSE4) == kGroupSSE4) { + bits |= HWY_SSE4; + } + if ((flags & kGroupSSSE3) == kGroupSSSE3) { + bits |= HWY_SSSE3; + } + } + + // Clear bits if the OS does not support XSAVE - otherwise, registers + // are not preserved across context switches. + if (has_osxsave) { + const uint32_t xcr0 = ReadXCR0(); + const int64_t min_avx3 = HWY_AVX3 | HWY_AVX3_DL; + const int64_t min_avx2 = HWY_AVX2 | min_avx3; + // XMM + if (!IsBitSet(xcr0, 1)) { + bits &= ~(HWY_SSSE3 | HWY_SSE4 | min_avx2); + } + // YMM + if (!IsBitSet(xcr0, 2)) { + bits &= ~min_avx2; + } + // opmask, ZMM lo/hi + if (!IsBitSet(xcr0, 5) || !IsBitSet(xcr0, 6) || !IsBitSet(xcr0, 7)) { + bits &= ~min_avx3; + } + } + + if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) { + fprintf(stderr, + "WARNING: CPU supports %" PRIx64 " but software requires %" PRIx64 + "\n", + bits, static_cast<int64_t>(HWY_ENABLED_BASELINE)); + } + +#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH + using CapBits = unsigned long; // NOLINT + const CapBits hw = getauxval(AT_HWCAP); + (void)hw; + +#if HWY_ARCH_ARM_A64 + +#if defined(HWCAP_AES) + // aarch64 always has NEON and VFPv4, but not necessarily AES, which we + // require and thus must still check for. + if (hw & HWCAP_AES) { + bits |= HWY_NEON; + } +#endif // HWCAP_AES + +#if defined(HWCAP_SVE) + if (hw & HWCAP_SVE) { + bits |= HWY_SVE; + } +#endif + +#if defined(HWCAP2_SVE2) && defined(HWCAP2_SVEAES) + const CapBits hw2 = getauxval(AT_HWCAP2); + if ((hw2 & HWCAP2_SVE2) && (hw2 & HWCAP2_SVEAES)) { + bits |= HWY_SVE2; + } +#endif + +#else // HWY_ARCH_ARM_A64 + +// Some old auxv.h / hwcap.h do not define these. If not, treat as unsupported. +// Note that AES has a different HWCAP bit compared to aarch64. +#if defined(HWCAP_NEON) && defined(HWCAP_VFPv4) + if ((hw & HWCAP_NEON) && (hw & HWCAP_VFPv4)) { + bits |= HWY_NEON; + } +#endif + +#endif // HWY_ARCH_ARM_A64 + if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) { + fprintf(stderr, + "WARNING: CPU supports %" PRIx64 " but software requires %" PRIx64 + "\n", + bits, static_cast<int64_t>(HWY_ENABLED_BASELINE)); + } +#else // HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH + // TODO(janwas): detect for other platforms and check for baseline + // This file is typically compiled without HWY_IS_TEST, but targets_test has + // it set, and will expect all of its HWY_TARGETS (= all attainable) to be + // supported. + bits |= HWY_ENABLED_BASELINE; +#endif // HWY_ARCH_X86 + + return bits; +} + +} // namespace + +HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) + Abort(const char* file, int line, const char* format, ...) { + char buf[2000]; + va_list args; + va_start(args, format); + vsnprintf(buf, sizeof(buf), format, args); + va_end(args); + + fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf); + +// If compiled with any sanitizer, they can also print a stack trace. +#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN + __sanitizer_print_stack_trace(); +#endif // HWY_IS_* + fflush(stderr); + +// Now terminate the program: +#if HWY_ARCH_RVV + exit(1); // trap/abort just freeze Spike. +#elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC + // Facilitates breaking into a debugger, but don't use this in non-debug + // builds because it looks like "illegal instruction", which is misleading. + __builtin_trap(); +#else + abort(); // Compile error without this due to HWY_NORETURN. +#endif +} + +HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) { + supported_mask_ = static_cast<int64_t>(~disabled_targets); + // This will take effect on the next call to SupportedTargets, which is + // called right before GetChosenTarget::Update. However, calling Update here + // would make it appear that HWY_DYNAMIC_DISPATCH was called, which we want + // to check in tests. We instead de-initialize such that the next + // HWY_DYNAMIC_DISPATCH calls GetChosenTarget::Update via FunctionCache. + GetChosenTarget().DeInit(); +} + +HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) { + supported_targets_for_test_ = targets; + GetChosenTarget().DeInit(); // see comment above +} + +HWY_DLLEXPORT int64_t SupportedTargets() { + int64_t targets = supported_targets_for_test_; + if (HWY_LIKELY(targets == 0)) { + // Mock not active. Re-detect instead of caching just in case we're on a + // heterogeneous ISA (also requires some app support to pin threads). This + // is only reached on the first HWY_DYNAMIC_DISPATCH or after each call to + // DisableTargets or SetSupportedTargetsForTest. + targets = DetectTargets(); + + // VectorBytes invokes HWY_DYNAMIC_DISPATCH. To prevent infinite recursion, + // first set up ChosenTarget. No need to Update() again afterwards with the + // final targets - that will be done by a caller of this function. + GetChosenTarget().Update(targets); + + // Now that we can call VectorBytes, check for targets with specific sizes. + if (HWY_ARCH_ARM_A64) { + const size_t vec_bytes = VectorBytes(); // uncached, see declaration + if ((targets & HWY_SVE) && vec_bytes == 32) { + targets = static_cast<int64_t>(targets | HWY_SVE_256); + } else { + targets = static_cast<int64_t>(targets & ~HWY_SVE_256); + } + if ((targets & HWY_SVE2) && vec_bytes == 16) { + targets = static_cast<int64_t>(targets | HWY_SVE2_128); + } else { + targets = static_cast<int64_t>(targets & ~HWY_SVE2_128); + } + } // HWY_ARCH_ARM_A64 + } + + targets &= supported_mask_; + return targets == 0 ? HWY_STATIC_TARGET : targets; +} + +HWY_DLLEXPORT ChosenTarget& GetChosenTarget() { + static ChosenTarget chosen_target; + return chosen_target; +} + +} // namespace hwy |