summaryrefslogtreecommitdiffstats
path: root/third_party/highway/hwy/targets.cc
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/highway/hwy/targets.cc')
-rw-r--r--third_party/highway/hwy/targets.cc433
1 files changed, 433 insertions, 0 deletions
diff --git a/third_party/highway/hwy/targets.cc b/third_party/highway/hwy/targets.cc
new file mode 100644
index 0000000000..dc4217c8fe
--- /dev/null
+++ b/third_party/highway/hwy/targets.cc
@@ -0,0 +1,433 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/targets.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS // before inttypes.h
+#endif
+#include <inttypes.h> // PRIx64
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <atomic>
+
+#include "hwy/per_target.h" // VectorBytes
+
+#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
+#include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace
+#endif
+
+#include <stdlib.h> // abort / exit
+
+#if HWY_ARCH_X86
+#include <xmmintrin.h>
+#if HWY_COMPILER_MSVC
+#include <intrin.h>
+#else // !HWY_COMPILER_MSVC
+#include <cpuid.h>
+#endif // HWY_COMPILER_MSVC
+
+#elif HWY_ARCH_ARM && HWY_OS_LINUX && !defined(TOOLCHAIN_MISS_SYS_AUXV_H)
+#include <sys/auxv.h>
+#endif // HWY_ARCH_*
+
+namespace hwy {
+namespace {
+
+#if HWY_ARCH_X86
+
+// Returns whether bit number `index` (0 = least significant) of `reg` is set.
+// `index` must be in [0, 32) because `reg` is shifted by it.
+HWY_INLINE bool IsBitSet(const uint32_t reg, const int index) {
+  const uint32_t shifted = reg >> index;
+  return (shifted & 1U) != 0;
+}
+
+// Executes the CPUID instruction with eax=level and ecx=count, then stores the
+// four result registers into abcd in the order {eax, ebx, ecx, edx} (hence the
+// name abcd). abcd must point to at least four elements.
+HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count,
+                      uint32_t* HWY_RESTRICT abcd) {
+#if HWY_COMPILER_MSVC
+  // The MSVC intrinsic writes into an array of int; copy each register into
+  // the caller's unsigned output.
+  int regs[4];
+  __cpuidex(regs, level, count);
+  abcd[0] = regs[0];
+  abcd[1] = regs[1];
+  abcd[2] = regs[2];
+  abcd[3] = regs[3];
+#else   // HWY_COMPILER_MSVC
+  // __cpuid_count (from cpuid.h) writes its results into four separate
+  // variables, which are then copied to the output array.
+  uint32_t out_a;
+  uint32_t out_b;
+  uint32_t out_c;
+  uint32_t out_d;
+  __cpuid_count(level, count, out_a, out_b, out_c, out_d);
+  abcd[0] = out_a;
+  abcd[1] = out_b;
+  abcd[2] = out_c;
+  abcd[3] = out_d;
+#endif  // HWY_COMPILER_MSVC
+}
+
+// Returns the lower 32 bits of extended control register 0 (XCR0).
+// Requires CPU support for "OSXSAVE": DetectTargets only calls this after
+// checking the corresponding CPUID bit.
+uint32_t ReadXCR0() {
+#if HWY_COMPILER_MSVC
+ // The intrinsic returns a wider value; only the low 32 bits are kept.
+ return static_cast<uint32_t>(_xgetbv(0));
+#else // HWY_COMPILER_MSVC
+ uint32_t xcr0, xcr0_high;
+ // Selects which extended control register to read; passed in ecx.
+ const uint32_t index = 0;
+ // Raw opcode bytes of XGETBV (presumably spelled out so that assemblers
+ // lacking the mnemonic can still build this). The result arrives in
+ // edx:eax; the upper half (edx) is received only to inform the compiler
+ // that the register is written, and is otherwise unused.
+ asm volatile(".byte 0x0F, 0x01, 0xD0"
+ : "=a"(xcr0), "=d"(xcr0_high)
+ : "c"(index));
+ return xcr0;
+#endif // HWY_COMPILER_MSVC
+}
+
+#endif // HWY_ARCH_X86
+
+// Test-only override of the supported targets, set via
+// SetSupportedTargetsForTest. Zero means no mock is active, in which case
+// SupportedTargets detects the actual CPU capabilities. Only written to from
+// a single thread before the test starts.
+int64_t supported_targets_for_test_{0};
+
+// Mask of targets that remain enabled at runtime: DisableTargets stores the
+// complement of the disabled targets here, and SupportedTargets ANDs its
+// result with this value. Starts at the maximum int64_t value so that nothing
+// is masked out until DisableTargets is called.
+int64_t supported_mask_{LimitsMax<int64_t>()};
+
+#if HWY_ARCH_X86
+// Arbitrary bit indices indicating which instruction set extensions are
+// supported. Use enum to ensure values are distinct. Each enumerator is turned
+// into a single-bit mask by Bit(); the blank-line-separated groups below
+// mirror the kGroup* constants that combine them per target.
+enum class FeatureIndex : uint32_t {
+ // Required by kGroupSSSE3.
+ kSSE = 0,
+ kSSE2,
+ kSSE3,
+ kSSSE3,
+
+ // Additionally required by kGroupSSE4.
+ kSSE41,
+ kSSE42,
+ kCLMUL,
+ kAES,
+
+ // Additionally required by kGroupAVX2 (F16C, FMA, BMI and BMI2 can be
+ // excluded from the requirement at compile time).
+ kAVX,
+ kAVX2,
+ kF16C,
+ kFMA,
+ kLZCNT,
+ kBMI,
+ kBMI2,
+
+ // Additionally required by kGroupAVX3.
+ kAVX512F,
+ kAVX512VL,
+ kAVX512DQ,
+ kAVX512BW,
+
+ // Additionally required by kGroupAVX3_DL.
+ kVNNI,
+ kVPCLMULQDQ,
+ kVBMI,
+ kVBMI2,
+ kVAES,
+ kPOPCNTDQ,
+ kBITALG,
+
+ // Not a feature: number of enumerators above, used for the bound check.
+ kSentinel
+};
+// Bit() shifts a 64-bit one by the index, so the indices must fit in a u64.
+static_assert(static_cast<size_t>(FeatureIndex::kSentinel) < 64,
+ "Too many bits for u64");
+
+// Converts a feature index into its single-bit mask within a 64-bit flag set.
+HWY_INLINE constexpr uint64_t Bit(FeatureIndex index) {
+  return static_cast<uint64_t>(1) << static_cast<size_t>(index);
+}
+
+// CPU features that must all be present before HWY_SSSE3 is reported.
+constexpr uint64_t kGroupSSSE3 =
+    Bit(FeatureIndex::kSSSE3) | Bit(FeatureIndex::kSSE3) |
+    Bit(FeatureIndex::kSSE2) | Bit(FeatureIndex::kSSE);
+
+// HWY_SSE4 requires everything in kGroupSSSE3 plus SSE4.1, SSE4.2, CLMUL and
+// AES.
+constexpr uint64_t kGroupSSE4 =
+    kGroupSSSE3 | Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) |
+    Bit(FeatureIndex::kCLMUL) | Bit(FeatureIndex::kAES);
+
+// By default, BMI/BMI2/FMA are required whenever AVX2 is, which allows using
+// BZHI and (compiler-generated) MULX. VirtualBox lacks them
+// [https://www.virtualbox.org/ticket/15471], so defining HWY_DISABLE_BMI2_FMA
+// drops both their use and the requirement, keeping AVX2 usable there.
+#ifndef HWY_DISABLE_BMI2_FMA
+constexpr uint64_t kGroupBMI2_FMA = Bit(FeatureIndex::kFMA) |
+                                    Bit(FeatureIndex::kBMI2) |
+                                    Bit(FeatureIndex::kBMI);
+#else
+constexpr uint64_t kGroupBMI2_FMA = 0;
+#endif
+
+// Likewise, defining HWY_DISABLE_F16C removes F16C from the AVX2 requirements.
+#ifndef HWY_DISABLE_F16C
+constexpr uint64_t kGroupF16C = Bit(FeatureIndex::kF16C);
+#else
+constexpr uint64_t kGroupF16C = 0;
+#endif
+
+// HWY_AVX2 requires everything in kGroupSSE4 plus AVX, AVX2 and LZCNT, as well
+// as the optional BMI2/FMA and F16C groups (which may be empty).
+constexpr uint64_t kGroupAVX2 =
+    kGroupSSE4 | kGroupF16C | kGroupBMI2_FMA | Bit(FeatureIndex::kLZCNT) |
+    Bit(FeatureIndex::kAVX2) | Bit(FeatureIndex::kAVX);
+
+// HWY_AVX3 requires everything in kGroupAVX2 plus AVX-512 F, VL, DQ and BW.
+constexpr uint64_t kGroupAVX3 =
+    kGroupAVX2 | Bit(FeatureIndex::kAVX512BW) | Bit(FeatureIndex::kAVX512DQ) |
+    Bit(FeatureIndex::kAVX512VL) | Bit(FeatureIndex::kAVX512F);
+
+// HWY_AVX3_DL requires everything in kGroupAVX3 plus the extensions below.
+constexpr uint64_t kGroupAVX3_DL =
+    kGroupAVX3 | Bit(FeatureIndex::kBITALG) | Bit(FeatureIndex::kPOPCNTDQ) |
+    Bit(FeatureIndex::kVAES) | Bit(FeatureIndex::kVBMI2) |
+    Bit(FeatureIndex::kVBMI) | Bit(FeatureIndex::kVPCLMULQDQ) |
+    Bit(FeatureIndex::kVNNI);
+
+#endif // HWY_ARCH_X86
+
+// Returns targets supported by the CPU, independently of DisableTargets.
+// Factored out of SupportedTargets to make its structure more obvious. Note
+// that x86 CPUID may take several hundred cycles.
+//
+// The result is a bitfield of HWY_* target bits which always includes
+// HWY_SCALAR and HWY_EMU128:
+// - x86: a target is added if CPUID reports every feature in its kGroup*
+//   mask, then removed again if the OS has not enabled the register state it
+//   requires (as reported by OSXSAVE/XCR0).
+// - Arm with runtime dispatch: targets are derived from the getauxval
+//   hardware capability bits.
+// - Otherwise: HWY_ENABLED_BASELINE is assumed to be supported.
+// On x86 and Arm, a warning is printed to stderr if the detected targets do
+// not cover HWY_ENABLED_BASELINE.
+int64_t DetectTargets() {
+  // Apps will use only one of these (the default is EMU128), but compile flags
+  // for this TU may differ from that of the app, so allow both.
+  int64_t bits = HWY_SCALAR | HWY_EMU128;
+
+#if HWY_ARCH_X86
+  bool has_osxsave = false;
+  {  // ensures we do not accidentally use flags outside this block
+    uint64_t flags = 0;
+    uint32_t abcd[4];
+
+    // Leaf 0 reports the highest supported standard leaf in eax.
+    Cpuid(0, 0, abcd);
+    const uint32_t max_level = abcd[0];
+
+    // Standard feature flags (abcd[2] is ecx, abcd[3] is edx)
+    Cpuid(1, 0, abcd);
+    flags |= IsBitSet(abcd[3], 25) ? Bit(FeatureIndex::kSSE) : 0;
+    flags |= IsBitSet(abcd[3], 26) ? Bit(FeatureIndex::kSSE2) : 0;
+    flags |= IsBitSet(abcd[2], 0) ? Bit(FeatureIndex::kSSE3) : 0;
+    flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kCLMUL) : 0;
+    flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kSSSE3) : 0;
+    flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kFMA) : 0;
+    flags |= IsBitSet(abcd[2], 19) ? Bit(FeatureIndex::kSSE41) : 0;
+    flags |= IsBitSet(abcd[2], 20) ? Bit(FeatureIndex::kSSE42) : 0;
+    flags |= IsBitSet(abcd[2], 25) ? Bit(FeatureIndex::kAES) : 0;
+    flags |= IsBitSet(abcd[2], 28) ? Bit(FeatureIndex::kAVX) : 0;
+    flags |= IsBitSet(abcd[2], 29) ? Bit(FeatureIndex::kF16C) : 0;
+    has_osxsave = IsBitSet(abcd[2], 27);
+
+    // Extended feature flags
+    Cpuid(0x80000001U, 0, abcd);
+    flags |= IsBitSet(abcd[2], 5) ? Bit(FeatureIndex::kLZCNT) : 0;
+
+    // Extended features (abcd[1] is ebx); only valid if leaf 7 exists.
+    if (max_level >= 7) {
+      Cpuid(7, 0, abcd);
+      flags |= IsBitSet(abcd[1], 3) ? Bit(FeatureIndex::kBMI) : 0;
+      flags |= IsBitSet(abcd[1], 5) ? Bit(FeatureIndex::kAVX2) : 0;
+      flags |= IsBitSet(abcd[1], 8) ? Bit(FeatureIndex::kBMI2) : 0;
+
+      flags |= IsBitSet(abcd[1], 16) ? Bit(FeatureIndex::kAVX512F) : 0;
+      flags |= IsBitSet(abcd[1], 17) ? Bit(FeatureIndex::kAVX512DQ) : 0;
+      flags |= IsBitSet(abcd[1], 30) ? Bit(FeatureIndex::kAVX512BW) : 0;
+      flags |= IsBitSet(abcd[1], 31) ? Bit(FeatureIndex::kAVX512VL) : 0;
+
+      flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kVBMI) : 0;
+      flags |= IsBitSet(abcd[2], 6) ? Bit(FeatureIndex::kVBMI2) : 0;
+      flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kVAES) : 0;
+      flags |= IsBitSet(abcd[2], 10) ? Bit(FeatureIndex::kVPCLMULQDQ) : 0;
+      flags |= IsBitSet(abcd[2], 11) ? Bit(FeatureIndex::kVNNI) : 0;
+      flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kBITALG) : 0;
+      flags |= IsBitSet(abcd[2], 14) ? Bit(FeatureIndex::kPOPCNTDQ) : 0;
+    }
+
+    // Set target bit(s) if all their group's flags are all set.
+    if ((flags & kGroupAVX3_DL) == kGroupAVX3_DL) {
+      bits |= HWY_AVX3_DL;
+    }
+    if ((flags & kGroupAVX3) == kGroupAVX3) {
+      bits |= HWY_AVX3;
+    }
+    if ((flags & kGroupAVX2) == kGroupAVX2) {
+      bits |= HWY_AVX2;
+    }
+    if ((flags & kGroupSSE4) == kGroupSSE4) {
+      bits |= HWY_SSE4;
+    }
+    if ((flags & kGroupSSSE3) == kGroupSSSE3) {
+      bits |= HWY_SSSE3;
+    }
+  }
+
+  // Clear bits if the OS does not support XSAVE or has not enabled the
+  // required state components - otherwise, registers are not preserved across
+  // context switches.
+  const int64_t min_avx3 = HWY_AVX3 | HWY_AVX3_DL;
+  const int64_t min_avx2 = HWY_AVX2 | min_avx3;
+  if (has_osxsave) {
+    const uint32_t xcr0 = ReadXCR0();
+    // XMM
+    if (!IsBitSet(xcr0, 1)) {
+      bits &= ~(HWY_SSSE3 | HWY_SSE4 | min_avx2);
+    }
+    // YMM
+    if (!IsBitSet(xcr0, 2)) {
+      bits &= ~min_avx2;
+    }
+    // opmask, ZMM lo/hi
+    if (!IsBitSet(xcr0, 5) || !IsBitSet(xcr0, 6) || !IsBitSet(xcr0, 7)) {
+      bits &= ~min_avx3;
+    }
+  } else {
+    // Without OSXSAVE, XCR0 must not be read and the OS has not enabled
+    // XSAVE-managed state, so YMM/ZMM registers cannot be relied upon even if
+    // CPUID advertises AVX2/AVX-512. Only AVX2 and above are cleared here;
+    // SSSE3/SSE4 are left as detected.
+    bits &= ~min_avx2;
+  }
+
+  if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
+    fprintf(stderr,
+            "WARNING: CPU supports %" PRIx64 " but software requires %" PRIx64
+            "\n",
+            bits, static_cast<int64_t>(HWY_ENABLED_BASELINE));
+  }
+
+#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
+  using CapBits = unsigned long;  // NOLINT
+  const CapBits hw = getauxval(AT_HWCAP);
+  (void)hw;  // unused if none of the HWCAP_* macros below are defined
+
+#if HWY_ARCH_ARM_A64
+
+#if defined(HWCAP_AES)
+  // aarch64 always has NEON and VFPv4, but not necessarily AES, which we
+  // require and thus must still check for.
+  if (hw & HWCAP_AES) {
+    bits |= HWY_NEON;
+  }
+#endif  // HWCAP_AES
+
+#if defined(HWCAP_SVE)
+  if (hw & HWCAP_SVE) {
+    bits |= HWY_SVE;
+  }
+#endif
+
+#if defined(HWCAP2_SVE2) && defined(HWCAP2_SVEAES)
+  // SVE2 is only reported together with its AES extension.
+  const CapBits hw2 = getauxval(AT_HWCAP2);
+  if ((hw2 & HWCAP2_SVE2) && (hw2 & HWCAP2_SVEAES)) {
+    bits |= HWY_SVE2;
+  }
+#endif
+
+#else  // HWY_ARCH_ARM_A64
+
+// Some old auxv.h / hwcap.h do not define these. If not, treat as unsupported.
+// Note that AES has a different HWCAP bit compared to aarch64.
+#if defined(HWCAP_NEON) && defined(HWCAP_VFPv4)
+  if ((hw & HWCAP_NEON) && (hw & HWCAP_VFPv4)) {
+    bits |= HWY_NEON;
+  }
+#endif
+
+#endif  // HWY_ARCH_ARM_A64
+  if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
+    fprintf(stderr,
+            "WARNING: CPU supports %" PRIx64 " but software requires %" PRIx64
+            "\n",
+            bits, static_cast<int64_t>(HWY_ENABLED_BASELINE));
+  }
+#else   // HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
+  // TODO(janwas): detect for other platforms and check for baseline
+  // This file is typically compiled without HWY_IS_TEST, but targets_test has
+  // it set, and will expect all of its HWY_TARGETS (= all attainable) to be
+  // supported.
+  bits |= HWY_ENABLED_BASELINE;
+#endif  // HWY_ARCH_X86
+
+  return bits;
+}
+
+} // namespace
+
+// Prints "Abort at <file>:<line>: <message>" to stderr, where the message is
+// produced from the printf-style `format` and the variadic arguments, then
+// terminates the program. Never returns.
+HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
+    Abort(const char* file, int line, const char* format, ...) {
+  // Expand the message into a fixed-size buffer; vsnprintf truncates anything
+  // that does not fit.
+  char message[2000];
+  va_list varargs;
+  va_start(varargs, format);
+  vsnprintf(message, sizeof(message), format, varargs);
+  va_end(varargs);
+
+  fprintf(stderr, "Abort at %s:%d: %s\n", file, line, message);
+
+// Sanitizer builds can additionally show where the abort originated.
+#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
+  __sanitizer_print_stack_trace();
+#endif  // HWY_IS_*
+  fflush(stderr);
+
+// Finally, end the process in the way best suited to the build:
+#if HWY_ARCH_RVV
+  exit(1);  // trap/abort just freeze Spike.
+#elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC
+  // A trap makes it easy to break into a debugger. It is avoided in non-debug
+  // builds because it looks like "illegal instruction", which is misleading.
+  __builtin_trap();
+#else
+  abort();  // Compile error without this due to HWY_NORETURN.
+#endif
+}
+
+// Prevents SupportedTargets from returning any of the `disabled_targets` bits.
+// Each call replaces the previous set of disabled targets.
+HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) {
+  // Store the complement, i.e. the targets that may still be used.
+  const int64_t still_enabled = static_cast<int64_t>(~disabled_targets);
+  supported_mask_ = still_enabled;
+
+  // The new mask is picked up by the next call to SupportedTargets, which
+  // happens right before GetChosenTarget::Update. Calling Update directly
+  // here would make it appear that HWY_DYNAMIC_DISPATCH was called, which
+  // tests want to check. Hence we only de-initialize, so that the next
+  // HWY_DYNAMIC_DISPATCH calls GetChosenTarget::Update via FunctionCache.
+  GetChosenTarget().DeInit();
+}
+
+// Makes SupportedTargets use `targets` in place of the detected CPU targets.
+// Passing zero deactivates the mock so that detection is used again.
+HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) {
+  supported_targets_for_test_ = targets;
+
+  // De-initialize rather than Update so that the change takes effect on the
+  // next HWY_DYNAMIC_DISPATCH, without appearing as if one already happened.
+  GetChosenTarget().DeInit();
+}
+
+// Returns the bitfield of targets that are supported (either detected, or
+// mocked via SetSupportedTargetsForTest) and not disabled by DisableTargets.
+// If no target remains, returns HWY_STATIC_TARGET instead.
+HWY_DLLEXPORT int64_t SupportedTargets() {
+  int64_t result = supported_targets_for_test_;
+
+  // Zero means the mock is not active. Re-detect instead of caching just in
+  // case we're on a heterogeneous ISA (also requires some app support to pin
+  // threads). This is only reached on the first HWY_DYNAMIC_DISPATCH or after
+  // each call to DisableTargets or SetSupportedTargetsForTest.
+  if (HWY_LIKELY(result == 0)) {
+    result = DetectTargets();
+
+    // VectorBytes invokes HWY_DYNAMIC_DISPATCH, so ChosenTarget must be set
+    // up beforehand to prevent infinite recursion. There is no need to
+    // Update() again with the final targets afterwards - a caller of this
+    // function takes care of that.
+    GetChosenTarget().Update(result);
+
+    // With VectorBytes now callable, handle targets tied to a vector size.
+    if (HWY_ARCH_ARM_A64) {
+      const size_t bytes_per_vector = VectorBytes();  // uncached
+
+      // HWY_SVE_256 is set only if SVE is available with 32-byte vectors.
+      const bool is_sve_256 = (result & HWY_SVE) && bytes_per_vector == 32;
+      result = is_sve_256 ? static_cast<int64_t>(result | HWY_SVE_256)
+                          : static_cast<int64_t>(result & ~HWY_SVE_256);
+
+      // HWY_SVE2_128 is set only if SVE2 is available with 16-byte vectors.
+      const bool is_sve2_128 = (result & HWY_SVE2) && bytes_per_vector == 16;
+      result = is_sve2_128 ? static_cast<int64_t>(result | HWY_SVE2_128)
+                           : static_cast<int64_t>(result & ~HWY_SVE2_128);
+    }  // HWY_ARCH_ARM_A64
+  }
+
+  // Remove whatever DisableTargets has excluded.
+  result &= supported_mask_;
+  if (result == 0) {
+    return HWY_STATIC_TARGET;
+  }
+  return result;
+}
+
+// Returns a reference to the single ChosenTarget object, which is a
+// function-local static and thus constructed on the first call.
+HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
+  static ChosenTarget instance;
+  return instance;
+}
+
+} // namespace hwy