// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_BASE_H_
#define HIGHWAY_HWY_BASE_H_

// For SIMD module implementations and their callers, target-independent.

#include <stddef.h>
#include <stdint.h>

#include "hwy/detect_compiler_arch.h"
#include "hwy/highway_export.h"

#if HWY_COMPILER_MSVC
#include <string.h>  // memcpy
#endif
#if HWY_ARCH_X86
#include <atomic>
#endif

//------------------------------------------------------------------------------
// Compiler-specific definitions

#define HWY_STR_IMPL(macro) #macro
#define HWY_STR(macro) HWY_STR_IMPL(macro)

#if HWY_COMPILER_MSVC

#include <intrin.h>

#define HWY_RESTRICT __restrict
#define HWY_INLINE __forceinline
#define HWY_NOINLINE __declspec(noinline)
#define HWY_FLATTEN
#define HWY_NORETURN __declspec(noreturn)
#define HWY_LIKELY(expr) (expr)
#define HWY_UNLIKELY(expr) (expr)
#define HWY_PRAGMA(tokens) __pragma(tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
#define HWY_MAYBE_UNUSED
#define HWY_HAS_ASSUME_ALIGNED 0
#if (_MSC_VER >= 1700)
#define HWY_MUST_USE_RESULT _Check_return_
#else
#define HWY_MUST_USE_RESULT
#endif

#else

#define HWY_RESTRICT __restrict__
// Forcing inlining without optimization enabled creates very inefficient code
// and can cause compiler timeouts.
#ifdef __OPTIMIZE__
#define HWY_INLINE inline __attribute__((always_inline))
#else
#define HWY_INLINE inline
#endif
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_FLATTEN __attribute__((flatten))
#define HWY_NORETURN __attribute__((noreturn))
#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
#define HWY_PRAGMA(tokens) _Pragma(#tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
// Encountered "attribute list cannot appear here" when using the C++17
// [[maybe_unused]], so only use the old-style attribute for now.
#define HWY_MAYBE_UNUSED __attribute__((unused))
#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))

#endif  // !HWY_COMPILER_MSVC

//------------------------------------------------------------------------------
// Builtin/attributes

// Enables error-checking of format strings.
#if HWY_HAS_ATTRIBUTE(__format__)
#define HWY_FORMAT(idx_fmt, idx_arg) \
  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
#else
#define HWY_FORMAT(idx_fmt, idx_arg)
#endif

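// Usage sketch (hypothetical declaration, for illustration only): the compiler
// checks the format string at argument index 2 against the variadic arguments
// starting at index 3, as the Abort declaration at the end of this header does:
//   void LogExample(int level, const char* format, ...) HWY_FORMAT(2, 3);
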
// Returns a void* pointer which the compiler then assumes is N-byte aligned.
// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
//
// The assignment semantics are required by GCC/Clang. ICC provides an in-place
// __assume_aligned, whereas MSVC's __assume appears unsuitable.
#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
#else
#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
#endif

// Clang and GCC require attributes on each function into which SIMD intrinsics
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
// automatic annotation via pragmas.
#if HWY_COMPILER_CLANG
#define HWY_PUSH_ATTRIBUTES(targets_str)                                \
  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
                                  apply_to = function))
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
#elif HWY_COMPILER_GCC
#define HWY_PUSH_ATTRIBUTES(targets_str) \
  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
#else
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#endif

//------------------------------------------------------------------------------
// Macros

#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED

#define HWY_CONCAT_IMPL(a, b) a##b
#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)

#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))

#if HWY_COMPILER_GCC_ACTUAL
// nielskm: GCC does not support '#pragma GCC unroll' without the factor.
#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#else
#define HWY_UNROLL(factor)
#define HWY_DEFAULT_UNROLL
#endif

// Tell a compiler that the expression always evaluates to true.
// The expression should be free of any side effects.
// Some older compilers may have trouble with complex expressions, so it is
// advisable to split multiple conditions into separate assume statements and
// to manually check the generated code.
// OK but could fail:
//   HWY_ASSUME(x == 2 && y == 3);
// Better:
//   HWY_ASSUME(x == 2);
//   HWY_ASSUME(y == 3);
#if defined(__has_cpp_attribute) && __has_cpp_attribute(assume)
#define HWY_ASSUME(expr) [[assume(expr)]]
#elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC
#define HWY_ASSUME(expr) __assume(expr)
// __builtin_assume() was added in clang 3.6.
#elif HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_assume)
#define HWY_ASSUME(expr) __builtin_assume(expr)
// __builtin_unreachable() was added in GCC 4.5, but __has_builtin() was added
// later, so check the compiler version directly.
#elif HWY_COMPILER_GCC_ACTUAL >= 405
#define HWY_ASSUME(expr) \
  ((expr) ? static_cast<void>(0) : __builtin_unreachable())
#else
#define HWY_ASSUME(expr) static_cast<void>(0)
#endif

// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
// typical asm volatile("" : : : "memory") has no effect, whereas the atomic
// fence does, without generating code.
#if HWY_ARCH_X86
#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
#else
// TODO(janwas): investigate alternatives. On ARM, the above generates barriers.
#define HWY_FENCE
#endif

// 4 instances of a given literal value, useful as input to LoadDup128.
#define HWY_REP4(literal) literal, literal, literal, literal

#define HWY_ABORT(format, ...) \
  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)

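// Usage sketch (illustrative; `lanes` and `count` are hypothetical variables):
// HWY_ABORT prints the printf-style message with file/line context, then
// terminates, e.g.:
//   if (lanes == nullptr) HWY_ABORT("Allocation of %d lanes failed", count);
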
// Always enabled.
#define HWY_ASSERT(condition)             \
  do {                                    \
    if (!(condition)) {                   \
      HWY_ABORT("Assert %s", #condition); \
    }                                     \
  } while (0)

#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
#define HWY_IS_MSAN 1
#else
#define HWY_IS_MSAN 0
#endif

#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
#define HWY_IS_ASAN 1
#else
#define HWY_IS_ASAN 0
#endif

#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
#define HWY_IS_TSAN 1
#else
#define HWY_IS_TSAN 0
#endif

// MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
// You can disable MSAN by adding this attribute to the function that fails.
#if HWY_IS_MSAN
#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
#else
#define HWY_ATTR_NO_MSAN
#endif

// For enabling HWY_DASSERT and shortening tests in slower debug builds
#if !defined(HWY_IS_DEBUG_BUILD)
// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
// MSVC defines NDEBUG (if not, could instead check _DEBUG).
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
    HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
#define HWY_IS_DEBUG_BUILD 1
#else
#define HWY_IS_DEBUG_BUILD 0
#endif
#endif  // HWY_IS_DEBUG_BUILD

#if HWY_IS_DEBUG_BUILD
#define HWY_DASSERT(condition) HWY_ASSERT(condition)
#else
#define HWY_DASSERT(condition) \
  do {                         \
  } while (0)
#endif

namespace hwy {

//------------------------------------------------------------------------------
// kMaxVectorSize (undocumented, pending removal)

#if HWY_ARCH_X86
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64;  // AVX-512
#elif HWY_ARCH_RVV && defined(__riscv_vector)
// Not actually an upper bound on the size.
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
#else
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
#endif

//------------------------------------------------------------------------------
// Alignment

// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
// should be allocated dynamically via aligned_allocator.h because Lanes() may
// exceed the stack size.
#if HWY_ARCH_X86
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV && defined(__riscv_vector)
#define HWY_ALIGN_MAX alignas(8)  // only elements need be aligned
#else
#define HWY_ALIGN_MAX alignas(16)
#endif

//------------------------------------------------------------------------------
// Lane types

// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
// by concatenating base type and bits.

#pragma pack(push, 1)

// ACLE (https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html):
// always supported on aarch64, for v7 only if -mfp16-format is given.
#if ((HWY_ARCH_ARM_A64 || (__ARM_FP & 2)) && HWY_COMPILER_GCC)
using float16_t = __fp16;
// C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
// Required for Clang RVV if the float16 extension is used.
#elif HWY_ARCH_RVV && HWY_COMPILER_CLANG && defined(__riscv_zvfh)
using float16_t = _Float16;
// Otherwise emulate
#else
struct float16_t {
  uint16_t bits;
};
#endif

struct bfloat16_t {
  uint16_t bits;
};

#pragma pack(pop)

using float32_t = float;
using float64_t = double;

#pragma pack(push, 1)

// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align
// it: https://reviews.llvm.org/D86310
struct alignas(16) uint128_t {
  uint64_t lo;  // little-endian layout
  uint64_t hi;
};

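// Illustrative compile-time checks (additions, not in the original header):
// pack(1) guarantees the emulated 16-bit types are exactly as wide as their
// payload, and uint128_t retains its alignas(16) despite the packing.
static_assert(sizeof(float16_t) == 2 && sizeof(bfloat16_t) == 2,
              "no padding in 16-bit lane types");
static_assert(sizeof(uint128_t) == 16 && alignof(uint128_t) == 16,
              "aligned 128-bit type");
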
// 64-bit key plus 64-bit value. Faster than using uint128_t when only the key
// field is to be compared (Lt128Upper instead of Lt128).
struct alignas(16) K64V64 {
  uint64_t value;  // little-endian layout
  uint64_t key;
};

// 32-bit key plus 32-bit value. Allows vqsort recursions to terminate earlier
// than when considering both to be a 64-bit key.
struct alignas(8) K32V32 {
  uint32_t value;  // little-endian layout
  uint32_t key;
};

#pragma pack(pop)

static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
                                              const uint128_t& b) {
  return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
                                              const uint128_t& b) {
  return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
                                               const uint128_t& b) {
  return a.lo == b.lo && a.hi == b.hi;
}

static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
                                              const K64V64& b) {
  return a.key < b.key;
}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
                                              const K64V64& b) {
  return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
                                               const K64V64& b) {
  return a.key == b.key;
}

static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
                                              const K32V32& b) {
  return a.key < b.key;
}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
                                              const K32V32& b) {
  return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
                                               const K32V32& b) {
  return a.key == b.key;
}

//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)

template <bool Condition>
struct EnableIfT {};
template <>
struct EnableIfT<true> {
  using type = void;
};

template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;

template <typename T, typename U>
struct IsSameT {
  enum { value = 0 };
};

template <typename T>
struct IsSameT<T, T> {
  enum { value = 1 };
};

template <typename T, typename U>
HWY_API constexpr bool IsSame() {
  return IsSameT<T, U>::value;
}

// Insert into template/function arguments to enable this overload only for
// vectors of AT MOST this many bits.
//
// Note that enabling for exactly 128 bits is unnecessary because a function
// can simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for
// other sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T)>.
#define HWY_IF_LE128(T, N) hwy::EnableIf<N * sizeof(T) <= 16>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<N * sizeof(T) <= 8>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<N * sizeof(T) <= 4>* = nullptr
#define HWY_IF_GE32(T, N) hwy::EnableIf<N * sizeof(T) >= 4>* = nullptr
#define HWY_IF_GE64(T, N) hwy::EnableIf<N * sizeof(T) >= 8>* = nullptr
#define HWY_IF_GE128(T, N) hwy::EnableIf<N * sizeof(T) >= 16>* = nullptr
#define HWY_IF_GT128(T, N) hwy::EnableIf<(N * sizeof(T) > 16)>* = nullptr

#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T) \
  hwy::EnableIf<IsSigned<T>() && !IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr

#define HWY_IF_LANE_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
// bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
// too similar. If you want the opposite of this (2 or 4 bytes), ask for those
// bits explicitly (0x14) instead of attempting to 'negate' 0x102.
#define HWY_IF_LANE_SIZE_ONE_OF(T, bit_array) \
  hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr

#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
  hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr

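// Usage sketch (hypothetical overloads, for illustration only): a macro such
// as HWY_IF_FLOAT(T) removes an overload from consideration unless T is a
// floating-point type, e.g.:
//   template <typename T, HWY_IF_FLOAT(T)>
//   T HalveExample(T x) { return x * T(0.5); }
//   template <typename T, HWY_IF_NOT_FLOAT(T)>
//   T HalveExample(T x) { return static_cast<T>(x >> 1); }
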
// Empty struct used as a size tag type.
template <size_t N>
struct SizeTag {};

template <class T>
struct RemoveConstT {
  using type = T;
};
template <class T>
struct RemoveConstT<const T> {
  using type = T;
};

template <class T>
using RemoveConst = typename RemoveConstT<T>::type;

//------------------------------------------------------------------------------
// Type relations

namespace detail {

template <typename T>
struct Relations;
template <>
struct Relations<uint8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = uint16_t;
  enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = int16_t;
  enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Wide = uint32_t;
  using Narrow = uint8_t;
  enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Wide = int32_t;
  using Narrow = int8_t;
  enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = uint64_t;
  using Narrow = uint16_t;
  enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = int64_t;
  using Narrow = int16_t;
  enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Wide = uint128_t;
  using Narrow = uint32_t;
  enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = int32_t;
  enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint128_t> {
  using Unsigned = uint128_t;
  using Narrow = uint64_t;
  enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<float16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = float;
  enum { is_signed = 1, is_float = 1 };
};
template <>
struct Relations<bfloat16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Wide = float;
  enum { is_signed = 1, is_float = 1 };
};
template <>
struct Relations<float> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = double;
  using Narrow = float16_t;
  enum { is_signed = 1, is_float = 1 };
};
template <>
struct Relations<double> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = float;
  enum { is_signed = 1, is_float = 1 };
};

template <size_t N>
struct TypeFromSize;
template <>
struct TypeFromSize<1> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
};
template <>
struct TypeFromSize<2> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
};
template <>
struct TypeFromSize<4> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
};
template <>
struct TypeFromSize<8> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
};
template <>
struct TypeFromSize<16> {
  using Unsigned = uint128_t;
};

}  // namespace detail

// Aliases for types of a different category, but the same size.
template <typename T>
using MakeUnsigned = typename detail::Relations<T>::Unsigned;
template <typename T>
using MakeSigned = typename detail::Relations<T>::Signed;
template <typename T>
using MakeFloat = typename detail::Relations<T>::Float;

// Aliases for types of the same category, but different size.
template <typename T>
using MakeWide = typename detail::Relations<T>::Wide;
template <typename T>
using MakeNarrow = typename detail::Relations<T>::Narrow;

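// Illustrative compile-time checks of the aliases (additions, not in the
// original header):
static_assert(IsSame<MakeUnsigned<int32_t>, uint32_t>(), "same size");
static_assert(IsSame<MakeWide<uint8_t>, uint16_t>(), "twice the width");
static_assert(IsSame<MakeNarrow<int64_t>, int32_t>(), "half the width");
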
// Obtain type from its size [bytes].
template <size_t N>
using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned;
template <size_t N>
using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;

// Avoid confusion with SizeTag where the parameter is a lane size.
using UnsignedTag = SizeTag<0>;
using SignedTag = SizeTag<0x100>;  // integer
using FloatTag = SizeTag<0x200>;

template <typename T, class R = detail::Relations<T>>
constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed + R::is_float) << 8)> {
  return hwy::SizeTag<((R::is_signed + R::is_float) << 8)>();
}

// For when we only want to distinguish FloatTag from everything else.
using NonFloatTag = SizeTag<0x400>;

template <typename T, class R = detail::Relations<T>>
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
  return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
}

//------------------------------------------------------------------------------
// Type traits

template <typename T>
HWY_API constexpr bool IsFloat() {
  // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to
  // or from a float, not compared.
  return IsSame<T, float>() || IsSame<T, double>();
}

template <typename T>
HWY_API constexpr bool IsSigned() {
  return T(0) > T(-1);
}
template <>
constexpr bool IsSigned<float16_t>() {
  return true;
}
template <>
constexpr bool IsSigned<bfloat16_t>() {
  return true;
}

// Largest/smallest representable integer values.
template <typename T>
HWY_API constexpr T LimitsMax() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  using TU = MakeUnsigned<T>;
  return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
                                      : static_cast<TU>(~0ull));
}
template <typename T>
HWY_API constexpr T LimitsMin() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
}

// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<float>::min() (the smallest positive value).
template <typename T>
HWY_API constexpr T LowestValue() {
  return LimitsMin<T>();
}
template <>
constexpr float LowestValue<float>() {
  return -3.402823466e+38F;
}
template <>
constexpr double LowestValue<double>() {
  return -1.7976931348623158e+308;
}

template <typename T>
HWY_API constexpr T HighestValue() {
  return LimitsMax<T>();
}
template <>
constexpr float HighestValue<float>() {
  return 3.402823466e+38F;
}
template <>
constexpr double HighestValue<double>() {
  return 1.7976931348623158e+308;
}

// Difference between 1.0 and the next representable value.
template <typename T>
HWY_API constexpr T Epsilon() {
  return 1;
}
template <>
constexpr float Epsilon<float>() {
  return 1.192092896e-7f;
}
template <>
constexpr double Epsilon<double>() {
  return 2.2204460492503131e-16;
}

// Returns width in bits of the mantissa field in IEEE binary32/64.
template <typename T>
constexpr int MantissaBits() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr int MantissaBits<float>() {
  return 23;
}
template <>
constexpr int MantissaBits<double>() {
  return 52;
}

// Returns the (left-shifted by one bit) IEEE binary32/64 representation with
// the largest possible (biased) exponent field. Used by IsInf.
template <typename T>
constexpr MakeSigned<T> MaxExponentTimes2() {
  return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
}

// Returns bitmask of the sign bit in IEEE binary32/64.
template <typename T>
constexpr MakeUnsigned<T> SignMask() {
  return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
}

// Returns bitmask of the exponent field in IEEE binary32/64.
template <typename T>
constexpr MakeUnsigned<T> ExponentMask() {
  return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
}

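// Illustrative compile-time checks (additions, not in the original header):
// a binary32 float is partitioned into 1 sign, 8 exponent and 23 mantissa
// bits, and the integer limits match numeric_limits.
static_assert(LimitsMax<int8_t>() == 127 && LimitsMin<int8_t>() == -128,
              "int8_t range");
static_assert(SignMask<float>() == 0x80000000u, "top bit");
static_assert(ExponentMask<float>() == 0x7F800000u, "8 bits below the sign");
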
// Returns bitmask of the mantissa field in IEEE binary32/64.
template <typename T>
constexpr MakeUnsigned<T> MantissaMask() {
  return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
}

// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value is less than this can be represented exactly.
template <typename T>
constexpr T MantissaEnd() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr float MantissaEnd<float>() {
  return 8388608.0f;  // 1 << 23
}
template <>
constexpr double MantissaEnd<double>() {
  // floating point literal with p52 requires C++17.
  return 4503599627370496.0;  // 1 << 52
}

// Returns width in bits of the exponent field in IEEE binary32/64.
template <typename T>
constexpr int ExponentBits() {
  // Exponent := remaining bits after deducting sign and mantissa.
  return 8 * sizeof(T) - 1 - MantissaBits<T>();
}

// Returns largest value of the biased exponent field in IEEE binary32/64,
// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
// This is expressed as a signed integer for more efficient comparison.
template <typename T>
constexpr MakeSigned<T> MaxExponentField() {
  return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
}

//------------------------------------------------------------------------------
// Helper functions

template <typename T1, typename T2>
constexpr inline T1 DivCeil(T1 a, T2 b) {
  return (a + b - 1) / b;
}

// Works for any `align`; if it is a power of two, the compiler emits ADD+AND.
constexpr inline size_t RoundUpTo(size_t what, size_t align) {
  return DivCeil(what, align) * align;
}

// Undefined results for x == 0.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
#if HWY_COMPILER_MSVC
  unsigned long index;  // NOLINT
  _BitScanForward(&index, x);
  return index;
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctz(x));
#endif  // HWY_COMPILER_MSVC
}

HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
  unsigned long index;  // NOLINT
  _BitScanForward64(&index, x);
  return index;
#else   // HWY_ARCH_X86_64
  // _BitScanForward64 not available
  uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
  unsigned long index;  // NOLINT
  if (lsb == 0) {
    uint32_t msb = static_cast<uint32_t>(x >> 32u);
    _BitScanForward(&index, msb);
    return 32 + index;
  } else {
    _BitScanForward(&index, lsb);
    return index;
  }
#endif  // HWY_ARCH_X86_64
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctzll(x));
#endif  // HWY_COMPILER_MSVC
}

// Undefined results for x == 0.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
#if HWY_COMPILER_MSVC
  unsigned long index;  // NOLINT
  _BitScanReverse(&index, x);
  return 31 - index;
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clz(x));
#endif  // HWY_COMPILER_MSVC
}

HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
  unsigned long index;  // NOLINT
  _BitScanReverse64(&index, x);
  return 63 - index;
#else   // HWY_ARCH_X86_64
  // _BitScanReverse64 not available
  const uint32_t msb = static_cast<uint32_t>(x >> 32u);
  unsigned long index;  // NOLINT
  if (msb == 0) {
    const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
    _BitScanReverse(&index, lsb);
    return 63 - index;
  } else {
    _BitScanReverse(&index, msb);
    return 31 - index;
  }
#endif  // HWY_ARCH_X86_64
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clzll(x));
#endif  // HWY_COMPILER_MSVC
}

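// Worked example (illustrative): 8 = 0b1000, so
// Num0BitsBelowLS1Bit_Nonzero32(8) returns 3 (trailing zeros) and
// Num0BitsAboveMS1Bit_Nonzero32(8) returns 28 (leading zeros out of 32 bits).
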
HWY_API size_t PopCount(uint64_t x) {
#if HWY_COMPILER_GCC  // includes clang
  return static_cast<size_t>(__builtin_popcountll(x));
  // This instruction has a separate feature flag, but is often called from
  // non-SIMD code, so we don't want to require dynamic dispatch. It was first
  // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
  // for AVX, so check for that.
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
  return _mm_popcnt_u64(x);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
  return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
         _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
#else
  // Bit-parallel fallback: sum adjacent pairs, then nibbles, then bytes.
  x -= ((x >> 1) & 0x5555555555555555ULL);
  x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
  x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
  x += (x >> 8);
  x += (x >> 16);
  x += (x >> 32);
  return static_cast<size_t>(x & 0x7Fu);
#endif
}

// Skip HWY_API due to GCC "function not considered for inlining". Previously
// such errors were caused by underlying type mismatches, but it's not clear
// what is still mismatched despite all the casts.
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
}

template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
}

template <typename T>
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag /*tag*/, T t, size_t n) {
  return t + static_cast<T>(n);
}

template <typename T>
HWY_INLINE constexpr T AddWithWraparound(hwy::NonFloatTag /*tag*/, T t,
                                         size_t n) {
  using TU = MakeUnsigned<T>;
  return static_cast<T>(
      static_cast<TU>(static_cast<TU>(t) + static_cast<TU>(n)) &
      hwy::LimitsMax<TU>());
}

#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
#pragma intrinsic(_umul128)
#endif

// 64 x 64 = 128 bit multiplication
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
#if defined(__SIZEOF_INT128__)
  __uint128_t product = (__uint128_t)a * (__uint128_t)b;
  *upper = (uint64_t)(product >> 64);
  return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
  return _umul128(a, b, upper);
#else
  // Portable fallback: schoolbook multiplication of 32-bit halves.
  constexpr uint64_t kLo32 = 0xFFFFFFFFU;
  const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
  const uint64_t hi_lo = (a >> 32) * (b & kLo32);
  const uint64_t lo_hi = (a & kLo32) * (b >> 32);
  const uint64_t hi_hi = (a >> 32) * (b >> 32);
  const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
  *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
  return (t << 32) | (lo_lo & kLo32);
#endif
}

#if HWY_COMPILER_MSVC
#pragma intrinsic(memcpy)
#pragma intrinsic(memset)
#endif

// The source/destination must not overlap/alias.
template <size_t kBytes, typename From, typename To>
HWY_API void CopyBytes(const From* from, To* to) {
#if HWY_COMPILER_MSVC
  memcpy(to, from, kBytes);
#else
  __builtin_memcpy(static_cast<void*>(to), static_cast<const void*>(from),
                   kBytes);
#endif
}

// Same as CopyBytes, but for same-sized objects; avoids a size argument.
template <typename From, typename To>
HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
  static_assert(sizeof(From) == sizeof(To), "");
  CopyBytes<sizeof(From)>(from, to);
}

template <size_t kBytes, typename To>
HWY_API void ZeroBytes(To* to) {
#if HWY_COMPILER_MSVC
  memset(to, 0, kBytes);
#else
  __builtin_memset(to, 0, kBytes);
#endif
}

HWY_API float F32FromBF16(bfloat16_t bf) {
  // bfloat16 is the upper half of a binary32: shift into place, reinterpret.
  uint32_t bits = bf.bits;
  bits <<= 16;
  float f;
  CopySameSize(&bits, &f);
  return f;
}

HWY_API bfloat16_t BF16FromF32(float f) {
  // Truncating conversion: keep the upper 16 bits of the binary32 pattern.
  uint32_t bits;
  CopySameSize(&f, &bits);
  bfloat16_t bf;
  bf.bits = static_cast<uint16_t>(bits >> 16);
  return bf;
}

HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...);

}  // namespace hwy

#endif  // HIGHWAY_HWY_BASE_H_