// Copyright 2020 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Per-target definitions shared by ops/*.h and user code. // IWYU pragma: begin_exports // Export does not seem to be recursive, so re-export these (also in base.h) #include #include "hwy/base.h" // "IWYU pragma: keep" does not work for this include, so hide it from the IDE. #if !HWY_IDE #include #endif #include "hwy/detect_compiler_arch.h" // Separate header because foreach_target.h re-enables its include guard. #include "hwy/ops/set_macros-inl.h" // IWYU pragma: end_exports #if HWY_IS_MSAN #include #endif // We are covered by the highway.h include guard, but generic_ops-inl.h // includes this again #if HWY_IDE. #if defined(HIGHWAY_HWY_OPS_SHARED_TOGGLE) == \ defined(HWY_TARGET_TOGGLE) #ifdef HIGHWAY_HWY_OPS_SHARED_TOGGLE #undef HIGHWAY_HWY_OPS_SHARED_TOGGLE #else #define HIGHWAY_HWY_OPS_SHARED_TOGGLE #endif HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { // NOTE: GCC generates incorrect code for vector arguments to non-inlined // functions in two situations: // - on Windows and GCC 10.3, passing by value crashes due to unaligned loads: // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412. // - on aarch64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not // all) tests to fail. // // We therefore pass by const& only on GCC and (Windows or aarch64). This alias // must be used for all vector/mask parameters of functions marked HWY_NOINLINE, // and possibly also other functions that are not inlined. #if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64) template using VecArg = const V&; #else template using VecArg = V; #endif namespace detail { // Returns N * 2^pow2. N is the number of lanes in a full vector and pow2 the // desired fraction or multiple of it, see Simd<>. `pow2` is most often in // [-3, 3] but can also be lower for user-specified fractions. constexpr size_t ScaleByPower(size_t N, int pow2) { return pow2 >= 0 ? (N << pow2) : (N >> (-pow2)); } template HWY_INLINE void MaybeUnpoison(T* HWY_RESTRICT unaligned, size_t count) { // Workaround for MSAN not marking compressstore as initialized (b/233326619) #if HWY_IS_MSAN __msan_unpoison(unaligned, count * sizeof(T)); #else (void)unaligned; (void)count; #endif } } // namespace detail // Highway operations are implemented as overloaded functions selected using a // zero-sized tag type D := Simd. T denotes the lane type. // // N defines how many lanes are in a 'full' vector, typically equal to // HWY_LANES(T) (which is the actual count on targets with vectors of known // size, and an upper bound in case of scalable vectors), otherwise a // user-specified limit at most that large. // // 2^kPow2 is a _subsequently_ applied scaling factor that indicates the // desired fraction of a 'full' vector: 0 means full, -1 means half; 1,2,3 // means two/four/eight full vectors ganged together. The largest supported // kPow2 is `HWY_MAX_POW2` and the aliases below take care of clamping // user-specified values to that. Note that `Simd` and `Simd` // have the same `MaxLanes` and `Lanes`. // // We can theoretically keep halving Lanes(), but recursive instantiations of // kPow2 - 1 will eventually fail e.g. because -64 is not a valid shift count. // Users must terminate such compile-time recursions at or above HWY_MIN_POW2. // // WARNING: do not use N directly because it may be a special representation of // a fractional MaxLanes. This arises when we Rebind Simd to // Simd. RVV requires that the last argument (kPow2) be two, // but we want MaxLanes to be the same in both cases. Hence ?? is a // fixed-point encoding of 1/4. // // Instead of referring to Simd<> directly, users create D via aliases: // - ScalableTag for a full vector; // - ScalableTag() for a fraction/group, where `kPow2` is // interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`; // - CappedTag for a vector with up to kLimit lanes; or // - FixedTag for a vector with exactly kNumLanes lanes. // // Instead of N, use Lanes(D()) for the actual number of lanes at runtime and // D().MaxLanes() for a constexpr upper bound. Both are powers of two. template struct Simd { constexpr Simd() = default; using T = Lane; private: static_assert(sizeof(Lane) <= 8, "Lanes are up to 64-bit"); // 20 bits are sufficient for any HWY_MAX_BYTES. This is the 'normal' value of // N when kFrac == 0, otherwise it is one (see FracN). static constexpr size_t kWhole = N & 0xFFFFF; // Fractional part is in the bits above kWhole. static constexpr int kFrac = static_cast(N >> 20); // Can be 8x larger because kPow2 may be as low as -3 (Rebind of a larger // type to u8 results in fractions). static_assert(kWhole <= 8 * HWY_MAX_N && kFrac <= 3, "Out of range"); static_assert(kFrac == 0 || kWhole == 1, "If frac, whole must be 1"); static_assert((kWhole & (kWhole - 1)) == 0 && kWhole != 0, "Not 2^x"); // Important to check this here because kPow2 <= -64 causes confusing // compile errors (invalid shift count). static_assert(kPow2 >= HWY_MIN_POW2, "Forgot kPow2 recursion terminator?"); // However, do NOT verify kPow2 <= HWY_MAX_POW2 - users should be able to // Rebind> in order to discover that its // kPow2 is out of bounds. public: // Upper bound on the number of lanes (tight if !HWY_HAVE_SCALABLE). In the // common case, N == kWhole, but if kFrac is nonzero, we deduct it from kPow2. // E.g. Rebind> is Simd. // The resulting number of lanes is still 1 because this N represents 1/4 // (the ratio of the sizes). Note that RVV requires kPow2 to be the ratio of // the sizes so that the correct LMUL overloads are chosen, even if N is // small enough that it would fit in an LMUL=1 vector. // // Cannot be an enum because GCC warns when using enums and non-enums in the // same expression. Cannot be a static constexpr function (MSVC limitation). // Rounded up to one so this is a valid array length. // // Do not use this directly - only 'public' so it is visible from the accessor // macro required by MSVC. static constexpr size_t kPrivateLanes = HWY_MAX(size_t{1}, detail::ScaleByPower(kWhole, kPow2 - kFrac)); constexpr size_t MaxLanes() const { return kPrivateLanes; } constexpr size_t MaxBytes() const { return kPrivateLanes * sizeof(Lane); } // For SFINAE on RVV. constexpr int Pow2() const { return kPow2; } // ------------------------------ Changing lane type or count // Do not use any of these directly. Anything used from member typedefs cannot // be made private, but functions only used within other functions can. // Returns number of NewT lanes that fit within MaxBytes(). template static constexpr size_t RepartitionLanes() { // Round up to correctly handle larger NewT. return (kPrivateLanes * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT); } // Returns the new kPow2 required for lanes of type NewT. template static constexpr int RebindPow2() { return kPow2 + ((sizeof(NewT) >= sizeof(T)) ? static_cast(CeilLog2(sizeof(NewT) / sizeof(T))) : -static_cast(CeilLog2(sizeof(T) / sizeof(NewT)))); } private: // Returns 0 or whole NewN such that kNewMaxLanes = NewN * 2^kNewPow2. template static constexpr size_t WholeN() { return detail::ScaleByPower(kNewMaxLanes, -kNewPow2); } // Returns fractional NewN such that kNewMaxLanes = NewN * 2^kNewPow2. template static constexpr size_t FracN() { // Only reached if kNewPow2 > CeilLog2(kNewMaxLanes) >= 0 (else WholeN // would not have been zero), but clamp to zero to avoid warnings. kFrac is // the difference, stored in the upper bits of N, and we also set kWhole = // 1 so that the new kPrivateLanes = kNewMaxLanes. static_assert(HWY_MAX_N <= (size_t{1} << 20), "Change bit shift"); return static_cast( 1 + (HWY_MAX(0, kNewPow2 - static_cast(CeilLog2(kNewMaxLanes))) << 20)); } public: // Returns (whole or fractional) NewN, see above. template static constexpr size_t NewN() { // We require a fraction if inverting kNewPow2 results in 0. return WholeN() == 0 ? FracN() : WholeN(); } // PromoteTo/DemoteTo() with another lane type, but same number of lanes. template using Rebind = Simd(), kPrivateLanes>(), RebindPow2()>; // Change lane type while keeping the same vector size, e.g. for MulEven. template using Repartition = Simd()>(), kPow2>; // Half the lanes while keeping the same lane type, e.g. for LowerHalf. using Half = Simd; // Twice the lanes while keeping the same lane type, e.g. for Combine. using Twice = Simd; }; namespace detail { template constexpr bool IsFull(Simd /* d */) { return N == HWY_LANES(T) && kPow2 == 0; } // Struct wrappers enable validation of arguments via static_assert. template struct ClampNAndPow2 { using type = Simd; }; template struct ScalableTagChecker { using type = typename ClampNAndPow2::type; }; template struct CappedTagChecker { static_assert(kLimit != 0, "Does not make sense to have zero lanes"); // Safely handle non-power-of-two inputs by rounding down, which is allowed by // CappedTag. Otherwise, Simd would static_assert. static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit); static constexpr size_t N = HWY_MIN(kLimitPow2, HWY_LANES(T)); using type = typename ClampNAndPow2::type; }; template struct FixedTagChecker { static_assert(kNumLanes != 0, "Does not make sense to have zero lanes"); static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes"); using type = Simd; }; } // namespace detail // ------------------------------ Aliases for Simd<> // Tag describing a full vector (kPow2 == 0: the most common usage, e.g. 1D // loops where the application does not care about the vector size) or a // fraction/multiple of one. Fractions (kPow2 < 0) are useful for arguments or // return values of type promotion and demotion. User-specified kPow2 is // interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`. template using ScalableTag = typename detail::ScalableTagChecker::type; // Tag describing a vector with *up to* kLimit active lanes, even on targets // with scalable vectors and HWY_SCALAR. The runtime lane count `Lanes(tag)` may // be less than kLimit, and is 1 on HWY_SCALAR. This alias is typically used for // 1D loops with a relatively low application-defined upper bound, e.g. for 8x8 // DCTs. However, it is better if data structures are designed to be // vector-length-agnostic (e.g. a hybrid SoA where there are chunks of `M >= // MaxLanes(d)` DC components followed by M AC1, .., and M AC63; this would // enable vector-length-agnostic loops using ScalableTag). User-specified kPow2 // is interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`. template using CappedTag = typename detail::CappedTagChecker::type; #if !HWY_HAVE_SCALABLE // If the vector size is known, and the app knows it does not want more than // kLimit lanes, then capping can be beneficial. For example, AVX-512 has lower // IPC and potentially higher costs for unaligned load/store vs. 256-bit AVX2. template using CappedTagIfFixed = CappedTag; #else // HWY_HAVE_SCALABLE // .. whereas on RVV/SVE, the cost of clamping Lanes() may exceed the benefit. template using CappedTagIfFixed = ScalableTag; #endif // Alias for a tag describing a vector with *exactly* kNumLanes active lanes, // even on targets with scalable vectors. Requires `kNumLanes` to be a power of // two not exceeding `HWY_LANES(T)`. // // NOTE: if the application does not need to support HWY_SCALAR (+), use this // instead of CappedTag to emphasize that there will be exactly kNumLanes lanes. // This is useful for data structures that rely on exactly 128-bit SIMD, but // these are discouraged because they cannot benefit from wider vectors. // Instead, applications would ideally define a larger problem size and loop // over it with the (unknown size) vectors from ScalableTag. // // + e.g. if the baseline is known to support SIMD, or the application requires // ops such as TableLookupBytes not supported by HWY_SCALAR. template using FixedTag = typename detail::FixedTagChecker::type; // Convenience form for fixed sizes. template using Full16 = Simd; template using Full32 = Simd; template using Full64 = Simd; template using Full128 = Simd; // ------------------------------ Accessors for Simd<> // Lane type. template using TFromD = typename D::T; // Upper bound on the number of lanes, typically used for SFINAE conditions and // to allocate storage for targets with known vector sizes. Note: this may be a // loose bound, instead use Lanes() as the actual size for AllocateAligned. // MSVC workaround: use static constant directly instead of a function. #define HWY_MAX_LANES_D(D) D::kPrivateLanes // Non-macro form of HWY_MAX_LANES_D in case that is preferable. WARNING: the // macro form may be required for MSVC, which has limitations on deducing // arguments. template HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) { return HWY_MAX_LANES_D(D); } #if !HWY_HAVE_SCALABLE // If non-scalable, this is constexpr; otherwise the target's header defines a // non-constexpr version of this function. This is the actual vector length, // used when advancing loop counters. template HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t Lanes(D) { return HWY_MAX_LANES_D(D); } #endif // !HWY_HAVE_SCALABLE // Tag for the same number of lanes as D, but with the LaneType T. template using Rebind = typename D::template Rebind; template using RebindToSigned = Rebind>, D>; template using RebindToUnsigned = Rebind>, D>; template using RebindToFloat = Rebind>, D>; // Tag for the same total size as D, but with the LaneType T. template using Repartition = typename D::template Repartition; template using RepartitionToWide = Repartition>, D>; template using RepartitionToNarrow = Repartition>, D>; // Tag for the same lane type as D, but half the lanes. template using Half = typename D::Half; // Tag for the same lane type as D, but twice the lanes. template using Twice = typename D::Twice; // ------------------------------ Choosing overloads (SFINAE) // Same as base.h macros but with a Simd argument instead of T. #define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD) #define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD) #define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD) #define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD) #define HWY_IF_SPECIAL_FLOAT_D(D) HWY_IF_SPECIAL_FLOAT(TFromD) #define HWY_IF_NOT_SPECIAL_FLOAT_D(D) HWY_IF_NOT_SPECIAL_FLOAT(TFromD) #define HWY_IF_T_SIZE_D(D, bytes) HWY_IF_T_SIZE(TFromD, bytes) #define HWY_IF_NOT_T_SIZE_D(D, bytes) HWY_IF_NOT_T_SIZE(TFromD, bytes) #define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array) \ HWY_IF_T_SIZE_ONE_OF(TFromD, bit_array) #define HWY_IF_LANES_D(D, lanes) HWY_IF_LANES(HWY_MAX_LANES_D(D), lanes) #define HWY_IF_LANES_LE_D(D, lanes) HWY_IF_LANES_LE(HWY_MAX_LANES_D(D), lanes) #define HWY_IF_LANES_GT_D(D, lanes) HWY_IF_LANES_GT(HWY_MAX_LANES_D(D), lanes) #define HWY_IF_LANES_PER_BLOCK_D(D, lanes) \ HWY_IF_LANES_PER_BLOCK( \ TFromD, HWY_MIN(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD)), lanes) #define HWY_IF_POW2_LE_D(D, pow2) hwy::EnableIf* = nullptr #define HWY_IF_POW2_GT_D(D, pow2) hwy::EnableIf<(D().Pow2() > pow2)>* = nullptr #define HWY_IF_U8_D(D) hwy::EnableIf, uint8_t>()>* = nullptr #define HWY_IF_U16_D(D) hwy::EnableIf, uint16_t>()>* = nullptr #define HWY_IF_U32_D(D) hwy::EnableIf, uint32_t>()>* = nullptr #define HWY_IF_U64_D(D) hwy::EnableIf, uint64_t>()>* = nullptr #define HWY_IF_I8_D(D) hwy::EnableIf, int8_t>()>* = nullptr #define HWY_IF_I16_D(D) hwy::EnableIf, int16_t>()>* = nullptr #define HWY_IF_I32_D(D) hwy::EnableIf, int32_t>()>* = nullptr #define HWY_IF_I64_D(D) hwy::EnableIf, int64_t>()>* = nullptr // Use instead of HWY_IF_T_SIZE_D to avoid ambiguity with float/double // overloads. #define HWY_IF_UI32_D(D) \ hwy::EnableIf, uint32_t>() || \ IsSame, int32_t>()>* = nullptr #define HWY_IF_UI64_D(D) \ hwy::EnableIf, uint64_t>() || \ IsSame, int64_t>()>* = nullptr #define HWY_IF_BF16_D(D) \ hwy::EnableIf, bfloat16_t>()>* = nullptr #define HWY_IF_F16_D(D) hwy::EnableIf, float16_t>()>* = nullptr #define HWY_IF_F32_D(D) hwy::EnableIf, float>()>* = nullptr #define HWY_IF_F64_D(D) hwy::EnableIf, double>()>* = nullptr #define HWY_IF_V_SIZE_D(D, bytes) \ HWY_IF_V_SIZE(TFromD, HWY_MAX_LANES_D(D), bytes) #define HWY_IF_V_SIZE_LE_D(D, bytes) \ HWY_IF_V_SIZE_LE(TFromD, HWY_MAX_LANES_D(D), bytes) #define HWY_IF_V_SIZE_GT_D(D, bytes) \ HWY_IF_V_SIZE_GT(TFromD, HWY_MAX_LANES_D(D), bytes) // Same, but with a vector argument. ops/*-inl.h define their own TFromV. #define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV) #define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV) #define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV) #define HWY_IF_NOT_FLOAT_V(V) HWY_IF_NOT_FLOAT(TFromV) #define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V) \ HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromV) #define HWY_IF_T_SIZE_V(V, bytes) HWY_IF_T_SIZE(TFromV, bytes) #define HWY_IF_NOT_T_SIZE_V(V, bytes) HWY_IF_NOT_T_SIZE(TFromV, bytes) #define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array) \ HWY_IF_T_SIZE_ONE_OF(TFromV, bit_array) #define HWY_MAX_LANES_V(V) HWY_MAX_LANES_D(DFromV) #define HWY_IF_V_SIZE_V(V, bytes) \ HWY_IF_V_SIZE(TFromV, HWY_MAX_LANES_V(V), bytes) #define HWY_IF_V_SIZE_LE_V(V, bytes) \ HWY_IF_V_SIZE_LE(TFromV, HWY_MAX_LANES_V(V), bytes) #define HWY_IF_V_SIZE_GT_V(V, bytes) \ HWY_IF_V_SIZE_GT(TFromV, HWY_MAX_LANES_V(V), bytes) // Old names (deprecated) #define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_T_SIZE_D(D, bytes) #define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_T_SIZE_D(D, bytes) // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy HWY_AFTER_NAMESPACE(); #endif // HIGHWAY_HWY_OPS_SHARED_TOGGLE