// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Single-element vectors and operations.
// External include guard in highway.h - see comment there.

#include <stddef.h>
#include <stdint.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Single instruction, single data.
template <typename T>
using Sisd = Simd<T, 1, 0>;

// (Wrapper class required for overloading comparison operators.)
template <typename T>
struct Vec1 {
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = 1;  // only for DFromV

  HWY_INLINE Vec1() = default;
  Vec1(const Vec1&) = default;
  Vec1& operator=(const Vec1&) = default;
  HWY_INLINE explicit Vec1(const T t) : raw(t) {}

  HWY_INLINE Vec1& operator*=(const Vec1 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec1& operator/=(const Vec1 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec1& operator+=(const Vec1 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec1& operator-=(const Vec1 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec1& operator&=(const Vec1 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec1& operator|=(const Vec1 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec1& operator^=(const Vec1 other) {
    return *this = (*this ^ other);
  }

  T raw;
};

// 0 or FF..FF, same size as Vec1.
template <typename T>
class Mask1 {
  using Raw = hwy::MakeUnsigned<T>;

 public:
  static HWY_INLINE Mask1<T> FromBool(bool b) {
    Mask1<T> mask;
    mask.bits = b ? static_cast<Raw>(~Raw{0}) : 0;
    return mask;
  }

  Raw bits;
};
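// Example (illustrative sketch only, not additional API): the zero-sized
// Sisd<T> tag selects overloads, while Vec1<T> wraps the single lane. Using
// the entry points declared below:
//   const Sisd<int32_t> d;
//   Vec1<int32_t> v = Set(d, 5);  // v.raw == 5
//   v += Set(d, 2);               // compound ops forward to operator+ etc.
//   // GetLane(v) == 7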
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;

// ------------------------------ BitCast

template <typename T, typename FromT>
HWY_API Vec1<T> BitCast(Sisd<T> /* tag */, Vec1<FromT> v) {
  static_assert(sizeof(T) <= sizeof(FromT), "Promoting is undefined");
  T to;
  CopyBytes<sizeof(T)>(&v.raw, &to);  // not same size - ok to shrink
  return Vec1<T>(to);
}

// ------------------------------ Set

template <typename T>
HWY_API Vec1<T> Zero(Sisd<T> /* tag */) {
  return Vec1<T>(T(0));
}

template <typename T, typename T2>
HWY_API Vec1<T> Set(Sisd<T> /* tag */, const T2 t) {
  return Vec1<T>(static_cast<T>(t));
}

template <typename T>
HWY_API Vec1<T> Undefined(Sisd<T> d) {
  return Zero(d);
}

template <typename T, typename T2>
HWY_API Vec1<T> Iota(const Sisd<T> /* tag */, const T2 first) {
  return Vec1<T>(static_cast<T>(first));
}

template <class D>
using VFromD = decltype(Zero(D()));

// ================================================== LOGICAL

// ------------------------------ Not

template <typename T>
HWY_API Vec1<T> Not(const Vec1<T> v) {
  using TU = MakeUnsigned<T>;
  const Sisd<TU> du;
  return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, v).raw)));
}

// ------------------------------ And

template <typename T>
HWY_API Vec1<T> And(const Vec1<T> a, const Vec1<T> b) {
  using TU = MakeUnsigned<T>;
  const Sisd<TU> du;
  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw & BitCast(du, b).raw));
}
template <typename T>
HWY_API Vec1<T> operator&(const Vec1<T> a, const Vec1<T> b) {
  return And(a, b);
}

// ------------------------------ AndNot

template <typename T>
HWY_API Vec1<T> AndNot(const Vec1<T> a, const Vec1<T> b) {
  using TU = MakeUnsigned<T>;
  const Sisd<TU> du;
  return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, a).raw &
                                                     BitCast(du, b).raw)));
}

// ------------------------------ Or

template <typename T>
HWY_API Vec1<T> Or(const Vec1<T> a, const Vec1<T> b) {
  using TU = MakeUnsigned<T>;
  const Sisd<TU> du;
  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw | BitCast(du, b).raw));
}
template <typename T>
HWY_API Vec1<T> operator|(const Vec1<T> a, const Vec1<T> b) {
  return Or(a, b);
}

// ------------------------------ Xor

template <typename T>
HWY_API Vec1<T> Xor(const Vec1<T> a, const Vec1<T> b) {
  using TU = MakeUnsigned<T>;
  const Sisd<TU> du;
  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw ^ BitCast(du, b).raw));
}
template <typename T>
HWY_API Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) {
  return Xor(a, b);
}

// ------------------------------ Xor3

template <typename T>
HWY_API Vec1<T> Xor3(Vec1<T> x1, Vec1<T> x2, Vec1<T> x3) {
  return Xor(x1, Xor(x2, x3));
}

// ------------------------------ Or3

template <typename T>
HWY_API Vec1<T> Or3(Vec1<T> o1, Vec1<T> o2, Vec1<T> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd

template <typename T>
HWY_API Vec1<T> OrAnd(const Vec1<T> o, const Vec1<T> a1, const Vec1<T> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse

template <typename T>
HWY_API Vec1<T> IfVecThenElse(Vec1<T> mask, Vec1<T> yes, Vec1<T> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ CopySign

template <typename T>
HWY_API Vec1<T> CopySign(const Vec1<T> magn, const Vec1<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto msb = SignBit(Sisd<T>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

template <typename T>
HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return Or(abs, And(SignBit(Sisd<T>()), sign));
}

// ------------------------------ BroadcastSignBit

template <typename T>
HWY_API Vec1<T> BroadcastSignBit(const Vec1<T> v) {
  // This is used inside ShiftRight, so we cannot implement in terms of it.
  return v.raw < 0 ? Vec1<T>(T(-1)) : Vec1<T>(0);
}
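// Worked example (illustrative, assuming IEEE-754 single precision): CopySign
// keeps the magnitude bits of `magn` and only the sign bit of `sign`. With
// SignBit(Sisd<float>()) == 0x80000000:
//   CopySign(Vec1<float>(2.0f), Vec1<float>(-0.0f))
//   = AndNot(0x80000000, 0x40000000) | And(0x80000000, 0x80000000)
//   = 0xC0000000, i.e. -2.0f.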
// ------------------------------ PopulationCount

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

template <typename T>
HWY_API Vec1<T> PopulationCount(Vec1<T> v) {
  return Vec1<T>(static_cast<T>(PopCount(v.raw)));
}

// ------------------------------ Mask

template <typename TFrom, typename TTo>
HWY_API Mask1<TTo> RebindMask(Sisd<TTo> /*tag*/, Mask1<TFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return Mask1<TTo>{m.bits};
}

// v must be 0 or FF..FF.
template <typename T>
HWY_API Mask1<T> MaskFromVec(const Vec1<T> v) {
  Mask1<T> mask;
  CopySameSize(&v, &mask);
  return mask;
}

template <typename T>
Vec1<T> VecFromMask(const Mask1<T> mask) {
  Vec1<T> v;
  CopySameSize(&mask, &v);
  return v;
}

template <typename T>
Vec1<T> VecFromMask(Sisd<T> /* tag */, const Mask1<T> mask) {
  Vec1<T> v;
  CopySameSize(&mask, &v);
  return v;
}

template <typename T>
HWY_API Mask1<T> FirstN(Sisd<T> /*tag*/, size_t n) {
  return Mask1<T>::FromBool(n != 0);
}

// Returns mask ? yes : no.
template <typename T>
HWY_API Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes,
                           const Vec1<T> no) {
  return mask.bits ? yes : no;
}

template <typename T>
HWY_API Vec1<T> IfThenElseZero(const Mask1<T> mask, const Vec1<T> yes) {
  return mask.bits ? yes : Vec1<T>(0);
}

template <typename T>
HWY_API Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) {
  return mask.bits ? Vec1<T>(0) : no;
}

template <typename T>
HWY_API Vec1<T> IfNegativeThenElse(Vec1<T> v, Vec1<T> yes, Vec1<T> no) {
  return v.raw < 0 ? yes : no;
}

template <typename T>
HWY_API Vec1<T> ZeroIfNegative(const Vec1<T> v) {
  return v.raw < 0 ? Vec1<T>(0) : v;
}

// ------------------------------ Mask logical

template <typename T>
HWY_API Mask1<T> Not(const Mask1<T> m) {
  return MaskFromVec(Not(VecFromMask(Sisd<T>(), m)));
}

template <typename T>
HWY_API Mask1<T> And(const Mask1<T> a, Mask1<T> b) {
  const Sisd<T> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask1<T> AndNot(const Mask1<T> a, Mask1<T> b) {
  const Sisd<T> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask1<T> Or(const Mask1<T> a, Mask1<T> b) {
  const Sisd<T> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) {
  const Sisd<T> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask1<T> ExclusiveNeither(const Mask1<T> a, Mask1<T> b) {
  const Sisd<T> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}

// ================================================== SHIFTS

// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)

template <int kBits, typename T>
HWY_API Vec1<T> ShiftLeft(const Vec1<T> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  return Vec1<T>(
      static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits));
}
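// Illustrative sketch (not additional API): Mask1 stores all-zero or all-one
// bits so that MaskFromVec/VecFromMask reduce to plain bit copies:
//   const Sisd<uint8_t> d;
//   const Mask1<uint8_t> m = Mask1<uint8_t>::FromBool(true);  // bits == 0xFF
//   GetLane(IfThenElse(m, Set(d, 1), Set(d, 2)));  // == 1
//   GetLane(VecFromMask(d, m));                    // == 0xFF, round-trips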
template <int kBits, typename T>
HWY_API Vec1<T> ShiftRight(const Vec1<T> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  return Vec1<T>(static_cast<T>(v.raw >> kBits));
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    const Sisd<TU> du;
    const TU shifted = static_cast<TU>(BitCast(du, v).raw >> kBits);
    const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
    const size_t sign_shift =
        static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
    const TU upper = static_cast<TU>(sign << sign_shift);
    return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
  } else {  // T is unsigned
    return Vec1<T>(static_cast<T>(v.raw >> kBits));
  }
#endif
}

// ------------------------------ RotateRight (ShiftRight)

namespace detail {

// For partial specialization: kBits == 0 results in an invalid shift count
template <int kBits>
struct RotateRight {
  template <typename T>
  HWY_INLINE Vec1<T> operator()(const Vec1<T> v) const {
    return Or(ShiftRight<kBits>(v), ShiftLeft<sizeof(T) * 8 - kBits>(v));
  }
};

template <>
struct RotateRight<0> {
  template <typename T>
  HWY_INLINE Vec1<T> operator()(const Vec1<T> v) const {
    return v;
  }
};

}  // namespace detail

template <int kBits, typename T>
HWY_API Vec1<T> RotateRight(const Vec1<T> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  return detail::RotateRight<kBits>()(v);
}

// ------------------------------ ShiftLeftSame (BroadcastSignBit)

template <typename T>
HWY_API Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) {
  return Vec1<T>(
      static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits));
}

template <typename T>
HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  return Vec1<T>(static_cast<T>(v.raw >> bits));
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    const Sisd<TU> du;
    const TU shifted = static_cast<TU>(BitCast(du, v).raw >> bits);
    const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
    const size_t sign_shift =
        static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
    const TU upper = static_cast<TU>(sign << sign_shift);
    return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
  } else {  // T is unsigned
    return Vec1<T>(static_cast<T>(v.raw >> bits));
  }
#endif
}

// ------------------------------ Shl

// Single-lane => same as ShiftLeftSame except for the argument type.
template <typename T>
HWY_API Vec1<T> operator<<(const Vec1<T> v, const Vec1<T> bits) {
  return ShiftLeftSame(v, static_cast<int>(bits.raw));
}

template <typename T>
HWY_API Vec1<T> operator>>(const Vec1<T> v, const Vec1<T> bits) {
  return ShiftRightSame(v, static_cast<int>(bits.raw));
}
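// Worked example (illustrative) of the pre-C++20 branch above: ShiftRight<2>
// on int8_t(-128), whose unsigned bits are 0x80:
//   shifted = 0x80 >> 2 = 0x20; sign = 0xFF (from BroadcastSignBit);
//   sign_shift = 8 - 1 - 2 = 5; upper = (0xFF << 5) & 0xFF = 0xE0;
//   shifted | upper = 0xE0 = -32, i.e. a sign-extending arithmetic shift.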
// ================================================== ARITHMETIC

template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec1<T> operator+(Vec1<T> a, Vec1<T> b) {
  const uint64_t a64 = static_cast<uint64_t>(a.raw);
  const uint64_t b64 = static_cast<uint64_t>(b.raw);
  return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
}
HWY_API Vec1<float> operator+(const Vec1<float> a, const Vec1<float> b) {
  return Vec1<float>(a.raw + b.raw);
}
HWY_API Vec1<double> operator+(const Vec1<double> a, const Vec1<double> b) {
  return Vec1<double>(a.raw + b.raw);
}

template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec1<T> operator-(Vec1<T> a, Vec1<T> b) {
  const uint64_t a64 = static_cast<uint64_t>(a.raw);
  const uint64_t b64 = static_cast<uint64_t>(b.raw);
  return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
}
HWY_API Vec1<float> operator-(const Vec1<float> a, const Vec1<float> b) {
  return Vec1<float>(a.raw - b.raw);
}
HWY_API Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) {
  return Vec1<double>(a.raw - b.raw);
}

// ------------------------------ SumsOf8

HWY_API Vec1<uint64_t> SumsOf8(const Vec1<uint8_t> v) {
  return Vec1<uint64_t>(v.raw);
}

// ------------------------------ SaturatedAdd

// Returns a + b clamped to the destination range.

// Unsigned
HWY_API Vec1<uint8_t> SaturatedAdd(const Vec1<uint8_t> a,
                                   const Vec1<uint8_t> b) {
  return Vec1<uint8_t>(
      static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)));
}
HWY_API Vec1<uint16_t> SaturatedAdd(const Vec1<uint16_t> a,
                                    const Vec1<uint16_t> b) {
  return Vec1<uint16_t>(
      static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535)));
}

// Signed
HWY_API Vec1<int8_t> SaturatedAdd(const Vec1<int8_t> a, const Vec1<int8_t> b) {
  return Vec1<int8_t>(
      static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)));
}
HWY_API Vec1<int16_t> SaturatedAdd(const Vec1<int16_t> a,
                                   const Vec1<int16_t> b) {
  return Vec1<int16_t>(
      static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767)));
}

// ------------------------------ Saturating subtraction

// Returns a - b clamped to the destination range.

// Unsigned
HWY_API Vec1<uint8_t> SaturatedSub(const Vec1<uint8_t> a,
                                   const Vec1<uint8_t> b) {
  return Vec1<uint8_t>(
      static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)));
}
HWY_API Vec1<uint16_t> SaturatedSub(const Vec1<uint16_t> a,
                                    const Vec1<uint16_t> b) {
  return Vec1<uint16_t>(
      static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535)));
}

// Signed
HWY_API Vec1<int8_t> SaturatedSub(const Vec1<int8_t> a, const Vec1<int8_t> b) {
  return Vec1<int8_t>(
      static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)));
}
HWY_API Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a,
                                   const Vec1<int16_t> b) {
  return Vec1<int16_t>(
      static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767)));
}

// ------------------------------ Average

// Returns (a + b + 1) / 2

HWY_API Vec1<uint8_t> AverageRound(const Vec1<uint8_t> a,
                                   const Vec1<uint8_t> b) {
  return Vec1<uint8_t>(static_cast<uint8_t>((a.raw + b.raw + 1) / 2));
}
HWY_API Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a,
                                    const Vec1<uint16_t> b) {
  return Vec1<uint16_t>(static_cast<uint16_t>((a.raw + b.raw + 1) / 2));
}

// ------------------------------ Absolute value

template <typename T>
HWY_API Vec1<T> Abs(const Vec1<T> a) {
  const T i = a.raw;
  if (i >= 0 || i == hwy::LimitsMin<T>()) return a;
  return Vec1<T>(static_cast<T>(-i & T{-1}));
}
HWY_API Vec1<float> Abs(Vec1<float> a) {
  int32_t i;
  CopyBytes<sizeof(i)>(&a.raw, &i);
  i &= 0x7FFFFFFF;
  CopyBytes<sizeof(i)>(&i, &a.raw);
  return a;
}
HWY_API Vec1<double> Abs(Vec1<double> a) {
  int64_t i;
  CopyBytes<sizeof(i)>(&a.raw, &i);
  i &= 0x7FFFFFFFFFFFFFFFL;
  CopyBytes<sizeof(i)>(&i, &a.raw);
  return a;
}
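// Worked examples (illustrative): the saturating ops clamp in the promoted
// (int) type before narrowing, e.g. for uint8_t lanes 200 and 100,
// SaturatedAdd computes HWY_MIN(HWY_MAX(0, 300), 255) = 255. AverageRound
// rounds up on ties: for lanes 1 and 2 it returns (1 + 2 + 1) / 2 = 2.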
// ------------------------------ Min/Max

// <cmath> may be unavailable, so implement our own.
namespace detail {

static inline float Abs(float f) {
  uint32_t i;
  CopyBytes<4>(&f, &i);
  i &= 0x7FFFFFFFu;
  CopyBytes<4>(&i, &f);
  return f;
}
static inline double Abs(double f) {
  uint64_t i;
  CopyBytes<8>(&f, &i);
  i &= 0x7FFFFFFFFFFFFFFFull;
  CopyBytes<8>(&i, &f);
  return f;
}

static inline bool SignBit(float f) {
  uint32_t i;
  CopyBytes<4>(&f, &i);
  return (i >> 31) != 0;
}
static inline bool SignBit(double f) {
  uint64_t i;
  CopyBytes<8>(&f, &i);
  return (i >> 63) != 0;
}

}  // namespace detail

template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(HWY_MIN(a.raw, b.raw));
}

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
  if (isnan(a.raw)) return b;
  if (isnan(b.raw)) return a;
  return Vec1<T>(HWY_MIN(a.raw, b.raw));
}

template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(HWY_MAX(a.raw, b.raw));
}

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
  if (isnan(a.raw)) return b;
  if (isnan(b.raw)) return a;
  return Vec1<T>(HWY_MAX(a.raw, b.raw));
}

// ------------------------------ Floating-point negate

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> Neg(const Vec1<T> v) {
  return Xor(v, SignBit(Sisd<T>()));
}

template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec1<T> Neg(const Vec1<T> v) {
  return Zero(Sisd<T>()) - v;
}

// ------------------------------ mul/div

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(static_cast<T>(double{a.raw} * b.raw));
}

template <typename T, HWY_IF_SIGNED(T)>
HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(static_cast<T>(static_cast<uint64_t>(a.raw) *
                                static_cast<uint64_t>(b.raw)));
}

template <typename T, HWY_IF_UNSIGNED(T)>
HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(static_cast<T>(static_cast<uint64_t>(a.raw) *
                                static_cast<uint64_t>(b.raw)));
}

template <typename T>
HWY_API Vec1<T> operator/(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(a.raw / b.raw);
}

// Returns the upper 16 bits of a * b in each lane.
HWY_API Vec1<int16_t> MulHigh(const Vec1<int16_t> a, const Vec1<int16_t> b) {
  return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw) >> 16));
}
HWY_API Vec1<uint16_t> MulHigh(const Vec1<uint16_t> a,
                               const Vec1<uint16_t> b) {
  // Cast to uint32_t first to prevent overflow. Otherwise the result of
  // uint16_t * uint16_t is in "int" which may overflow. In practice the result
  // is the same but this way it is also defined.
  return Vec1<uint16_t>(static_cast<uint16_t>(
      (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
}

HWY_API Vec1<int16_t> MulFixedPoint15(Vec1<int16_t> a, Vec1<int16_t> b) {
  return Vec1<int16_t>(
      static_cast<int16_t>((2 * a.raw * b.raw + 32768) >> 16));
}

// Multiplies even lanes (0, 2 ..) and returns the double-wide result.
HWY_API Vec1<int64_t> MulEven(const Vec1<int32_t> a, const Vec1<int32_t> b) {
  const int64_t a64 = a.raw;
  return Vec1<int64_t>(a64 * b.raw);
}
HWY_API Vec1<uint64_t> MulEven(const Vec1<uint32_t> a,
                               const Vec1<uint32_t> b) {
  const uint64_t a64 = a.raw;
  return Vec1<uint64_t>(a64 * b.raw);
}

// Approximate reciprocal
HWY_API Vec1<float> ApproximateReciprocal(const Vec1<float> v) {
  // Zero inputs are allowed, but callers are responsible for replacing the
  // return value with something else (typically using IfThenElse). This check
  // avoids a ubsan error. The return value is arbitrary.
  if (v.raw == 0.0f) return Vec1<float>(0.0f);
  return Vec1<float>(1.0f / v.raw);
}
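// Worked example (illustrative): MulFixedPoint15 treats int16_t as Q1.15
// fixed point. For a = b = 16384 (i.e. 0.5):
//   (2 * 16384 * 16384 + 32768) >> 16 = (0x20000000 + 0x8000) >> 16 = 8192,
// which is 0.25 in Q1.15, as expected for 0.5 * 0.5.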
// Absolute value of difference.
HWY_API Vec1<float> AbsDiff(const Vec1<float> a, const Vec1<float> b) {
  return Abs(a - b);
}

// ------------------------------ Floating-point multiply-add variants

template <typename T>
HWY_API Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> add) {
  return mul * x + add;
}

template <typename T>
HWY_API Vec1<T> NegMulAdd(const Vec1<T> mul, const Vec1<T> x,
                          const Vec1<T> add) {
  return add - mul * x;
}

template <typename T>
HWY_API Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> sub) {
  return mul * x - sub;
}

template <typename T>
HWY_API Vec1<T> NegMulSub(const Vec1<T> mul, const Vec1<T> x,
                          const Vec1<T> sub) {
  return Neg(mul) * x - sub;
}

// ------------------------------ Floating-point square root

// Approximate reciprocal square root
HWY_API Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) {
  float f = v.raw;
  const float half = f * 0.5f;
  uint32_t bits;
  CopySameSize(&f, &bits);
  // Initial guess based on log2(f)
  bits = 0x5F3759DF - (bits >> 1);
  CopySameSize(&bits, &f);
  // One Newton-Raphson iteration
  return Vec1<float>(f * (1.5f - (half * f * f)));
}

// Square root
HWY_API Vec1<float> Sqrt(const Vec1<float> v) {
#if HWY_COMPILER_GCC && defined(HWY_NO_LIBCXX)
  return Vec1<float>(__builtin_sqrt(v.raw));
#else
  return Vec1<float>(sqrtf(v.raw));
#endif
}
HWY_API Vec1<double> Sqrt(const Vec1<double> v) {
#if HWY_COMPILER_GCC && defined(HWY_NO_LIBCXX)
  return Vec1<double>(__builtin_sqrt(v.raw));
#else
  return Vec1<double>(sqrt(v.raw));
#endif
}

// ------------------------------ Floating-point rounding

template <typename T>
HWY_API Vec1<T> Round(const Vec1<T> v) {
  using TI = MakeSigned<T>;
  if (!(Abs(v).raw < MantissaEnd<T>())) {  // Huge or NaN
    return v;
  }
  const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
  const TI rounded = static_cast<TI>(v.raw + bias);
  if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
  // Round to even
  if ((rounded & 1) &&
      detail::Abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
    return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
  }
  return Vec1<T>(static_cast<T>(rounded));
}

// Round-to-nearest even.
HWY_API Vec1<int32_t> NearestInt(const Vec1<float> v) {
  using T = float;
  using TI = int32_t;

  const T abs = Abs(v).raw;
  const bool is_sign = detail::SignBit(v.raw);

  if (!(abs < MantissaEnd<T>())) {  // Huge or NaN
    // Check if too large to cast or NaN
    if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
      return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>());
    }
    return Vec1<TI>(static_cast<TI>(v.raw));
  }
  const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
  const TI rounded = static_cast<TI>(v.raw + bias);
  if (rounded == 0) return Vec1<TI>(0);
  // Round to even
  if ((rounded & 1) &&
      detail::Abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
    return Vec1<TI>(rounded - (is_sign ? -1 : 1));
  }
  return Vec1<TI>(rounded);
}

template <typename T>
HWY_API Vec1<T> Trunc(const Vec1<T> v) {
  using TI = MakeSigned<T>;
  if (!(Abs(v).raw <= MantissaEnd<T>())) {  // Huge or NaN
    return v;
  }
  const TI truncated = static_cast<TI>(v.raw);
  if (truncated == 0) return CopySignToAbs(Vec1<T>(0), v);
  return Vec1<T>(static_cast<T>(truncated));
}
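// Worked example (illustrative; values approximate): for v = 4.0f, the bit
// trick in ApproximateReciprocalSqrt yields an initial guess of about
// 0.4831f, and the single Newton-Raphson step refines it to roughly 0.4992f,
// versus the exact 1/sqrt(4) = 0.5 (relative error well below 1%).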
template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
          class V>
V Ceiling(const V v) {
  const Bits kExponentMask = (1ull << kExponentBits) - 1;
  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
  const Bits kBias = kExponentMask / 2;

  Float f = v.raw;
  const bool positive = f > Float(0.0);

  Bits bits;
  CopySameSize(&v, &bits);

  const int exponent =
      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
  // Already an integer.
  if (exponent >= kMantissaBits) return v;
  // |v| <= 1 => 0 or 1.
  if (exponent < 0) return positive ? V(1) : V(-0.0);

  const Bits mantissa_mask = kMantissaMask >> exponent;
  // Already an integer
  if ((bits & mantissa_mask) == 0) return v;

  // Clear fractional bits and round up
  if (positive) bits += (kMantissaMask + 1) >> exponent;
  bits &= ~mantissa_mask;

  CopySameSize(&bits, &f);
  return V(f);
}

template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
          class V>
V Floor(const V v) {
  const Bits kExponentMask = (1ull << kExponentBits) - 1;
  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
  const Bits kBias = kExponentMask / 2;

  Float f = v.raw;
  const bool negative = f < Float(0.0);

  Bits bits;
  CopySameSize(&v, &bits);

  const int exponent =
      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
  // Already an integer.
  if (exponent >= kMantissaBits) return v;
  // |v| <= 1 => -1 or 0.
  if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0));

  const Bits mantissa_mask = kMantissaMask >> exponent;
  // Already an integer
  if ((bits & mantissa_mask) == 0) return v;

  // Clear fractional bits and round down
  if (negative) bits += (kMantissaMask + 1) >> exponent;
  bits &= ~mantissa_mask;

  CopySameSize(&bits, &f);
  return V(f);
}

// Toward +infinity, aka ceiling
HWY_API Vec1<float> Ceil(const Vec1<float> v) {
  return Ceiling<float, uint32_t, 23, 8>(v);
}
HWY_API Vec1<double> Ceil(const Vec1<double> v) {
  return Ceiling<double, uint64_t, 52, 11>(v);
}

// Toward -infinity, aka floor
HWY_API Vec1<float> Floor(const Vec1<float> v) {
  return Floor<float, uint32_t, 23, 8>(v);
}
HWY_API Vec1<double> Floor(const Vec1<double> v) {
  return Floor<double, uint64_t, 52, 11>(v);
}

// ================================================== COMPARE

template <typename T>
HWY_API Mask1<T> operator==(const Vec1<T> a, const Vec1<T> b) {
  return Mask1<T>::FromBool(a.raw == b.raw);
}

template <typename T>
HWY_API Mask1<T> operator!=(const Vec1<T> a, const Vec1<T> b) {
  return Mask1<T>::FromBool(a.raw != b.raw);
}

template <typename T>
HWY_API Mask1<T> TestBit(const Vec1<T> v, const Vec1<T> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

template <typename T>
HWY_API Mask1<T> operator<(const Vec1<T> a, const Vec1<T> b) {
  return Mask1<T>::FromBool(a.raw < b.raw);
}
template <typename T>
HWY_API Mask1<T> operator>(const Vec1<T> a, const Vec1<T> b) {
  return Mask1<T>::FromBool(a.raw > b.raw);
}

template <typename T>
HWY_API Mask1<T> operator<=(const Vec1<T> a, const Vec1<T> b) {
  return Mask1<T>::FromBool(a.raw <= b.raw);
}
template <typename T>
HWY_API Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) {
  return Mask1<T>::FromBool(a.raw >= b.raw);
}

// ------------------------------ Floating-point classification (==)

template <typename T>
HWY_API Mask1<T> IsNaN(const Vec1<T> v) {
  // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
  MakeUnsigned<T> bits;
  CopySameSize(&v, &bits);
  bits += bits;
  bits >>= 1;  // clear sign bit
  // NaN if all exponent bits are set and the mantissa is not zero.
  return Mask1<T>::FromBool(bits > ExponentMask<T>());
}

HWY_API Mask1<float> IsInf(const Vec1<float> v) {
  const Sisd<float> d;
  const RebindToUnsigned<decltype(d)> du;
  const Vec1<uint32_t> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, (vu + vu) == Set(du, 0xFF000000u));
}
HWY_API Mask1<double> IsInf(const Vec1<double> v) {
  const Sisd<double> d;
  const RebindToUnsigned<decltype(d)> du;
  const Vec1<uint64_t> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull));
}

HWY_API Mask1<float> IsFinite(const Vec1<float> v) {
  const Vec1<uint32_t> vu = BitCast(Sisd<uint32_t>(), v);
  // Shift left to clear the sign bit, check whether exponent != max value.
  return Mask1<float>::FromBool((vu.raw << 1) < 0xFF000000u);
}
HWY_API Mask1<double> IsFinite(const Vec1<double> v) {
  const Vec1<uint64_t> vu = BitCast(Sisd<uint64_t>(), v);
  // Shift left to clear the sign bit, check whether exponent != max value.
  return Mask1<double>::FromBool((vu.raw << 1) < 0xFFE0000000000000ull);
}
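// Worked example (illustrative) for the bit-level Ceiling above, on 2.5f:
//   bits = 0x40200000, unbiased exponent = 1, mantissa_mask = 0x3FFFFF.
//   The fraction is nonzero and the input positive, so bits +=
//   0x800000 >> 1, then bits &= ~0x3FFFFF, giving 0x40400000 = 3.0f.
//   Floor differs only in rounding negative inputs downward.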
// ================================================== MEMORY

// ------------------------------ Load

template <typename T>
HWY_API Vec1<T> Load(Sisd<T> /* tag */, const T* HWY_RESTRICT aligned) {
  T t;
  CopySameSize(aligned, &t);
  return Vec1<T>(t);
}

template <typename T>
HWY_API Vec1<T> MaskedLoad(Mask1<T> m, Sisd<T> d,
                           const T* HWY_RESTRICT aligned) {
  return IfThenElseZero(m, Load(d, aligned));
}

template <typename T>
HWY_API Vec1<T> LoadU(Sisd<T> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// In some use cases, "load single lane" is sufficient; otherwise avoid this.
template <typename T>
HWY_API Vec1<T> LoadDup128(Sisd<T> d, const T* HWY_RESTRICT aligned) {
  return Load(d, aligned);
}

// ------------------------------ Store

template <typename T>
HWY_API void Store(const Vec1<T> v, Sisd<T> /* tag */,
                   T* HWY_RESTRICT aligned) {
  CopySameSize(&v.raw, aligned);
}

template <typename T>
HWY_API void StoreU(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT p) {
  return Store(v, d, p);
}

template <typename T>
HWY_API void BlendedStore(const Vec1<T> v, Mask1<T> m, Sisd<T> d,
                          T* HWY_RESTRICT p) {
  if (!m.bits) return;
  StoreU(v, d, p);
}

// ------------------------------ LoadInterleaved2/3/4

// Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

template <typename T>
HWY_API void LoadInterleaved2(Sisd<T> d, const T* HWY_RESTRICT unaligned,
                              Vec1<T>& v0, Vec1<T>& v1) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
}

template <typename T>
HWY_API void LoadInterleaved3(Sisd<T> d, const T* HWY_RESTRICT unaligned,
                              Vec1<T>& v0, Vec1<T>& v1, Vec1<T>& v2) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
}

template <typename T>
HWY_API void LoadInterleaved4(Sisd<T> d, const T* HWY_RESTRICT unaligned,
                              Vec1<T>& v0, Vec1<T>& v1, Vec1<T>& v2,
                              Vec1<T>& v3) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
  v3 = LoadU(d, unaligned + 3);
}

// ------------------------------ StoreInterleaved2/3/4

template <typename T>
HWY_API void StoreInterleaved2(const Vec1<T> v0, const Vec1<T> v1, Sisd<T> d,
                               T* HWY_RESTRICT unaligned) {
  StoreU(v0, d, unaligned + 0);
  StoreU(v1, d, unaligned + 1);
}

template <typename T>
HWY_API void StoreInterleaved3(const Vec1<T> v0, const Vec1<T> v1,
                               const Vec1<T> v2, Sisd<T> d,
                               T* HWY_RESTRICT unaligned) {
  StoreU(v0, d, unaligned + 0);
  StoreU(v1, d, unaligned + 1);
  StoreU(v2, d, unaligned + 2);
}

template <typename T>
HWY_API void StoreInterleaved4(const Vec1<T> v0, const Vec1<T> v1,
                               const Vec1<T> v2, const Vec1<T> v3, Sisd<T> d,
                               T* HWY_RESTRICT unaligned) {
  StoreU(v0, d, unaligned + 0);
  StoreU(v1, d, unaligned + 1);
  StoreU(v2, d, unaligned + 2);
  StoreU(v3, d, unaligned + 3);
}

// ------------------------------ Stream

template <typename T>
HWY_API void Stream(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT aligned) {
  return Store(v, d, aligned);
}

// ------------------------------ Scatter

template <typename T, typename Offset>
HWY_API void ScatterOffset(Vec1<T> v, Sisd<T> d, T* base,
                           const Vec1<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw;
  return Store(v, d, reinterpret_cast<T*>(base8));
}

template <typename T, typename Index>
HWY_API void ScatterIndex(Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT base,
                          const Vec1<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  return Store(v, d, base + index.raw);
}

// ------------------------------ Gather

template <typename T, typename Offset>
HWY_API Vec1<T> GatherOffset(Sisd<T> d, const T* base,
                             const Vec1<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  const intptr_t addr =
      reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
  return Load(d, reinterpret_cast<const T*>(addr));
}
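// Illustrative sketch (not additional API): *Offset takes byte offsets,
// *Index takes lane indices. Assuming int32_t data[4] and Sisd<int32_t> d:
//   GatherOffset(d, data, Vec1<int32_t>(8));  // loads data[2] (8 bytes in)
//   GatherIndex(d, data, Vec1<int32_t>(2));   // also loads data[2]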
template <typename T, typename Index>
HWY_API Vec1<T> GatherIndex(Sisd<T> d, const T* HWY_RESTRICT base,
                            const Vec1<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  return Load(d, base + index.raw);
}

// ================================================== CONVERT

// ConvertTo and DemoteTo with floating-point input and integer output truncate
// (rounding toward zero).

template <typename FromT, typename ToT>
HWY_API Vec1<ToT> PromoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
  static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting");
  // For bits Y > X, floatX->floatY and intX->intY are always representable.
  return Vec1<ToT>(static_cast<ToT>(from.raw));
}

// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here,
// so we overload for FromT=double and ToT={float,int32_t}.
HWY_API Vec1<float> DemoteTo(Sisd<float> /* tag */, Vec1<double> from) {
  // Prevent ubsan errors when converting double to narrower float
  if (IsInf(from).bits ||
      Abs(from).raw > static_cast<double>(HighestValue<float>())) {
    return Vec1<float>(detail::SignBit(from.raw) ? LowestValue<float>()
                                                 : HighestValue<float>());
  }
  return Vec1<float>(static_cast<float>(from.raw));
}
HWY_API Vec1<int32_t> DemoteTo(Sisd<int32_t> /* tag */, Vec1<double> from) {
  // Prevent ubsan errors when converting double to narrower integer
  if (IsInf(from).bits ||
      Abs(from).raw > static_cast<double>(HighestValue<int32_t>())) {
    return Vec1<int32_t>(detail::SignBit(from.raw) ? LowestValue<int32_t>()
                                                   : HighestValue<int32_t>());
  }
  return Vec1<int32_t>(static_cast<int32_t>(from.raw));
}

template <typename FromT, typename ToT>
HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
  static_assert(!IsFloat<FromT>(), "FromT=double are handled above");
  static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");

  // Int to int: choose closest value in ToT to `from` (avoids UB)
  from.raw = HWY_MIN(HWY_MAX(LimitsMin<ToT>(), from.raw), LimitsMax<ToT>());
  return Vec1<ToT>(static_cast<ToT>(from.raw));
}

HWY_API Vec1<float> PromoteTo(Sisd<float> /* tag */, const Vec1<float16_t> v) {
  uint16_t bits16;
  CopySameSize(&v.raw, &bits16);
  const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  const uint32_t mantissa = bits16 & 0x3FF;

  // Subnormal or zero
  if (biased_exp == 0) {
    const float subnormal =
        (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
    return Vec1<float>(sign ? -subnormal : subnormal);
  }

  // Normalized: convert the representation directly (faster than
  // ldexp/tables).
  const uint32_t biased_exp32 = biased_exp + (127 - 15);
  const uint32_t mantissa32 = mantissa << (23 - 10);
  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
  float out;
  CopySameSize(&bits32, &out);
  return Vec1<float>(out);
}

HWY_API Vec1<float> PromoteTo(Sisd<float> d, const Vec1<bfloat16_t> v) {
  return Set(d, F32FromBF16(v.raw));
}
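// Worked example (illustrative): promoting the float16 value 1.5, whose
// bits16 are 0x3E00 (sign 0, biased_exp 15, mantissa 0x200):
//   biased_exp32 = 15 + (127 - 15) = 127; mantissa32 = 0x200 << 13 = 0x400000;
//   bits32 = 0x3FC00000 = 1.5f. Re-biasing the exponent and left-aligning the
//   mantissa is exact for every normal half-precision value.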
HWY_API Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
                                 const Vec1<float> v) {
  uint32_t bits32;
  CopySameSize(&v.raw, &bits32);
  const uint32_t sign = bits32 >> 31;
  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
  const uint32_t mantissa32 = bits32 & 0x7FFFFF;

  const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);

  // Tiny or zero => zero.
  Vec1<float16_t> out;
  if (exp < -24) {
    const uint16_t zero = 0;
    CopySameSize(&zero, &out.raw);
    return out;
  }

  uint32_t biased_exp16, mantissa16;

  // exp = [-24, -15] => subnormal
  if (exp < -14) {
    biased_exp16 = 0;
    const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
    HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
    mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
                                       (mantissa32 >> (13 + sub_exp)));
  } else {  // exp = [-14, 15]
    biased_exp16 = static_cast<uint32_t>(exp + 15);
    HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
    mantissa16 = mantissa32 >> 13;
  }

  HWY_DASSERT(mantissa16 < 1024);
  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
  HWY_DASSERT(bits16 < 0x10000);
  const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
  CopySameSize(&narrowed, &out.raw);
  return out;
}

HWY_API Vec1<bfloat16_t> DemoteTo(Sisd<bfloat16_t> d, const Vec1<float> v) {
  return Set(d, BF16FromF32(v.raw));
}

template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
HWY_API Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
  // float## -> int##: return closest representable value. We cannot exactly
  // represent LimitsMax<ToT> in FromT, so use double.
  const double f = static_cast<double>(from.raw);
  if (IsInf(from).bits ||
      Abs(Vec1<double>(f)).raw > static_cast<double>(LimitsMax<ToT>())) {
    return Vec1<ToT>(detail::SignBit(from.raw) ? LimitsMin<ToT>()
                                               : LimitsMax<ToT>());
  }
  return Vec1<ToT>(static_cast<ToT>(from.raw));
}

template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
HWY_API Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
  // int## -> float##: no check needed
  return Vec1<ToT>(static_cast<ToT>(from.raw));
}

HWY_API Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) {
  return DemoteTo(Sisd<uint8_t>(), v);
}

// ------------------------------ Truncations

HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
                                 const Vec1<uint64_t> v) {
  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
}

HWY_API Vec1<uint16_t> TruncateTo(Sisd<uint16_t> /* tag */,
                                  const Vec1<uint64_t> v) {
  return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
}

HWY_API Vec1<uint32_t> TruncateTo(Sisd<uint32_t> /* tag */,
                                  const Vec1<uint64_t> v) {
  return Vec1<uint32_t>{static_cast<uint32_t>(v.raw & 0xFFFFFFFFu)};
}

HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
                                 const Vec1<uint32_t> v) {
  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
}

HWY_API Vec1<uint16_t> TruncateTo(Sisd<uint16_t> /* tag */,
                                  const Vec1<uint32_t> v) {
  return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
}

HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
                                 const Vec1<uint16_t> v) {
  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
}

// ================================================== COMBINE
// UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported.

template <typename T>
HWY_API Vec1<T> LowerHalf(Vec1<T> v) {
  return v;
}

template <typename T>
HWY_API Vec1<T> LowerHalf(Sisd<T> /* tag */, Vec1<T> v) {
  return v;
}

// ================================================== SWIZZLE

template <typename T>
HWY_API T GetLane(const Vec1<T> v) {
  return v.raw;
}

template <typename T>
HWY_API T ExtractLane(const Vec1<T> v, size_t i) {
  HWY_DASSERT(i == 0);
  (void)i;
  return v.raw;
}

template <typename T>
HWY_API Vec1<T> InsertLane(Vec1<T> v, size_t i, T t) {
  HWY_DASSERT(i == 0);
  (void)i;
  v.raw = t;
  return v;
}

template <typename T>
HWY_API Vec1<T> DupEven(Vec1<T> v) {
  return v;
}
// DupOdd is unsupported.

template <typename T>
HWY_API Vec1<T> OddEven(Vec1<T> /* odd */, Vec1<T> even) {
  return even;
}

template <typename T>
HWY_API Vec1<T> OddEvenBlocks(Vec1<T> /* odd */, Vec1<T> even) {
  return even;
}

// ------------------------------ SwapAdjacentBlocks

template <typename T>
HWY_API Vec1<T> SwapAdjacentBlocks(Vec1<T> v) {
  return v;
}
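// Worked example (illustrative): ConvertTo(float -> int32_t) clamps instead
// of invoking UB. LimitsMax<int32_t>() is not exactly representable as float
// (the nearest float is 2^31), hence the comparison in double above:
//   ConvertTo(Sisd<int32_t>(), Vec1<float>(3e9f));   // 2147483647
//   ConvertTo(Sisd<int32_t>(), Vec1<float>(-3e9f));  // -2147483648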
// ------------------------------ TableLookupLanes

// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T>
struct Indices1 {
  MakeSigned<T> raw;
};

template <typename T, typename TI>
HWY_API Indices1<T> IndicesFromVec(Sisd<T>, Vec1<TI> vec) {
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
  HWY_DASSERT(vec.raw == 0);
  return Indices1<T>{vec.raw};
}

template <typename T, typename TI>
HWY_API Indices1<T> SetTableIndices(Sisd<T> d, const TI* idx) {
  return IndicesFromVec(d, LoadU(Sisd<TI>(), idx));
}

template <typename T>
HWY_API Vec1<T> TableLookupLanes(const Vec1<T> v, const Indices1<T> /* idx */) {
  return v;
}

// ------------------------------ ReverseBlocks

// Single block: no change
template <typename T>
HWY_API Vec1<T> ReverseBlocks(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

// ------------------------------ Reverse

template <typename T>
HWY_API Vec1<T> Reverse(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

// Must not be called:
template <typename T>
HWY_API Vec1<T> Reverse2(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

template <typename T>
HWY_API Vec1<T> Reverse4(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

template <typename T>
HWY_API Vec1<T> Reverse8(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

// ================================================== BLOCKWISE
// Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.

// ------------------------------ Broadcast/splat any lane

template <int kLane, typename T>
HWY_API Vec1<T> Broadcast(const Vec1<T> v) {
  static_assert(kLane == 0, "Scalar only has one lane");
  return v;
}

// ------------------------------ TableLookupBytes, TableLookupBytesOr0

template <typename T, typename TI>
HWY_API Vec1<TI> TableLookupBytes(const Vec1<T> in, const Vec1<TI> indices) {
  uint8_t in_bytes[sizeof(T)];
  uint8_t idx_bytes[sizeof(T)];
  uint8_t out_bytes[sizeof(T)];
  CopyBytes<sizeof(T)>(&in, &in_bytes);  // copy to bytes
  CopyBytes<sizeof(T)>(&indices, &idx_bytes);
  for (size_t i = 0; i < sizeof(T); ++i) {
    out_bytes[i] = in_bytes[idx_bytes[i]];
  }
  TI out;
  CopyBytes<sizeof(TI)>(&out_bytes, &out);
  return Vec1<TI>{out};
}

template <typename T, typename TI>
HWY_API Vec1<TI> TableLookupBytesOr0(const Vec1<T> in, const Vec1<TI> indices) {
  uint8_t in_bytes[sizeof(T)];
  uint8_t idx_bytes[sizeof(T)];
  uint8_t out_bytes[sizeof(T)];
  CopyBytes<sizeof(T)>(&in, &in_bytes);  // copy to bytes
  CopyBytes<sizeof(T)>(&indices, &idx_bytes);
  for (size_t i = 0; i < sizeof(T); ++i) {
    out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]];
  }
  TI out;
  CopyBytes<sizeof(TI)>(&out_bytes, &out);
  return Vec1<TI>{out};
}

// ------------------------------ ZipLower

HWY_API Vec1<uint16_t> ZipLower(const Vec1<uint8_t> a, const Vec1<uint8_t> b) {
  return Vec1<uint16_t>(static_cast<uint16_t>((uint32_t{b.raw} << 8) + a.raw));
}
HWY_API Vec1<uint32_t> ZipLower(const Vec1<uint16_t> a,
                                const Vec1<uint16_t> b) {
  return Vec1<uint32_t>((uint32_t{b.raw} << 16) + a.raw);
}
HWY_API Vec1<uint64_t> ZipLower(const Vec1<uint32_t> a,
                                const Vec1<uint32_t> b) {
  return Vec1<uint64_t>((uint64_t{b.raw} << 32) + a.raw);
}
HWY_API Vec1<int16_t> ZipLower(const Vec1<int8_t> a, const Vec1<int8_t> b) {
  return Vec1<int16_t>(static_cast<int16_t>((int32_t{b.raw} << 8) + a.raw));
}
HWY_API Vec1<int32_t> ZipLower(const Vec1<int16_t> a, const Vec1<int16_t> b) {
  return Vec1<int32_t>((int32_t{b.raw} << 16) + a.raw);
}
HWY_API Vec1<int64_t> ZipLower(const Vec1<int32_t> a, const Vec1<int32_t> b) {
  return Vec1<int64_t>((int64_t{b.raw} << 32) + a.raw);
}

template <typename T, typename TW = MakeWide<T>, class VW = Vec1<TW>>
HWY_API VW ZipLower(Sisd<TW> /* tag */, Vec1<T> a, Vec1<T> b) {
  return VW(static_cast<TW>((TW{b.raw} << (sizeof(T) * 8)) + a.raw));
}

// ================================================== MASK

template <typename T>
HWY_API bool AllFalse(Sisd<T> /* tag */, const Mask1<T> mask) {
  return mask.bits == 0;
}

template <typename T>
HWY_API bool AllTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
  return mask.bits != 0;
}

// `p` points to at least 8 readable bytes, not all of which need be valid.
template <typename T>
HWY_API Mask1<T> LoadMaskBits(Sisd<T> /* tag */,
                              const uint8_t* HWY_RESTRICT bits) {
  return Mask1<T>::FromBool((bits[0] & 1) != 0);
}
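// Worked example (illustrative): with a single lane, ZipLower concatenates
// the two inputs into one double-wide lane, low half from `a`:
//   ZipLower(Vec1<uint8_t>(0x01), Vec1<uint8_t>(0x02)).raw == 0x0201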
// `p` points to at least 8 writable bytes.
template <typename T>
HWY_API size_t StoreMaskBits(Sisd<T> d, const Mask1<T> mask, uint8_t* bits) {
  *bits = AllTrue(d, mask);
  return 1;
}

template <typename T>
HWY_API size_t CountTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
  return mask.bits == 0 ? 0 : 1;
}

template <typename T>
HWY_API intptr_t FindFirstTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
  return mask.bits == 0 ? -1 : 0;
}

template <typename T>
HWY_API size_t FindKnownFirstTrue(Sisd<T> /* tag */, const Mask1<T> /* m */) {
  return 0;  // There is only one lane and we know it is true.
}

// ------------------------------ Compress, CompressBits

template <typename T>
struct CompressIsPartition {
  enum { value = 1 };
};

template <typename T>
HWY_API Vec1<T> Compress(Vec1<T> v, const Mask1<T> /* mask */) {
  // A single lane is already partitioned by definition.
  return v;
}

template <typename T>
HWY_API Vec1<T> CompressNot(Vec1<T> v, const Mask1<T> /* mask */) {
  // A single lane is already partitioned by definition.
  return v;
}

// ------------------------------ CompressStore

template <typename T>
HWY_API size_t CompressStore(Vec1<T> v, const Mask1<T> mask, Sisd<T> d,
                             T* HWY_RESTRICT unaligned) {
  StoreU(Compress(v, mask), d, unaligned);
  return CountTrue(d, mask);
}

// ------------------------------ CompressBlendedStore

template <typename T>
HWY_API size_t CompressBlendedStore(Vec1<T> v, const Mask1<T> mask, Sisd<T> d,
                                    T* HWY_RESTRICT unaligned) {
  if (!mask.bits) return 0;
  StoreU(v, d, unaligned);
  return 1;
}

// ------------------------------ CompressBits

template <typename T>
HWY_API Vec1<T> CompressBits(Vec1<T> v, const uint8_t* HWY_RESTRICT /*bits*/) {
  return v;
}

// ------------------------------ CompressBitsStore

template <typename T>
HWY_API size_t CompressBitsStore(Vec1<T> v, const uint8_t* HWY_RESTRICT bits,
                                 Sisd<T> d, T* HWY_RESTRICT unaligned) {
  const Mask1<T> mask = LoadMaskBits(d, bits);
  StoreU(Compress(v, mask), d, unaligned);
  return CountTrue(d, mask);
}

// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)

HWY_API Vec1<float> ReorderWidenMulAccumulate(Sisd<float> /* tag */,
                                              Vec1<bfloat16_t> a,
                                              Vec1<bfloat16_t> b,
                                              const Vec1<float> sum0,
                                              Vec1<float>& /* sum1 */) {
  return MulAdd(Vec1<float>(F32FromBF16(a.raw)),
                Vec1<float>(F32FromBF16(b.raw)), sum0);
}

HWY_API Vec1<int32_t> ReorderWidenMulAccumulate(Sisd<int32_t> /* tag */,
                                                Vec1<int16_t> a,
                                                Vec1<int16_t> b,
                                                const Vec1<int32_t> sum0,
                                                Vec1<int32_t>& /* sum1 */) {
  return Vec1<int32_t>(a.raw * b.raw + sum0.raw);
}

// ------------------------------ RearrangeToOddPlusEven

template <typename TW>
HWY_API Vec1<TW> RearrangeToOddPlusEven(const Vec1<TW> sum0,
                                        Vec1<TW> /* sum1 */) {
  return sum0;  // invariant already holds
}

// ================================================== REDUCTIONS

// Sum of all lanes, i.e. the only one.
template <typename T>
HWY_API Vec1<T> SumOfLanes(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}
template <typename T>
HWY_API Vec1<T> MinOfLanes(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}
template <typename T>
HWY_API Vec1<T> MaxOfLanes(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();