// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Single-element vectors and operations.
// External include guard in highway.h - see comment there.

#include <stddef.h>
#include <stdint.h>

#include <cmath>  // std::abs, std::isnan

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;

// (Wrapper class required for overloading comparison operators.)
template <typename T, size_t N = 16 / sizeof(T)>
struct Vec128 {
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  HWY_INLINE Vec128() = default;
  Vec128(const Vec128&) = default;
  Vec128& operator=(const Vec128&) = default;

  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  // Behave like wasm128 (vectors can always hold 128 bits). generic_ops-inl.h
  // relies on this for LoadInterleaved*. CAVEAT: this method of padding
  // prevents using range for, especially in SumOfLanes, where it would be
  // incorrect. Moving padding to another field would require handling the case
  // where N = 16 / sizeof(T) (i.e. there is no padding), which is also
  // awkward.
  T raw[16 / sizeof(T)] = {};
};
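// Illustrative sketch (comment only, not part of the API surface): the
// descriptor Simd<T, N, 0> selects the lane type and count, and the ops
// defined below operate lane-by-lane on `raw`. For example, assuming a
// 4-lane float descriptor named `d` (placeholder name):
//   const Simd<float, 4, 0> d;
//   Vec128<float, 4> v = Zero(d);  // all lanes (and padding bytes) are zero
//   v = Set(d, 1.5f);              // broadcasts 1.5f into the 4 lanes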
// 0 or FF..FF, same size as Vec128.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  using Raw = hwy::MakeUnsigned<T>;
  static HWY_INLINE Raw FromBool(bool b) {
    return b ? static_cast<Raw>(~Raw{0}) : 0;
  }

  // Must match the size of Vec128.
  Raw bits[16 / sizeof(T)] = {};
};

template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;

// ------------------------------ BitCast

template <typename T, size_t N, typename FromT>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> /* tag */,
                             Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
  Vec128<T, N> to;
  CopySameSize(&v, &to);
  return to;
}

// ------------------------------ Set

template <typename T, size_t N>
HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
  Vec128<T, N> v;
  ZeroBytes<sizeof(T) * N>(v.raw);
  return v;
}

template <class D>
using VFromD = decltype(Zero(D()));

template <typename T, size_t N, typename T2>
HWY_API Vec128<T, N> Set(Simd<T, N, 0> /* tag */, const T2 t) {
  Vec128<T, N> v;
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(t);
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
  return Zero(d);
}

template <typename T, size_t N, typename T2>
HWY_API Vec128<T, N> Iota(const Simd<T, N, 0> /* tag */, T2 first) {
  Vec128<T, N> v;
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] =
        AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i);
  }
  return v;
}

// ================================================== LOGICAL

// ------------------------------ Not

template <typename T, size_t N>
HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  VFromD<decltype(du)> vu = BitCast(du, v);
  for (size_t i = 0; i < N; ++i) {
    vu.raw[i] = static_cast<TU>(~vu.raw[i]);
  }
  return BitCast(d, vu);
}

// ------------------------------ And

template <typename T, size_t N>
HWY_API Vec128<T, N> And(const Vec128<T, N> a, const Vec128<T, N> b) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] &= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

// ------------------------------ AndNot

template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(Not(a), b);
}

// ------------------------------ Or

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(const Vec128<T, N> a, const Vec128<T, N> b) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] |= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

// ------------------------------ Xor

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(const Vec128<T, N> a, const Vec128<T, N> b) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] ^= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ Xor3

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  return Xor(x1, Xor(x2, x3));
}

// ------------------------------ Or3

template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd

template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(const Vec128<T, N> o, const Vec128<T, N> a1,
                           const Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse

template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Or(And(mask, yes), AndNot(mask, no));
}

// ------------------------------ CopySign

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto msb = SignBit(Simd<T, N, 0>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
                                   const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return Or(abs, And(SignBit(Simd<T, N, 0>()), sign));
}

// ------------------------------ BroadcastSignBit

template <typename T, size_t N>
HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
  // This is used inside ShiftRight,
so we cannot implement in terms of it. for (size_t i = 0; i < N; ++i) { v.raw[i] = v.raw[i] < 0 ? T(-1) : T(0); } return v; } // ------------------------------ Mask template HWY_API Mask128 RebindMask(Simd /*tag*/, Mask128 mask) { Mask128 to; CopySameSize(&mask, &to); return to; } // v must be 0 or FF..FF. template HWY_API Mask128 MaskFromVec(const Vec128 v) { Mask128 mask; CopySameSize(&v, &mask); return mask; } template Vec128 VecFromMask(const Mask128 mask) { Vec128 v; CopySameSize(&mask, &v); return v; } template Vec128 VecFromMask(Simd /* tag */, const Mask128 mask) { return VecFromMask(mask); } template HWY_API Mask128 FirstN(Simd /*tag*/, size_t n) { Mask128 m; for (size_t i = 0; i < N; ++i) { m.bits[i] = Mask128::FromBool(i < n); } return m; } // Returns mask ? yes : no. template HWY_API Vec128 IfThenElse(const Mask128 mask, const Vec128 yes, const Vec128 no) { return IfVecThenElse(VecFromMask(mask), yes, no); } template HWY_API Vec128 IfThenElseZero(const Mask128 mask, const Vec128 yes) { return IfVecThenElse(VecFromMask(mask), yes, Zero(Simd())); } template HWY_API Vec128 IfThenZeroElse(const Mask128 mask, const Vec128 no) { return IfVecThenElse(VecFromMask(mask), Zero(Simd()), no); } template HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, Vec128 no) { for (size_t i = 0; i < N; ++i) { v.raw[i] = v.raw[i] < 0 ? yes.raw[i] : no.raw[i]; } return v; } template HWY_API Vec128 ZeroIfNegative(const Vec128 v) { return IfNegativeThenElse(v, Zero(Simd()), v); } // ------------------------------ Mask logical template HWY_API Mask128 Not(const Mask128 m) { return MaskFromVec(Not(VecFromMask(Simd(), m))); } template HWY_API Mask128 And(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 Or(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 ExclusiveNeither(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); } // ================================================== SHIFTS // ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit) template HWY_API Vec128 ShiftLeft(Vec128 v) { static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); for (size_t i = 0; i < N; ++i) { const auto shifted = static_cast>(v.raw[i]) << kBits; v.raw[i] = static_cast(shifted); } return v; } template HWY_API Vec128 ShiftRight(Vec128 v) { static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); #if __cplusplus >= 202002L // Signed right shift is now guaranteed to be arithmetic (rounding toward // negative infinity, i.e. shifting in the sign bit). for (size_t i = 0; i < N; ++i) { v.raw[i] = static_cast(v.raw[i] >> kBits); } #else if (IsSigned()) { // Emulate arithmetic shift using only logical (unsigned) shifts, because // signed shifts are still implementation-defined. using TU = hwy::MakeUnsigned; for (size_t i = 0; i < N; ++i) { const TU shifted = static_cast(static_cast(v.raw[i]) >> kBits); const TU sign = v.raw[i] < 0 ? 
static_cast(~TU{0}) : 0; const size_t sign_shift = static_cast(static_cast(sizeof(TU)) * 8 - 1 - kBits); const TU upper = static_cast(sign << sign_shift); v.raw[i] = static_cast(shifted | upper); } } else { // T is unsigned for (size_t i = 0; i < N; ++i) { v.raw[i] = static_cast(v.raw[i] >> kBits); } } #endif return v; } // ------------------------------ RotateRight (ShiftRight) namespace detail { // For partial specialization: kBits == 0 results in an invalid shift count template struct RotateRight { template HWY_INLINE Vec128 operator()(const Vec128 v) const { return Or(ShiftRight(v), ShiftLeft(v)); } }; template <> struct RotateRight<0> { template HWY_INLINE Vec128 operator()(const Vec128 v) const { return v; } }; } // namespace detail template HWY_API Vec128 RotateRight(const Vec128 v) { static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); return detail::RotateRight()(v); } // ------------------------------ ShiftLeftSame template HWY_API Vec128 ShiftLeftSame(Vec128 v, int bits) { for (size_t i = 0; i < N; ++i) { const auto shifted = static_cast>(v.raw[i]) << bits; v.raw[i] = static_cast(shifted); } return v; } template HWY_API Vec128 ShiftRightSame(Vec128 v, int bits) { #if __cplusplus >= 202002L // Signed right shift is now guaranteed to be arithmetic (rounding toward // negative infinity, i.e. shifting in the sign bit). for (size_t i = 0; i < N; ++i) { v.raw[i] = static_cast(v.raw[i] >> bits); } #else if (IsSigned()) { // Emulate arithmetic shift using only logical (unsigned) shifts, because // signed shifts are still implementation-defined. using TU = hwy::MakeUnsigned; for (size_t i = 0; i < N; ++i) { const TU shifted = static_cast(static_cast(v.raw[i]) >> bits); const TU sign = v.raw[i] < 0 ? static_cast(~TU{0}) : 0; const size_t sign_shift = static_cast(static_cast(sizeof(TU)) * 8 - 1 - bits); const TU upper = static_cast(sign << sign_shift); v.raw[i] = static_cast(shifted | upper); } } else { for (size_t i = 0; i < N; ++i) { v.raw[i] = static_cast(v.raw[i] >> bits); // unsigned, logical shift } } #endif return v; } // ------------------------------ Shl template HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { for (size_t i = 0; i < N; ++i) { const auto shifted = static_cast>(v.raw[i]) << bits.raw[i]; v.raw[i] = static_cast(shifted); } return v; } template HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { #if __cplusplus >= 202002L // Signed right shift is now guaranteed to be arithmetic (rounding toward // negative infinity, i.e. shifting in the sign bit). for (size_t i = 0; i < N; ++i) { v.raw[i] = static_cast(v.raw[i] >> bits.raw[i]); } #else if (IsSigned()) { // Emulate arithmetic shift using only logical (unsigned) shifts, because // signed shifts are still implementation-defined. using TU = hwy::MakeUnsigned; for (size_t i = 0; i < N; ++i) { const TU shifted = static_cast(static_cast(v.raw[i]) >> bits.raw[i]); const TU sign = v.raw[i] < 0 ? 
static_cast(~TU{0}) : 0; const size_t sign_shift = static_cast( static_cast(sizeof(TU)) * 8 - 1 - bits.raw[i]); const TU upper = static_cast(sign << sign_shift); v.raw[i] = static_cast(shifted | upper); } } else { // T is unsigned for (size_t i = 0; i < N; ++i) { v.raw[i] = static_cast(v.raw[i] >> bits.raw[i]); } } #endif return v; } // ================================================== ARITHMETIC // Tag dispatch instead of SFINAE for MSVC 2017 compatibility namespace detail { template HWY_INLINE Vec128 Add(hwy::NonFloatTag /*tag*/, Vec128 a, Vec128 b) { for (size_t i = 0; i < N; ++i) { const uint64_t a64 = static_cast(a.raw[i]); const uint64_t b64 = static_cast(b.raw[i]); a.raw[i] = static_cast((a64 + b64) & static_cast(~T(0))); } return a; } template HWY_INLINE Vec128 Sub(hwy::NonFloatTag /*tag*/, Vec128 a, Vec128 b) { for (size_t i = 0; i < N; ++i) { const uint64_t a64 = static_cast(a.raw[i]); const uint64_t b64 = static_cast(b.raw[i]); a.raw[i] = static_cast((a64 - b64) & static_cast(~T(0))); } return a; } template HWY_INLINE Vec128 Add(hwy::FloatTag /*tag*/, Vec128 a, const Vec128 b) { for (size_t i = 0; i < N; ++i) { a.raw[i] += b.raw[i]; } return a; } template HWY_INLINE Vec128 Sub(hwy::FloatTag /*tag*/, Vec128 a, const Vec128 b) { for (size_t i = 0; i < N; ++i) { a.raw[i] -= b.raw[i]; } return a; } } // namespace detail template HWY_API Vec128 operator-(Vec128 a, const Vec128 b) { return detail::Sub(hwy::IsFloatTag(), a, b); } template HWY_API Vec128 operator+(Vec128 a, const Vec128 b) { return detail::Add(hwy::IsFloatTag(), a, b); } // ------------------------------ SumsOf8 template HWY_API Vec128 SumsOf8(const Vec128 v) { Vec128 sums; for (size_t i = 0; i < N; ++i) { sums.raw[i / 8] += v.raw[i]; } return sums; } // ------------------------------ SaturatedAdd template HWY_API Vec128 SaturatedAdd(Vec128 a, const Vec128 b) { for (size_t i = 0; i < N; ++i) { a.raw[i] = static_cast( HWY_MIN(HWY_MAX(hwy::LowestValue(), a.raw[i] + b.raw[i]), hwy::HighestValue())); } return a; } // ------------------------------ SaturatedSub template HWY_API Vec128 SaturatedSub(Vec128 a, const Vec128 b) { for (size_t i = 0; i < N; ++i) { a.raw[i] = static_cast( HWY_MIN(HWY_MAX(hwy::LowestValue(), a.raw[i] - b.raw[i]), hwy::HighestValue())); } return a; } // ------------------------------ AverageRound template HWY_API Vec128 AverageRound(Vec128 a, const Vec128 b) { static_assert(!IsSigned(), "Only for unsigned"); for (size_t i = 0; i < N; ++i) { a.raw[i] = static_cast((a.raw[i] + b.raw[i] + 1) / 2); } return a; } // ------------------------------ Abs // Tag dispatch instead of SFINAE for MSVC 2017 compatibility namespace detail { template HWY_INLINE Vec128 Abs(SignedTag /*tag*/, Vec128 a) { for (size_t i = 0; i < N; ++i) { const T s = a.raw[i]; const T min = hwy::LimitsMin(); a.raw[i] = static_cast((s >= 0 || s == min) ? 
a.raw[i] : -s); } return a; } template HWY_INLINE Vec128 Abs(hwy::FloatTag /*tag*/, Vec128 v) { for (size_t i = 0; i < N; ++i) { v.raw[i] = std::abs(v.raw[i]); } return v; } } // namespace detail template HWY_API Vec128 Abs(Vec128 a) { return detail::Abs(hwy::TypeTag(), a); } // ------------------------------ Min/Max // Tag dispatch instead of SFINAE for MSVC 2017 compatibility namespace detail { template HWY_INLINE Vec128 Min(hwy::NonFloatTag /*tag*/, Vec128 a, const Vec128 b) { for (size_t i = 0; i < N; ++i) { a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]); } return a; } template HWY_INLINE Vec128 Max(hwy::NonFloatTag /*tag*/, Vec128 a, const Vec128 b) { for (size_t i = 0; i < N; ++i) { a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]); } return a; } template HWY_INLINE Vec128 Min(hwy::FloatTag /*tag*/, Vec128 a, const Vec128 b) { for (size_t i = 0; i < N; ++i) { if (std::isnan(a.raw[i])) { a.raw[i] = b.raw[i]; } else if (std::isnan(b.raw[i])) { // no change } else { a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]); } } return a; } template HWY_INLINE Vec128 Max(hwy::FloatTag /*tag*/, Vec128 a, const Vec128 b) { for (size_t i = 0; i < N; ++i) { if (std::isnan(a.raw[i])) { a.raw[i] = b.raw[i]; } else if (std::isnan(b.raw[i])) { // no change } else { a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]); } } return a; } } // namespace detail template HWY_API Vec128 Min(Vec128 a, const Vec128 b) { return detail::Min(hwy::IsFloatTag(), a, b); } template HWY_API Vec128 Max(Vec128 a, const Vec128 b) { return detail::Max(hwy::IsFloatTag(), a, b); } // ------------------------------ Neg // Tag dispatch instead of SFINAE for MSVC 2017 compatibility namespace detail { template HWY_API Vec128 Neg(hwy::NonFloatTag /*tag*/, Vec128 v) { return Zero(Simd()) - v; } template HWY_API Vec128 Neg(hwy::FloatTag /*tag*/, Vec128 v) { return Xor(v, SignBit(Simd())); } } // namespace detail template HWY_API Vec128 Neg(Vec128 v) { return detail::Neg(hwy::IsFloatTag(), v); } // ------------------------------ Mul/Div // Tag dispatch instead of SFINAE for MSVC 2017 compatibility namespace detail { template HWY_INLINE Vec128 Mul(hwy::FloatTag /*tag*/, Vec128 a, const Vec128 b) { for (size_t i = 0; i < N; ++i) { a.raw[i] *= b.raw[i]; } return a; } template HWY_INLINE Vec128 Mul(SignedTag /*tag*/, Vec128 a, const Vec128 b) { for (size_t i = 0; i < N; ++i) { a.raw[i] = static_cast(static_cast(a.raw[i]) * static_cast(b.raw[i])); } return a; } template HWY_INLINE Vec128 Mul(UnsignedTag /*tag*/, Vec128 a, const Vec128 b) { for (size_t i = 0; i < N; ++i) { a.raw[i] = static_cast(static_cast(a.raw[i]) * static_cast(b.raw[i])); } return a; } } // namespace detail template HWY_API Vec128 operator*(Vec128 a, const Vec128 b) { return detail::Mul(hwy::TypeTag(), a, b); } template HWY_API Vec128 operator/(Vec128 a, const Vec128 b) { for (size_t i = 0; i < N; ++i) { a.raw[i] /= b.raw[i]; } return a; } // Returns the upper 16 bits of a * b in each lane. template HWY_API Vec128 MulHigh(Vec128 a, const Vec128 b) { for (size_t i = 0; i < N; ++i) { a.raw[i] = static_cast((int32_t{a.raw[i]} * b.raw[i]) >> 16); } return a; } template HWY_API Vec128 MulHigh(Vec128 a, const Vec128 b) { for (size_t i = 0; i < N; ++i) { // Cast to uint32_t first to prevent overflow. Otherwise the result of // uint16_t * uint16_t is in "int" which may overflow. In practice the // result is the same but this way it is also defined. 
a.raw[i] = static_cast( (static_cast(a.raw[i]) * static_cast(b.raw[i])) >> 16); } return a; } template HWY_API Vec128 MulFixedPoint15(Vec128 a, Vec128 b) { for (size_t i = 0; i < N; ++i) { a.raw[i] = static_cast((2 * a.raw[i] * b.raw[i] + 32768) >> 16); } return a; } // Multiplies even lanes (0, 2 ..) and returns the double-wide result. template HWY_API Vec128 MulEven(const Vec128 a, const Vec128 b) { Vec128 mul; for (size_t i = 0; i < N; i += 2) { const int64_t a64 = a.raw[i]; mul.raw[i / 2] = a64 * b.raw[i]; } return mul; } template HWY_API Vec128 MulEven(Vec128 a, const Vec128 b) { Vec128 mul; for (size_t i = 0; i < N; i += 2) { const uint64_t a64 = a.raw[i]; mul.raw[i / 2] = a64 * b.raw[i]; } return mul; } template HWY_API Vec128 MulOdd(const Vec128 a, const Vec128 b) { Vec128 mul; for (size_t i = 0; i < N; i += 2) { const int64_t a64 = a.raw[i + 1]; mul.raw[i / 2] = a64 * b.raw[i + 1]; } return mul; } template HWY_API Vec128 MulOdd(Vec128 a, const Vec128 b) { Vec128 mul; for (size_t i = 0; i < N; i += 2) { const uint64_t a64 = a.raw[i + 1]; mul.raw[i / 2] = a64 * b.raw[i + 1]; } return mul; } template HWY_API Vec128 ApproximateReciprocal(Vec128 v) { for (size_t i = 0; i < N; ++i) { // Zero inputs are allowed, but callers are responsible for replacing the // return value with something else (typically using IfThenElse). This check // avoids a ubsan error. The result is arbitrary. v.raw[i] = (std::abs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i]; } return v; } template HWY_API Vec128 AbsDiff(Vec128 a, const Vec128 b) { return Abs(a - b); } // ------------------------------ Floating-point multiply-add variants template HWY_API Vec128 MulAdd(Vec128 mul, const Vec128 x, const Vec128 add) { return mul * x + add; } template HWY_API Vec128 NegMulAdd(Vec128 mul, const Vec128 x, const Vec128 add) { return add - mul * x; } template HWY_API Vec128 MulSub(Vec128 mul, const Vec128 x, const Vec128 sub) { return mul * x - sub; } template HWY_API Vec128 NegMulSub(Vec128 mul, const Vec128 x, const Vec128 sub) { return Neg(mul) * x - sub; } // ------------------------------ Floating-point square root template HWY_API Vec128 ApproximateReciprocalSqrt(Vec128 v) { for (size_t i = 0; i < N; ++i) { const float half = v.raw[i] * 0.5f; uint32_t bits; CopySameSize(&v.raw[i], &bits); // Initial guess based on log2(f) bits = 0x5F3759DF - (bits >> 1); CopySameSize(&bits, &v.raw[i]); // One Newton-Raphson iteration v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i])); } return v; } template HWY_API Vec128 Sqrt(Vec128 v) { for (size_t i = 0; i < N; ++i) { v.raw[i] = std::sqrt(v.raw[i]); } return v; } // ------------------------------ Floating-point rounding template HWY_API Vec128 Round(Vec128 v) { using TI = MakeSigned; const Vec128 a = Abs(v); for (size_t i = 0; i < N; ++i) { if (!(a.raw[i] < MantissaEnd())) { // Huge or NaN continue; } const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5); const TI rounded = static_cast(v.raw[i] + bias); if (rounded == 0) { v.raw[i] = v.raw[i] < 0 ? T{-0} : T{0}; continue; } const T rounded_f = static_cast(rounded); // Round to even if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) { v.raw[i] = static_cast(rounded - (v.raw[i] < T(0) ? -1 : 1)); continue; } v.raw[i] = rounded_f; } return v; } // Round-to-nearest even. 
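// NearestInt converts float lanes to int32_t with ties-to-even, using the
// same bias-and-correct scheme as Round above; inputs too large for int32_t
// (and NaN) saturate to LimitsMin()/LimitsMax() according to the sign bit.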
template HWY_API Vec128 NearestInt(const Vec128 v) { using T = float; using TI = int32_t; const Vec128 abs = Abs(v); Vec128 ret; for (size_t i = 0; i < N; ++i) { const bool signbit = std::signbit(v.raw[i]); if (!(abs.raw[i] < MantissaEnd())) { // Huge or NaN // Check if too large to cast or NaN if (!(abs.raw[i] <= static_cast(LimitsMax()))) { ret.raw[i] = signbit ? LimitsMin() : LimitsMax(); continue; } ret.raw[i] = static_cast(v.raw[i]); continue; } const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5); const TI rounded = static_cast(v.raw[i] + bias); if (rounded == 0) { ret.raw[i] = 0; continue; } const T rounded_f = static_cast(rounded); // Round to even if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) { ret.raw[i] = rounded - (signbit ? -1 : 1); continue; } ret.raw[i] = rounded; } return ret; } template HWY_API Vec128 Trunc(Vec128 v) { using TI = MakeSigned; const Vec128 abs = Abs(v); for (size_t i = 0; i < N; ++i) { if (!(abs.raw[i] <= MantissaEnd())) { // Huge or NaN continue; } const TI truncated = static_cast(v.raw[i]); if (truncated == 0) { v.raw[i] = v.raw[i] < 0 ? -T{0} : T{0}; continue; } v.raw[i] = static_cast(truncated); } return v; } // Toward +infinity, aka ceiling template Vec128 Ceil(Vec128 v) { constexpr int kMantissaBits = MantissaBits(); using Bits = MakeUnsigned; const Bits kExponentMask = MaxExponentField(); const Bits kMantissaMask = MantissaMask(); const Bits kBias = kExponentMask / 2; for (size_t i = 0; i < N; ++i) { const bool positive = v.raw[i] > Float(0.0); Bits bits; CopySameSize(&v.raw[i], &bits); const int exponent = static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); // Already an integer. if (exponent >= kMantissaBits) continue; // |v| <= 1 => 0 or 1. if (exponent < 0) { v.raw[i] = positive ? Float{1} : Float{-0.0}; continue; } const Bits mantissa_mask = kMantissaMask >> exponent; // Already an integer if ((bits & mantissa_mask) == 0) continue; // Clear fractional bits and round up if (positive) bits += (kMantissaMask + 1) >> exponent; bits &= ~mantissa_mask; CopySameSize(&bits, &v.raw[i]); } return v; } // Toward -infinity, aka floor template Vec128 Floor(Vec128 v) { constexpr int kMantissaBits = MantissaBits(); using Bits = MakeUnsigned; const Bits kExponentMask = MaxExponentField(); const Bits kMantissaMask = MantissaMask(); const Bits kBias = kExponentMask / 2; for (size_t i = 0; i < N; ++i) { const bool negative = v.raw[i] < Float(0.0); Bits bits; CopySameSize(&v.raw[i], &bits); const int exponent = static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); // Already an integer. if (exponent >= kMantissaBits) continue; // |v| <= 1 => -1 or 0. if (exponent < 0) { v.raw[i] = negative ? Float(-1.0) : Float(0.0); continue; } const Bits mantissa_mask = kMantissaMask >> exponent; // Already an integer if ((bits & mantissa_mask) == 0) continue; // Clear fractional bits and round down if (negative) bits += (kMantissaMask + 1) >> exponent; bits &= ~mantissa_mask; CopySameSize(&bits, &v.raw[i]); } return v; } // ------------------------------ Floating-point classification template HWY_API Mask128 IsNaN(const Vec128 v) { Mask128 ret; for (size_t i = 0; i < N; ++i) { // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY. MakeUnsigned bits; CopySameSize(&v.raw[i], &bits); bits += bits; bits >>= 1; // clear sign bit // NaN if all exponent bits are set and the mantissa is not zero. 
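    // With the sign cleared, any value strictly greater than the all-ones
    // exponent field (ExponentMask) must also have mantissa bits set, i.e.
    // is NaN.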
ret.bits[i] = Mask128::FromBool(bits > ExponentMask()); } return ret; } template HWY_API Mask128 IsInf(const Vec128 v) { static_assert(IsFloat(), "Only for float"); const Simd d; const RebindToSigned di; const VFromD vi = BitCast(di, v); // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2()))); } // Returns whether normal/subnormal/zero. template HWY_API Mask128 IsFinite(const Vec128 v) { static_assert(IsFloat(), "Only for float"); const Simd d; const RebindToUnsigned du; const RebindToSigned di; // cheaper than unsigned comparison using VI = VFromD; using VU = VFromD; const VU vu = BitCast(du, v); // 'Shift left' to clear the sign bit, then right so we can compare with the // max exponent (cannot compare with MaxExponentTimes2 directly because it is // negative and non-negative floats would be greater). const VI exp = BitCast(di, ShiftRight() + 1>(Add(vu, vu))); return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); } // ================================================== COMPARE template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { Mask128 m; for (size_t i = 0; i < N; ++i) { m.bits[i] = Mask128::FromBool(a.raw[i] == b.raw[i]); } return m; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { Mask128 m; for (size_t i = 0; i < N; ++i) { m.bits[i] = Mask128::FromBool(a.raw[i] != b.raw[i]); } return m; } template HWY_API Mask128 TestBit(const Vec128 v, const Vec128 bit) { static_assert(!hwy::IsFloat(), "Only integer vectors supported"); return (v & bit) == bit; } template HWY_API Mask128 operator<(const Vec128 a, const Vec128 b) { Mask128 m; for (size_t i = 0; i < N; ++i) { m.bits[i] = Mask128::FromBool(a.raw[i] < b.raw[i]); } return m; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { Mask128 m; for (size_t i = 0; i < N; ++i) { m.bits[i] = Mask128::FromBool(a.raw[i] > b.raw[i]); } return m; } template HWY_API Mask128 operator<=(const Vec128 a, const Vec128 b) { Mask128 m; for (size_t i = 0; i < N; ++i) { m.bits[i] = Mask128::FromBool(a.raw[i] <= b.raw[i]); } return m; } template HWY_API Mask128 operator>=(const Vec128 a, const Vec128 b) { Mask128 m; for (size_t i = 0; i < N; ++i) { m.bits[i] = Mask128::FromBool(a.raw[i] >= b.raw[i]); } return m; } // ------------------------------ Lt128 // Only makes sense for full vectors of u64. HWY_API Mask128 Lt128(Simd /* tag */, Vec128 a, const Vec128 b) { const bool lt = (a.raw[1] < b.raw[1]) || (a.raw[1] == b.raw[1] && a.raw[0] < b.raw[0]); Mask128 ret; ret.bits[0] = ret.bits[1] = Mask128::FromBool(lt); return ret; } HWY_API Mask128 Lt128Upper(Simd /* tag */, Vec128 a, const Vec128 b) { const bool lt = a.raw[1] < b.raw[1]; Mask128 ret; ret.bits[0] = ret.bits[1] = Mask128::FromBool(lt); return ret; } // ------------------------------ Eq128 // Only makes sense for full vectors of u64. 
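// As with Lt128, lane 1 holds the upper 64 bits of the 128-bit value; the
// scalar comparison result is broadcast to both mask lanes.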
HWY_API Mask128 Eq128(Simd /* tag */, Vec128 a, const Vec128 b) { const bool eq = a.raw[1] == b.raw[1] && a.raw[0] == b.raw[0]; Mask128 ret; ret.bits[0] = ret.bits[1] = Mask128::FromBool(eq); return ret; } HWY_API Mask128 Ne128(Simd /* tag */, Vec128 a, const Vec128 b) { const bool ne = a.raw[1] != b.raw[1] || a.raw[0] != b.raw[0]; Mask128 ret; ret.bits[0] = ret.bits[1] = Mask128::FromBool(ne); return ret; } HWY_API Mask128 Eq128Upper(Simd /* tag */, Vec128 a, const Vec128 b) { const bool eq = a.raw[1] == b.raw[1]; Mask128 ret; ret.bits[0] = ret.bits[1] = Mask128::FromBool(eq); return ret; } HWY_API Mask128 Ne128Upper(Simd /* tag */, Vec128 a, const Vec128 b) { const bool ne = a.raw[1] != b.raw[1]; Mask128 ret; ret.bits[0] = ret.bits[1] = Mask128::FromBool(ne); return ret; } // ------------------------------ Min128, Max128 (Lt128) template > HWY_API V Min128(D d, const V a, const V b) { return IfThenElse(Lt128(d, a, b), a, b); } template > HWY_API V Max128(D d, const V a, const V b) { return IfThenElse(Lt128(d, b, a), a, b); } template > HWY_API V Min128Upper(D d, const V a, const V b) { return IfThenElse(Lt128Upper(d, a, b), a, b); } template > HWY_API V Max128Upper(D d, const V a, const V b) { return IfThenElse(Lt128Upper(d, b, a), a, b); } // ================================================== MEMORY // ------------------------------ Load template HWY_API Vec128 Load(Simd /* tag */, const T* HWY_RESTRICT aligned) { Vec128 v; CopyBytes(aligned, v.raw); // copy from array return v; } template HWY_API Vec128 MaskedLoad(Mask128 m, Simd d, const T* HWY_RESTRICT aligned) { return IfThenElseZero(m, Load(d, aligned)); } template HWY_API Vec128 LoadU(Simd d, const T* HWY_RESTRICT p) { return Load(d, p); } // In some use cases, "load single lane" is sufficient; otherwise avoid this. template HWY_API Vec128 LoadDup128(Simd d, const T* HWY_RESTRICT aligned) { return Load(d, aligned); } // ------------------------------ Store template HWY_API void Store(const Vec128 v, Simd /* tag */, T* HWY_RESTRICT aligned) { CopyBytes(v.raw, aligned); // copy to array } template HWY_API void StoreU(const Vec128 v, Simd d, T* HWY_RESTRICT p) { Store(v, d, p); } template HWY_API void BlendedStore(const Vec128 v, Mask128 m, Simd /* tag */, T* HWY_RESTRICT p) { for (size_t i = 0; i < N; ++i) { if (m.bits[i]) p[i] = v.raw[i]; } } // ------------------------------ LoadInterleaved2/3/4 // Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2. // We implement those here because scalar code is likely faster than emulation // via shuffles. 
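// (The #ifdef/#undef/#else/#define pair below toggles the flag on every
// re-inclusion, matching the toggle convention Highway uses for per-target
// capability macros; its presence signals that this target provides its own
// LoadInterleaved*/StoreInterleaved* rather than the generic fallback.)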
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED #else #define HWY_NATIVE_LOAD_STORE_INTERLEAVED #endif template HWY_API void LoadInterleaved2(Simd d, const T* HWY_RESTRICT unaligned, Vec128& v0, Vec128& v1) { alignas(16) T buf0[N]; alignas(16) T buf1[N]; for (size_t i = 0; i < N; ++i) { buf0[i] = *unaligned++; buf1[i] = *unaligned++; } v0 = Load(d, buf0); v1 = Load(d, buf1); } template HWY_API void LoadInterleaved3(Simd d, const T* HWY_RESTRICT unaligned, Vec128& v0, Vec128& v1, Vec128& v2) { alignas(16) T buf0[N]; alignas(16) T buf1[N]; alignas(16) T buf2[N]; for (size_t i = 0; i < N; ++i) { buf0[i] = *unaligned++; buf1[i] = *unaligned++; buf2[i] = *unaligned++; } v0 = Load(d, buf0); v1 = Load(d, buf1); v2 = Load(d, buf2); } template HWY_API void LoadInterleaved4(Simd d, const T* HWY_RESTRICT unaligned, Vec128& v0, Vec128& v1, Vec128& v2, Vec128& v3) { alignas(16) T buf0[N]; alignas(16) T buf1[N]; alignas(16) T buf2[N]; alignas(16) T buf3[N]; for (size_t i = 0; i < N; ++i) { buf0[i] = *unaligned++; buf1[i] = *unaligned++; buf2[i] = *unaligned++; buf3[i] = *unaligned++; } v0 = Load(d, buf0); v1 = Load(d, buf1); v2 = Load(d, buf2); v3 = Load(d, buf3); } // ------------------------------ StoreInterleaved2/3/4 template HWY_API void StoreInterleaved2(const Vec128 v0, const Vec128 v1, Simd /* tag */, T* HWY_RESTRICT unaligned) { for (size_t i = 0; i < N; ++i) { *unaligned++ = v0.raw[i]; *unaligned++ = v1.raw[i]; } } template HWY_API void StoreInterleaved3(const Vec128 v0, const Vec128 v1, const Vec128 v2, Simd /* tag */, T* HWY_RESTRICT unaligned) { for (size_t i = 0; i < N; ++i) { *unaligned++ = v0.raw[i]; *unaligned++ = v1.raw[i]; *unaligned++ = v2.raw[i]; } } template HWY_API void StoreInterleaved4(const Vec128 v0, const Vec128 v1, const Vec128 v2, const Vec128 v3, Simd /* tag */, T* HWY_RESTRICT unaligned) { for (size_t i = 0; i < N; ++i) { *unaligned++ = v0.raw[i]; *unaligned++ = v1.raw[i]; *unaligned++ = v2.raw[i]; *unaligned++ = v3.raw[i]; } } // ------------------------------ Stream template HWY_API void Stream(const Vec128 v, Simd d, T* HWY_RESTRICT aligned) { Store(v, d, aligned); } // ------------------------------ Scatter template HWY_API void ScatterOffset(Vec128 v, Simd /* tag */, T* base, const Vec128 offset) { static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); for (size_t i = 0; i < N; ++i) { uint8_t* const base8 = reinterpret_cast(base) + offset.raw[i]; CopyBytes(&v.raw[i], base8); // copy to bytes } } template HWY_API void ScatterIndex(Vec128 v, Simd /* tag */, T* HWY_RESTRICT base, const Vec128 index) { static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); for (size_t i = 0; i < N; ++i) { base[index.raw[i]] = v.raw[i]; } } // ------------------------------ Gather template HWY_API Vec128 GatherOffset(Simd /* tag */, const T* base, const Vec128 offset) { static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); Vec128 v; for (size_t i = 0; i < N; ++i) { const uint8_t* base8 = reinterpret_cast(base) + offset.raw[i]; CopyBytes(base8, &v.raw[i]); // copy from bytes } return v; } template HWY_API Vec128 GatherIndex(Simd /* tag */, const T* HWY_RESTRICT base, const Vec128 index) { static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); Vec128 v; for (size_t i = 0; i < N; ++i) { v.raw[i] = base[index.raw[i]]; } return v; } // ================================================== CONVERT // ConvertTo and DemoteTo with floating-point input and integer output truncate // (rounding 
toward zero). template HWY_API Vec128 PromoteTo(Simd /* tag */, Vec128 from) { static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting"); Vec128 ret; for (size_t i = 0; i < N; ++i) { // For bits Y > X, floatX->floatY and intX->intY are always representable. ret.raw[i] = static_cast(from.raw[i]); } return ret; } // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here, // so we overload for FromT=double and ToT={float,int32_t}. template HWY_API Vec128 DemoteTo(Simd /* tag */, Vec128 from) { Vec128 ret; for (size_t i = 0; i < N; ++i) { // Prevent ubsan errors when converting float to narrower integer/float if (std::isinf(from.raw[i]) || std::fabs(from.raw[i]) > static_cast(HighestValue())) { ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue() : HighestValue(); continue; } ret.raw[i] = static_cast(from.raw[i]); } return ret; } template HWY_API Vec128 DemoteTo(Simd /* tag */, Vec128 from) { Vec128 ret; for (size_t i = 0; i < N; ++i) { // Prevent ubsan errors when converting int32_t to narrower integer/int32_t if (std::isinf(from.raw[i]) || std::fabs(from.raw[i]) > static_cast(HighestValue())) { ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue() : HighestValue(); continue; } ret.raw[i] = static_cast(from.raw[i]); } return ret; } template HWY_API Vec128 DemoteTo(Simd /* tag */, Vec128 from) { static_assert(!IsFloat(), "FromT=double are handled above"); static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting"); Vec128 ret; for (size_t i = 0; i < N; ++i) { // Int to int: choose closest value in ToT to `from` (avoids UB) from.raw[i] = HWY_MIN(HWY_MAX(LimitsMin(), from.raw[i]), LimitsMax()); ret.raw[i] = static_cast(from.raw[i]); } return ret; } template HWY_API Vec128 ReorderDemote2To( Simd dbf16, Vec128 a, Vec128 b) { const Repartition du32; const Vec128 b_in_lower = ShiftRight<16>(BitCast(du32, b)); // Avoid OddEven - we want the upper half of `a` even on big-endian systems. const Vec128 a_mask = Set(du32, 0xFFFF0000); return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower)); } template HWY_API Vec128 ReorderDemote2To(Simd /*d16*/, Vec128 a, Vec128 b) { const int16_t min = LimitsMin(); const int16_t max = LimitsMax(); Vec128 ret; for (size_t i = 0; i < N; ++i) { ret.raw[i] = static_cast(HWY_MIN(HWY_MAX(min, a.raw[i]), max)); } for (size_t i = 0; i < N; ++i) { ret.raw[N + i] = static_cast(HWY_MIN(HWY_MAX(min, b.raw[i]), max)); } return ret; } namespace detail { HWY_INLINE void StoreU16ToF16(const uint16_t val, hwy::float16_t* HWY_RESTRICT to) { CopySameSize(&val, to); } HWY_INLINE uint16_t U16FromF16(const hwy::float16_t* HWY_RESTRICT from) { uint16_t bits16; CopySameSize(from, &bits16); return bits16; } } // namespace detail template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { Vec128 ret; for (size_t i = 0; i < N; ++i) { const uint16_t bits16 = detail::U16FromF16(&v.raw[i]); const uint32_t sign = static_cast(bits16 >> 15); const uint32_t biased_exp = (bits16 >> 10) & 0x1F; const uint32_t mantissa = bits16 & 0x3FF; // Subnormal or zero if (biased_exp == 0) { const float subnormal = (1.0f / 16384) * (static_cast(mantissa) * (1.0f / 1024)); ret.raw[i] = sign ? -subnormal : subnormal; continue; } // Normalized: convert the representation directly (faster than // ldexp/tables). 
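    // Re-bias the exponent from f16 (bias 15) to f32 (bias 127) and shift the
    // 10 mantissa bits into the upper end of the 23-bit f32 mantissa field.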
const uint32_t biased_exp32 = biased_exp + (127 - 15); const uint32_t mantissa32 = mantissa << (23 - 10); const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32; CopySameSize(&bits32, &ret.raw[i]); } return ret; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { Vec128 ret; for (size_t i = 0; i < N; ++i) { ret.raw[i] = F32FromBF16(v.raw[i]); } return ret; } template HWY_API Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { Vec128 ret; for (size_t i = 0; i < N; ++i) { uint32_t bits32; CopySameSize(&v.raw[i], &bits32); const uint32_t sign = bits32 >> 31; const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF; const uint32_t mantissa32 = bits32 & 0x7FFFFF; const int32_t exp = HWY_MIN(static_cast(biased_exp32) - 127, 15); // Tiny or zero => zero. if (exp < -24) { ZeroBytes(&ret.raw[i]); continue; } uint32_t biased_exp16, mantissa16; // exp = [-24, -15] => subnormal if (exp < -14) { biased_exp16 = 0; const uint32_t sub_exp = static_cast(-14 - exp); HWY_DASSERT(1 <= sub_exp && sub_exp < 11); mantissa16 = static_cast((1u << (10 - sub_exp)) + (mantissa32 >> (13 + sub_exp))); } else { // exp = [-14, 15] biased_exp16 = static_cast(exp + 15); HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31); mantissa16 = mantissa32 >> 13; } HWY_DASSERT(mantissa16 < 1024); const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16; HWY_DASSERT(bits16 < 0x10000); const uint16_t narrowed = static_cast(bits16); // big-endian safe detail::StoreU16ToF16(narrowed, &ret.raw[i]); } return ret; } template HWY_API Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { Vec128 ret; for (size_t i = 0; i < N; ++i) { ret.raw[i] = BF16FromF32(v.raw[i]); } return ret; } // Tag dispatch instead of SFINAE for MSVC 2017 compatibility namespace detail { template HWY_API Vec128 ConvertTo(hwy::FloatTag /*tag*/, Simd /* tag */, Vec128 from) { static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size"); Vec128 ret; for (size_t i = 0; i < N; ++i) { // float## -> int##: return closest representable value. We cannot exactly // represent LimitsMax in FromT, so use double. const double f = static_cast(from.raw[i]); if (std::isinf(from.raw[i]) || std::fabs(f) > static_cast(LimitsMax())) { ret.raw[i] = std::signbit(from.raw[i]) ? 
LimitsMin() : LimitsMax(); continue; } ret.raw[i] = static_cast(from.raw[i]); } return ret; } template HWY_API Vec128 ConvertTo(hwy::NonFloatTag /*tag*/, Simd /* tag */, Vec128 from) { static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size"); Vec128 ret; for (size_t i = 0; i < N; ++i) { // int## -> float##: no check needed ret.raw[i] = static_cast(from.raw[i]); } return ret; } } // namespace detail template HWY_API Vec128 ConvertTo(Simd d, Vec128 from) { return detail::ConvertTo(hwy::IsFloatTag(), d, from); } template HWY_API Vec128 U8FromU32(const Vec128 v) { return DemoteTo(Simd(), v); } // ------------------------------ Truncations template HWY_API Vec128 TruncateTo(Simd /* tag */, const Vec128 v) { Vec128 ret; for (size_t i = 0; i < N; ++i) { ret.raw[i] = static_cast(v.raw[i] & 0xFF); } return ret; } template HWY_API Vec128 TruncateTo(Simd /* tag */, const Vec128 v) { Vec128 ret; for (size_t i = 0; i < N; ++i) { ret.raw[i] = static_cast(v.raw[i] & 0xFFFF); } return ret; } template HWY_API Vec128 TruncateTo(Simd /* tag */, const Vec128 v) { Vec128 ret; for (size_t i = 0; i < N; ++i) { ret.raw[i] = static_cast(v.raw[i] & 0xFFFFFFFFu); } return ret; } template HWY_API Vec128 TruncateTo(Simd /* tag */, const Vec128 v) { Vec128 ret; for (size_t i = 0; i < N; ++i) { ret.raw[i] = static_cast(v.raw[i] & 0xFF); } return ret; } template HWY_API Vec128 TruncateTo(Simd /* tag */, const Vec128 v) { Vec128 ret; for (size_t i = 0; i < N; ++i) { ret.raw[i] = static_cast(v.raw[i] & 0xFFFF); } return ret; } template HWY_API Vec128 TruncateTo(Simd /* tag */, const Vec128 v) { Vec128 ret; for (size_t i = 0; i < N; ++i) { ret.raw[i] = static_cast(v.raw[i] & 0xFF); } return ret; } // ================================================== COMBINE template HWY_API Vec128 LowerHalf(Vec128 v) { Vec128 ret; CopyBytes(v.raw, ret.raw); return ret; } template HWY_API Vec128 LowerHalf(Simd /* tag */, Vec128 v) { return LowerHalf(v); } template HWY_API Vec128 UpperHalf(Simd /* tag */, Vec128 v) { Vec128 ret; CopyBytes(&v.raw[N / 2], ret.raw); return ret; } template HWY_API Vec128 ZeroExtendVector(Simd /* tag */, Vec128 v) { Vec128 ret; CopyBytes(v.raw, ret.raw); return ret; } template HWY_API Vec128 Combine(Simd /* tag */, Vec128 hi_half, Vec128 lo_half) { Vec128 ret; CopyBytes(lo_half.raw, &ret.raw[0]); CopyBytes(hi_half.raw, &ret.raw[N / 2]); return ret; } template HWY_API Vec128 ConcatLowerLower(Simd /* tag */, Vec128 hi, Vec128 lo) { Vec128 ret; CopyBytes(lo.raw, &ret.raw[0]); CopyBytes(hi.raw, &ret.raw[N / 2]); return ret; } template HWY_API Vec128 ConcatUpperUpper(Simd /* tag */, Vec128 hi, Vec128 lo) { Vec128 ret; CopyBytes(&lo.raw[N / 2], &ret.raw[0]); CopyBytes(&hi.raw[N / 2], &ret.raw[N / 2]); return ret; } template HWY_API Vec128 ConcatLowerUpper(Simd /* tag */, const Vec128 hi, const Vec128 lo) { Vec128 ret; CopyBytes(&lo.raw[N / 2], &ret.raw[0]); CopyBytes(hi.raw, &ret.raw[N / 2]); return ret; } template HWY_API Vec128 ConcatUpperLower(Simd /* tag */, Vec128 hi, Vec128 lo) { Vec128 ret; CopyBytes(lo.raw, &ret.raw[0]); CopyBytes(&hi.raw[N / 2], &ret.raw[N / 2]); return ret; } template HWY_API Vec128 ConcatEven(Simd /* tag */, Vec128 hi, Vec128 lo) { Vec128 ret; for (size_t i = 0; i < N / 2; ++i) { ret.raw[i] = lo.raw[2 * i]; } for (size_t i = 0; i < N / 2; ++i) { ret.raw[N / 2 + i] = hi.raw[2 * i]; } return ret; } template HWY_API Vec128 ConcatOdd(Simd /* tag */, Vec128 hi, Vec128 lo) { Vec128 ret; for (size_t i = 0; i < N / 2; ++i) { ret.raw[i] = lo.raw[2 * i + 1]; } for (size_t i = 0; i < N / 2; 
++i) { ret.raw[N / 2 + i] = hi.raw[2 * i + 1]; } return ret; } // ------------------------------ CombineShiftRightBytes template > HWY_API V CombineShiftRightBytes(Simd /* tag */, V hi, V lo) { V ret; const uint8_t* HWY_RESTRICT lo8 = reinterpret_cast(lo.raw); uint8_t* HWY_RESTRICT ret8 = reinterpret_cast(ret.raw); CopyBytes(lo8 + kBytes, ret8); CopyBytes(hi.raw, ret8 + sizeof(T) * N - kBytes); return ret; } // ------------------------------ ShiftLeftBytes template HWY_API Vec128 ShiftLeftBytes(Simd /* tag */, Vec128 v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); Vec128 ret; uint8_t* HWY_RESTRICT ret8 = reinterpret_cast(ret.raw); ZeroBytes(ret8); CopyBytes(v.raw, ret8 + kBytes); return ret; } template HWY_API Vec128 ShiftLeftBytes(const Vec128 v) { return ShiftLeftBytes(DFromV(), v); } // ------------------------------ ShiftLeftLanes template HWY_API Vec128 ShiftLeftLanes(Simd d, const Vec128 v) { const Repartition d8; return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); } template HWY_API Vec128 ShiftLeftLanes(const Vec128 v) { return ShiftLeftLanes(DFromV(), v); } // ------------------------------ ShiftRightBytes template HWY_API Vec128 ShiftRightBytes(Simd /* tag */, Vec128 v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); Vec128 ret; const uint8_t* HWY_RESTRICT v8 = reinterpret_cast(v.raw); uint8_t* HWY_RESTRICT ret8 = reinterpret_cast(ret.raw); CopyBytes(v8 + kBytes, ret8); ZeroBytes(ret8 + sizeof(T) * N - kBytes); return ret; } // ------------------------------ ShiftRightLanes template HWY_API Vec128 ShiftRightLanes(Simd d, const Vec128 v) { const Repartition d8; return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); } // ================================================== SWIZZLE template HWY_API T GetLane(const Vec128 v) { return v.raw[0]; } template HWY_API Vec128 InsertLane(Vec128 v, size_t i, T t) { v.raw[i] = t; return v; } template HWY_API T ExtractLane(const Vec128 v, size_t i) { return v.raw[i]; } template HWY_API Vec128 DupEven(Vec128 v) { for (size_t i = 0; i < N; i += 2) { v.raw[i + 1] = v.raw[i]; } return v; } template HWY_API Vec128 DupOdd(Vec128 v) { for (size_t i = 0; i < N; i += 2) { v.raw[i] = v.raw[i + 1]; } return v; } template HWY_API Vec128 OddEven(Vec128 odd, Vec128 even) { for (size_t i = 0; i < N; i += 2) { odd.raw[i] = even.raw[i]; } return odd; } template HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) { return even; } // ------------------------------ SwapAdjacentBlocks template HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) { return v; } // ------------------------------ TableLookupLanes // Returned by SetTableIndices for use by TableLookupLanes. 
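// Each index is stored as a signed integer of the same width as the lane
// type and must be in [0, N); TableLookupLanes below indexes raw[] directly,
// so an out-of-range index would read out of bounds.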
template struct Indices128 { MakeSigned raw[N]; }; template HWY_API Indices128 IndicesFromVec(Simd, Vec128 vec) { static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size"); Indices128 ret; CopyBytes(vec.raw, ret.raw); return ret; } template HWY_API Indices128 SetTableIndices(Simd d, const TI* idx) { return IndicesFromVec(d, LoadU(Simd(), idx)); } template HWY_API Vec128 TableLookupLanes(const Vec128 v, const Indices128 idx) { Vec128 ret; for (size_t i = 0; i < N; ++i) { ret.raw[i] = v.raw[idx.raw[i]]; } return ret; } // ------------------------------ ReverseBlocks // Single block: no change template HWY_API Vec128 ReverseBlocks(Simd /* tag */, const Vec128 v) { return v; } // ------------------------------ Reverse template HWY_API Vec128 Reverse(Simd /* tag */, const Vec128 v) { Vec128 ret; for (size_t i = 0; i < N; ++i) { ret.raw[i] = v.raw[N - 1 - i]; } return ret; } template HWY_API Vec128 Reverse2(Simd /* tag */, const Vec128 v) { Vec128 ret; for (size_t i = 0; i < N; i += 2) { ret.raw[i + 0] = v.raw[i + 1]; ret.raw[i + 1] = v.raw[i + 0]; } return ret; } template HWY_API Vec128 Reverse4(Simd /* tag */, const Vec128 v) { Vec128 ret; for (size_t i = 0; i < N; i += 4) { ret.raw[i + 0] = v.raw[i + 3]; ret.raw[i + 1] = v.raw[i + 2]; ret.raw[i + 2] = v.raw[i + 1]; ret.raw[i + 3] = v.raw[i + 0]; } return ret; } template HWY_API Vec128 Reverse8(Simd /* tag */, const Vec128 v) { Vec128 ret; for (size_t i = 0; i < N; i += 8) { ret.raw[i + 0] = v.raw[i + 7]; ret.raw[i + 1] = v.raw[i + 6]; ret.raw[i + 2] = v.raw[i + 5]; ret.raw[i + 3] = v.raw[i + 4]; ret.raw[i + 4] = v.raw[i + 3]; ret.raw[i + 5] = v.raw[i + 2]; ret.raw[i + 6] = v.raw[i + 1]; ret.raw[i + 7] = v.raw[i + 0]; } return ret; } // ================================================== BLOCKWISE // ------------------------------ Shuffle* // Swap 32-bit halves in 64-bit halves. 
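// E.g. with four 32-bit lanes [0 1 2 3], the result is [1 0 3 2]; the digits
// in the name give the source lane of each output lane, most-significant
// first.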
template HWY_API Vec128 Shuffle2301(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit"); static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Reverse2(DFromV(), v); } // Swap 64-bit halves template HWY_API Vec128 Shuffle1032(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit"); Vec128 ret; ret.raw[3] = v.raw[1]; ret.raw[2] = v.raw[0]; ret.raw[1] = v.raw[3]; ret.raw[0] = v.raw[2]; return ret; } template HWY_API Vec128 Shuffle01(const Vec128 v) { static_assert(sizeof(T) == 8, "Only for 64-bit"); return Reverse2(DFromV(), v); } // Rotate right 32 bits template HWY_API Vec128 Shuffle0321(const Vec128 v) { Vec128 ret; ret.raw[3] = v.raw[0]; ret.raw[2] = v.raw[3]; ret.raw[1] = v.raw[2]; ret.raw[0] = v.raw[1]; return ret; } // Rotate left 32 bits template HWY_API Vec128 Shuffle2103(const Vec128 v) { Vec128 ret; ret.raw[3] = v.raw[2]; ret.raw[2] = v.raw[1]; ret.raw[1] = v.raw[0]; ret.raw[0] = v.raw[3]; return ret; } template HWY_API Vec128 Shuffle0123(const Vec128 v) { return Reverse4(DFromV(), v); } // ------------------------------ Broadcast/splat any lane template HWY_API Vec128 Broadcast(Vec128 v) { for (size_t i = 0; i < N; ++i) { v.raw[i] = v.raw[kLane]; } return v; } // ------------------------------ TableLookupBytes, TableLookupBytesOr0 template HWY_API Vec128 TableLookupBytes(const Vec128 v, const Vec128 indices) { const uint8_t* HWY_RESTRICT v_bytes = reinterpret_cast(v.raw); const uint8_t* HWY_RESTRICT idx_bytes = reinterpret_cast(indices.raw); Vec128 ret; uint8_t* HWY_RESTRICT ret_bytes = reinterpret_cast(ret.raw); for (size_t i = 0; i < NI * sizeof(TI); ++i) { const size_t idx = idx_bytes[i]; // Avoid out of bounds reads. ret_bytes[i] = idx < sizeof(T) * N ? v_bytes[idx] : 0; } return ret; } template HWY_API Vec128 TableLookupBytesOr0(const Vec128 v, const Vec128 indices) { // Same as TableLookupBytes, which already returns 0 if out of bounds. return TableLookupBytes(v, indices); } // ------------------------------ InterleaveLower/InterleaveUpper template HWY_API Vec128 InterleaveLower(const Vec128 a, const Vec128 b) { Vec128 ret; for (size_t i = 0; i < N / 2; ++i) { ret.raw[2 * i + 0] = a.raw[i]; ret.raw[2 * i + 1] = b.raw[i]; } return ret; } // Additional overload for the optional tag (also for 256/512). template HWY_API V InterleaveLower(DFromV /* tag */, V a, V b) { return InterleaveLower(a, b); } template HWY_API Vec128 InterleaveUpper(Simd /* tag */, const Vec128 a, const Vec128 b) { Vec128 ret; for (size_t i = 0; i < N / 2; ++i) { ret.raw[2 * i + 0] = a.raw[N / 2 + i]; ret.raw[2 * i + 1] = b.raw[N / 2 + i]; } return ret; } // ------------------------------ ZipLower/ZipUpper (InterleaveLower) // Same as Interleave*, except that the return lanes are double-width integers; // this is necessary because the single-lane scalar cannot return two values. 
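// E.g. zipping two u32 vectors a and b yields u64 lanes in which lane i packs
// (a[i], b[i]) from the lower (ZipLower) or upper (ZipUpper) halves, with a's
// element occupying the lower-addressed bytes of each wide lane.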
template >> HWY_API VFromD ZipLower(V a, V b) { return BitCast(DW(), InterleaveLower(a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipLower(DW dw, V a, V b) { return BitCast(dw, InterleaveLower(D(), a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipUpper(DW dw, V a, V b) { return BitCast(dw, InterleaveUpper(D(), a, b)); } // ================================================== MASK template HWY_API bool AllFalse(Simd /* tag */, const Mask128 mask) { typename Mask128::Raw or_sum = 0; for (size_t i = 0; i < N; ++i) { or_sum |= mask.bits[i]; } return or_sum == 0; } template HWY_API bool AllTrue(Simd /* tag */, const Mask128 mask) { constexpr uint64_t kAll = LimitsMax::Raw>(); uint64_t and_sum = kAll; for (size_t i = 0; i < N; ++i) { and_sum &= mask.bits[i]; } return and_sum == kAll; } // `p` points to at least 8 readable bytes, not all of which need be valid. template HWY_API Mask128 LoadMaskBits(Simd /* tag */, const uint8_t* HWY_RESTRICT bits) { Mask128 m; for (size_t i = 0; i < N; ++i) { const size_t bit = size_t{1} << (i & 7); const size_t idx_byte = i >> 3; m.bits[i] = Mask128::FromBool((bits[idx_byte] & bit) != 0); } return m; } // `p` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(Simd /* tag */, const Mask128 mask, uint8_t* bits) { bits[0] = 0; if (N > 8) bits[1] = 0; // N <= 16, so max two bytes for (size_t i = 0; i < N; ++i) { const size_t bit = size_t{1} << (i & 7); const size_t idx_byte = i >> 3; if (mask.bits[i]) { bits[idx_byte] = static_cast(bits[idx_byte] | bit); } } return N > 8 ? 2 : 1; } template HWY_API size_t CountTrue(Simd /* tag */, const Mask128 mask) { size_t count = 0; for (size_t i = 0; i < N; ++i) { count += mask.bits[i] != 0; } return count; } template HWY_API size_t FindKnownFirstTrue(Simd /* tag */, const Mask128 mask) { for (size_t i = 0; i < N; ++i) { if (mask.bits[i] != 0) return i; } HWY_DASSERT(false); return 0; } template HWY_API intptr_t FindFirstTrue(Simd /* tag */, const Mask128 mask) { for (size_t i = 0; i < N; ++i) { if (mask.bits[i] != 0) return static_cast(i); } return intptr_t{-1}; } // ------------------------------ Compress template struct CompressIsPartition { enum { value = (sizeof(T) != 1) }; }; template HWY_API Vec128 Compress(Vec128 v, const Mask128 mask) { size_t count = 0; Vec128 ret; for (size_t i = 0; i < N; ++i) { if (mask.bits[i]) { ret.raw[count++] = v.raw[i]; } } for (size_t i = 0; i < N; ++i) { if (!mask.bits[i]) { ret.raw[count++] = v.raw[i]; } } HWY_DASSERT(count == N); return ret; } // ------------------------------ CompressNot template HWY_API Vec128 CompressNot(Vec128 v, const Mask128 mask) { size_t count = 0; Vec128 ret; for (size_t i = 0; i < N; ++i) { if (!mask.bits[i]) { ret.raw[count++] = v.raw[i]; } } for (size_t i = 0; i < N; ++i) { if (mask.bits[i]) { ret.raw[count++] = v.raw[i]; } } HWY_DASSERT(count == N); return ret; } // ------------------------------ CompressBlocksNot HWY_API Vec128 CompressBlocksNot(Vec128 v, Mask128 /* m */) { return v; } // ------------------------------ CompressBits template HWY_API Vec128 CompressBits(Vec128 v, const uint8_t* HWY_RESTRICT bits) { return Compress(v, LoadMaskBits(Simd(), bits)); } // ------------------------------ CompressStore template HWY_API size_t CompressStore(Vec128 v, const Mask128 mask, Simd /* tag */, T* HWY_RESTRICT unaligned) { size_t count = 0; for (size_t i = 0; i < N; ++i) { if (mask.bits[i]) { unaligned[count++] = v.raw[i]; } } return count; } // ------------------------------ CompressBlendedStore 
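// In this emulation, CompressBlendedStore simply forwards to CompressStore:
// the loop there writes exactly CountTrue(mask) lanes, so no blending with
// the destination's existing contents is needed.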
template HWY_API size_t CompressBlendedStore(Vec128 v, const Mask128 mask, Simd d, T* HWY_RESTRICT unaligned) { return CompressStore(v, mask, d, unaligned); } // ------------------------------ CompressBitsStore template HWY_API size_t CompressBitsStore(Vec128 v, const uint8_t* HWY_RESTRICT bits, Simd d, T* HWY_RESTRICT unaligned) { const Mask128 mask = LoadMaskBits(d, bits); StoreU(Compress(v, mask), d, unaligned); return CountTrue(d, mask); } // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) template HWY_API Vec128 ReorderWidenMulAccumulate(Simd df32, Vec128 a, Vec128 b, const Vec128 sum0, Vec128& sum1) { const Rebind du32; using VU32 = VFromD; const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32 // Avoid ZipLower/Upper so this also works on big-endian systems. const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); const VU32 ao = And(BitCast(du32, a), odd); const VU32 be = ShiftLeft<16>(BitCast(du32, b)); const VU32 bo = And(BitCast(du32, b), odd); sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); } template HWY_API Vec128 ReorderWidenMulAccumulate( Simd d32, Vec128 a, Vec128 b, const Vec128 sum0, Vec128& sum1) { using VI32 = VFromD; // Manual sign extension requires two shifts for even lanes. const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a))); const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b))); const VI32 ao = ShiftRight<16>(BitCast(d32, a)); const VI32 bo = ShiftRight<16>(BitCast(d32, b)); sum1 = Add(Mul(ao, bo), sum1); return Add(Mul(ae, be), sum0); } // ------------------------------ RearrangeToOddPlusEven template HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { return Add(sum0, sum1); } // ================================================== REDUCTIONS template HWY_API Vec128 SumOfLanes(Simd d, const Vec128 v) { T sum = T{0}; for (size_t i = 0; i < N; ++i) { sum += v.raw[i]; } return Set(d, sum); } template HWY_API Vec128 MinOfLanes(Simd d, const Vec128 v) { T min = HighestValue(); for (size_t i = 0; i < N; ++i) { min = HWY_MIN(min, v.raw[i]); } return Set(d, min); } template HWY_API Vec128 MaxOfLanes(Simd d, const Vec128 v) { T max = LowestValue(); for (size_t i = 0; i < N; ++i) { max = HWY_MAX(max, v.raw[i]); } return Set(d, max); } // ================================================== OPS WITH DEPENDENCIES // ------------------------------ MulEven/Odd 64x64 (UpperHalf) HWY_INLINE Vec128 MulEven(const Vec128 a, const Vec128 b) { alignas(16) uint64_t mul[2]; mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]); return Load(Full128(), mul); } HWY_INLINE Vec128 MulOdd(const Vec128 a, const Vec128 b) { alignas(16) uint64_t mul[2]; const Half> d2; mul[0] = Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]); return Load(Full128(), mul); } // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy HWY_AFTER_NAMESPACE();
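// Illustrative usage sketch (comment only, not part of this header). Callers
// normally include hwy/highway.h and let the dispatch machinery select the
// target; when the 128-bit emulation above is active, code such as the
// following (names are placeholders) exercises the ops defined in this file:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<float> d;        // 4 float lanes
//   const auto va = hn::LoadU(d, a);   // a, b, out: float* with >= 4 elements
//   const auto vb = hn::LoadU(d, b);
//   hn::StoreU(hn::MulAdd(va, vb, hn::LoadU(d, out)), d, out);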