// Copyright 2021 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // 256-bit WASM vectors and operations. Experimental. // External include guard in highway.h - see comment there. // For half-width vectors. Already includes base.h and shared-inl.h. #include "hwy/ops/wasm_128-inl.h" HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { template class Vec256 { public: using PrivateT = T; // only for DFromV static constexpr size_t kPrivateN = 32 / sizeof(T); // only for DFromV // Compound assignment. Only usable if there is a corresponding non-member // binary operator overload. For example, only f32 and f64 support division. 
HWY_INLINE Vec256& operator*=(const Vec256 other) { return *this = (*this * other); } HWY_INLINE Vec256& operator/=(const Vec256 other) { return *this = (*this / other); } HWY_INLINE Vec256& operator+=(const Vec256 other) { return *this = (*this + other); } HWY_INLINE Vec256& operator-=(const Vec256 other) { return *this = (*this - other); } HWY_INLINE Vec256& operator&=(const Vec256 other) { return *this = (*this & other); } HWY_INLINE Vec256& operator|=(const Vec256 other) { return *this = (*this | other); } HWY_INLINE Vec256& operator^=(const Vec256 other) { return *this = (*this ^ other); } Vec128 v0; Vec128 v1; }; template struct Mask256 { Mask128 m0; Mask128 m1; }; // ------------------------------ BitCast template HWY_API Vec256 BitCast(Full256 d, Vec256 v) { const Half dh; Vec256 ret; ret.v0 = BitCast(dh, v.v0); ret.v1 = BitCast(dh, v.v1); return ret; } // ------------------------------ Zero template HWY_API Vec256 Zero(Full256 d) { const Half dh; Vec256 ret; ret.v0 = ret.v1 = Zero(dh); return ret; } template using VFromD = decltype(Zero(D())); // ------------------------------ Set // Returns a vector/part with all lanes set to "t". template HWY_API Vec256 Set(Full256 d, const T2 t) { const Half dh; Vec256 ret; ret.v0 = ret.v1 = Set(dh, static_cast(t)); return ret; } template HWY_API Vec256 Undefined(Full256 d) { const Half dh; Vec256 ret; ret.v0 = ret.v1 = Undefined(dh); return ret; } template Vec256 Iota(const Full256 d, const T2 first) { const Half dh; Vec256 ret; ret.v0 = Iota(dh, first); // NB: for floating types the gap between parts might be a bit uneven. 
ret.v1 = Iota(dh, AddWithWraparound(hwy::IsFloatTag(), static_cast(first), Lanes(dh))); return ret; } // ================================================== ARITHMETIC template HWY_API Vec256 operator+(Vec256 a, const Vec256 b) { a.v0 += b.v0; a.v1 += b.v1; return a; } template HWY_API Vec256 operator-(Vec256 a, const Vec256 b) { a.v0 -= b.v0; a.v1 -= b.v1; return a; } // ------------------------------ SumsOf8 HWY_API Vec256 SumsOf8(const Vec256 v) { Vec256 ret; ret.v0 = SumsOf8(v.v0); ret.v1 = SumsOf8(v.v1); return ret; } template HWY_API Vec256 SaturatedAdd(Vec256 a, const Vec256 b) { a.v0 = SaturatedAdd(a.v0, b.v0); a.v1 = SaturatedAdd(a.v1, b.v1); return a; } template HWY_API Vec256 SaturatedSub(Vec256 a, const Vec256 b) { a.v0 = SaturatedSub(a.v0, b.v0); a.v1 = SaturatedSub(a.v1, b.v1); return a; } template HWY_API Vec256 AverageRound(Vec256 a, const Vec256 b) { a.v0 = AverageRound(a.v0, b.v0); a.v1 = AverageRound(a.v1, b.v1); return a; } template HWY_API Vec256 Abs(Vec256 v) { v.v0 = Abs(v.v0); v.v1 = Abs(v.v1); return v; } // ------------------------------ Shift lanes by constant #bits template HWY_API Vec256 ShiftLeft(Vec256 v) { v.v0 = ShiftLeft(v.v0); v.v1 = ShiftLeft(v.v1); return v; } template HWY_API Vec256 ShiftRight(Vec256 v) { v.v0 = ShiftRight(v.v0); v.v1 = ShiftRight(v.v1); return v; } // ------------------------------ RotateRight (ShiftRight, Or) template HWY_API Vec256 RotateRight(const Vec256 v) { constexpr size_t kSizeInBits = sizeof(T) * 8; static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); if (kBits == 0) return v; return Or(ShiftRight(v), ShiftLeft(v)); } // ------------------------------ Shift lanes by same variable #bits template HWY_API Vec256 ShiftLeftSame(Vec256 v, const int bits) { v.v0 = ShiftLeftSame(v.v0, bits); v.v1 = ShiftLeftSame(v.v1, bits); return v; } template HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) { v.v0 = ShiftRightSame(v.v0, bits); v.v1 = ShiftRightSame(v.v1, bits); return v; } // 
------------------------------ Min, Max template HWY_API Vec256 Min(Vec256 a, const Vec256 b) { a.v0 = Min(a.v0, b.v0); a.v1 = Min(a.v1, b.v1); return a; } template HWY_API Vec256 Max(Vec256 a, const Vec256 b) { a.v0 = Max(a.v0, b.v0); a.v1 = Max(a.v1, b.v1); return a; } // ------------------------------ Integer multiplication template HWY_API Vec256 operator*(Vec256 a, const Vec256 b) { a.v0 *= b.v0; a.v1 *= b.v1; return a; } template HWY_API Vec256 MulHigh(Vec256 a, const Vec256 b) { a.v0 = MulHigh(a.v0, b.v0); a.v1 = MulHigh(a.v1, b.v1); return a; } template HWY_API Vec256 MulFixedPoint15(Vec256 a, const Vec256 b) { a.v0 = MulFixedPoint15(a.v0, b.v0); a.v1 = MulFixedPoint15(a.v1, b.v1); return a; } // Cannot use MakeWide because that returns uint128_t for uint64_t, but we want // uint64_t. HWY_API Vec256 MulEven(Vec256 a, const Vec256 b) { Vec256 ret; ret.v0 = MulEven(a.v0, b.v0); ret.v1 = MulEven(a.v1, b.v1); return ret; } HWY_API Vec256 MulEven(Vec256 a, const Vec256 b) { Vec256 ret; ret.v0 = MulEven(a.v0, b.v0); ret.v1 = MulEven(a.v1, b.v1); return ret; } HWY_API Vec256 MulEven(Vec256 a, const Vec256 b) { Vec256 ret; ret.v0 = MulEven(a.v0, b.v0); ret.v1 = MulEven(a.v1, b.v1); return ret; } HWY_API Vec256 MulOdd(Vec256 a, const Vec256 b) { Vec256 ret; ret.v0 = MulOdd(a.v0, b.v0); ret.v1 = MulOdd(a.v1, b.v1); return ret; } // ------------------------------ Negate template HWY_API Vec256 Neg(Vec256 v) { v.v0 = Neg(v.v0); v.v1 = Neg(v.v1); return v; } // ------------------------------ Floating-point division template HWY_API Vec256 operator/(Vec256 a, const Vec256 b) { a.v0 /= b.v0; a.v1 /= b.v1; return a; } // Approximate reciprocal HWY_API Vec256 ApproximateReciprocal(const Vec256 v) { const Vec256 one = Set(Full256(), 1.0f); return one / v; } // Absolute value of difference. 
HWY_API Vec256 AbsDiff(const Vec256 a, const Vec256 b) { return Abs(a - b); } // ------------------------------ Floating-point multiply-add variants // Returns mul * x + add HWY_API Vec256 MulAdd(const Vec256 mul, const Vec256 x, const Vec256 add) { // TODO(eustas): replace, when implemented in WASM. // TODO(eustas): is it wasm_f32x4_qfma? return mul * x + add; } // Returns add - mul * x HWY_API Vec256 NegMulAdd(const Vec256 mul, const Vec256 x, const Vec256 add) { // TODO(eustas): replace, when implemented in WASM. return add - mul * x; } // Returns mul * x - sub HWY_API Vec256 MulSub(const Vec256 mul, const Vec256 x, const Vec256 sub) { // TODO(eustas): replace, when implemented in WASM. // TODO(eustas): is it wasm_f32x4_qfms? return mul * x - sub; } // Returns -mul * x - sub HWY_API Vec256 NegMulSub(const Vec256 mul, const Vec256 x, const Vec256 sub) { // TODO(eustas): replace, when implemented in WASM. return Neg(mul) * x - sub; } // ------------------------------ Floating-point square root template HWY_API Vec256 Sqrt(Vec256 v) { v.v0 = Sqrt(v.v0); v.v1 = Sqrt(v.v1); return v; } // Approximate reciprocal square root HWY_API Vec256 ApproximateReciprocalSqrt(const Vec256 v) { // TODO(eustas): find cheaper a way to calculate this. 
const Vec256 one = Set(Full256(), 1.0f); return one / Sqrt(v); } // ------------------------------ Floating-point rounding // Toward nearest integer, ties to even HWY_API Vec256 Round(Vec256 v) { v.v0 = Round(v.v0); v.v1 = Round(v.v1); return v; } // Toward zero, aka truncate HWY_API Vec256 Trunc(Vec256 v) { v.v0 = Trunc(v.v0); v.v1 = Trunc(v.v1); return v; } // Toward +infinity, aka ceiling HWY_API Vec256 Ceil(Vec256 v) { v.v0 = Ceil(v.v0); v.v1 = Ceil(v.v1); return v; } // Toward -infinity, aka floor HWY_API Vec256 Floor(Vec256 v) { v.v0 = Floor(v.v0); v.v1 = Floor(v.v1); return v; } // ------------------------------ Floating-point classification template HWY_API Mask256 IsNaN(const Vec256 v) { return v != v; } template HWY_API Mask256 IsInf(const Vec256 v) { const Full256 d; const RebindToSigned di; const VFromD vi = BitCast(di, v); // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2()))); } // Returns whether normal/subnormal/zero. template HWY_API Mask256 IsFinite(const Vec256 v) { const Full256 d; const RebindToUnsigned du; const RebindToSigned di; // cheaper than unsigned comparison const VFromD vu = BitCast(du, v); // 'Shift left' to clear the sign bit, then right so we can compare with the // max exponent (cannot compare with MaxExponentTimes2 directly because it is // negative and non-negative floats would be greater). const VFromD exp = BitCast(di, ShiftRight() + 1>(Add(vu, vu))); return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); } // ================================================== COMPARE // Comparisons fill a lane with 1-bits if the condition is true, else 0. 
template HWY_API Mask256 RebindMask(Full256 /*tag*/, Mask256 m) { static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); return Mask256{Mask128{m.m0.raw}, Mask128{m.m1.raw}}; } template HWY_API Mask256 TestBit(Vec256 v, Vec256 bit) { static_assert(!hwy::IsFloat(), "Only integer vectors supported"); return (v & bit) == bit; } template HWY_API Mask256 operator==(Vec256 a, const Vec256 b) { Mask256 m; m.m0 = operator==(a.v0, b.v0); m.m1 = operator==(a.v1, b.v1); return m; } template HWY_API Mask256 operator!=(Vec256 a, const Vec256 b) { Mask256 m; m.m0 = operator!=(a.v0, b.v0); m.m1 = operator!=(a.v1, b.v1); return m; } template HWY_API Mask256 operator<(Vec256 a, const Vec256 b) { Mask256 m; m.m0 = operator<(a.v0, b.v0); m.m1 = operator<(a.v1, b.v1); return m; } template HWY_API Mask256 operator>(Vec256 a, const Vec256 b) { Mask256 m; m.m0 = operator>(a.v0, b.v0); m.m1 = operator>(a.v1, b.v1); return m; } template HWY_API Mask256 operator<=(Vec256 a, const Vec256 b) { Mask256 m; m.m0 = operator<=(a.v0, b.v0); m.m1 = operator<=(a.v1, b.v1); return m; } template HWY_API Mask256 operator>=(Vec256 a, const Vec256 b) { Mask256 m; m.m0 = operator>=(a.v0, b.v0); m.m1 = operator>=(a.v1, b.v1); return m; } // ------------------------------ FirstN (Iota, Lt) template HWY_API Mask256 FirstN(const Full256 d, size_t num) { const RebindToSigned di; // Signed comparisons may be cheaper. 
return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); } // ================================================== LOGICAL template HWY_API Vec256 Not(Vec256 v) { v.v0 = Not(v.v0); v.v1 = Not(v.v1); return v; } template HWY_API Vec256 And(Vec256 a, Vec256 b) { a.v0 = And(a.v0, b.v0); a.v1 = And(a.v1, b.v1); return a; } template HWY_API Vec256 AndNot(Vec256 not_mask, Vec256 mask) { not_mask.v0 = AndNot(not_mask.v0, mask.v0); not_mask.v1 = AndNot(not_mask.v1, mask.v1); return not_mask; } template HWY_API Vec256 Or(Vec256 a, Vec256 b) { a.v0 = Or(a.v0, b.v0); a.v1 = Or(a.v1, b.v1); return a; } template HWY_API Vec256 Xor(Vec256 a, Vec256 b) { a.v0 = Xor(a.v0, b.v0); a.v1 = Xor(a.v1, b.v1); return a; } template HWY_API Vec256 Xor3(Vec256 x1, Vec256 x2, Vec256 x3) { return Xor(x1, Xor(x2, x3)); } template HWY_API Vec256 Or3(Vec256 o1, Vec256 o2, Vec256 o3) { return Or(o1, Or(o2, o3)); } template HWY_API Vec256 OrAnd(Vec256 o, Vec256 a1, Vec256 a2) { return Or(o, And(a1, a2)); } template HWY_API Vec256 IfVecThenElse(Vec256 mask, Vec256 yes, Vec256 no) { return IfThenElse(MaskFromVec(mask), yes, no); } // ------------------------------ Operator overloads (internal-only if float) template HWY_API Vec256 operator&(const Vec256 a, const Vec256 b) { return And(a, b); } template HWY_API Vec256 operator|(const Vec256 a, const Vec256 b) { return Or(a, b); } template HWY_API Vec256 operator^(const Vec256 a, const Vec256 b) { return Xor(a, b); } // ------------------------------ CopySign template HWY_API Vec256 CopySign(const Vec256 magn, const Vec256 sign) { static_assert(IsFloat(), "Only makes sense for floating-point"); const auto msb = SignBit(Full256()); return Or(AndNot(msb, magn), And(msb, sign)); } template HWY_API Vec256 CopySignToAbs(const Vec256 abs, const Vec256 sign) { static_assert(IsFloat(), "Only makes sense for floating-point"); return Or(abs, And(SignBit(Full256()), sign)); } // ------------------------------ Mask // Mask and Vec are the same (true = 
FF..FF). template HWY_API Mask256 MaskFromVec(const Vec256 v) { Mask256 m; m.m0 = MaskFromVec(v.v0); m.m1 = MaskFromVec(v.v1); return m; } template HWY_API Vec256 VecFromMask(Full256 d, Mask256 m) { const Half dh; Vec256 v; v.v0 = VecFromMask(dh, m.m0); v.v1 = VecFromMask(dh, m.m1); return v; } // mask ? yes : no template HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, Vec256 no) { yes.v0 = IfThenElse(mask.m0, yes.v0, no.v0); yes.v1 = IfThenElse(mask.m1, yes.v1, no.v1); return yes; } // mask ? yes : 0 template HWY_API Vec256 IfThenElseZero(Mask256 mask, Vec256 yes) { return yes & VecFromMask(Full256(), mask); } // mask ? 0 : no template HWY_API Vec256 IfThenZeroElse(Mask256 mask, Vec256 no) { return AndNot(VecFromMask(Full256(), mask), no); } template HWY_API Vec256 IfNegativeThenElse(Vec256 v, Vec256 yes, Vec256 no) { v.v0 = IfNegativeThenElse(v.v0, yes.v0, no.v0); v.v1 = IfNegativeThenElse(v.v1, yes.v1, no.v1); return v; } template HWY_API Vec256 ZeroIfNegative(Vec256 v) { return IfThenZeroElse(v < Zero(Full256()), v); } // ------------------------------ Mask logical template HWY_API Mask256 Not(const Mask256 m) { return MaskFromVec(Not(VecFromMask(Full256(), m))); } template HWY_API Mask256 And(const Mask256 a, Mask256 b) { const Full256 d; return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask256 AndNot(const Mask256 a, Mask256 b) { const Full256 d; return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask256 Or(const Mask256 a, Mask256 b) { const Full256 d; return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask256 Xor(const Mask256 a, Mask256 b) { const Full256 d; return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask256 ExclusiveNeither(const Mask256 a, Mask256 b) { const Full256 d; return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); } // ------------------------------ Shl (BroadcastSignBit, IfThenElse) template 
HWY_API Vec256 operator<<(Vec256 v, const Vec256 bits) { v.v0 = operator<<(v.v0, bits.v0); v.v1 = operator<<(v.v1, bits.v1); return v; } // ------------------------------ Shr (BroadcastSignBit, IfThenElse) template HWY_API Vec256 operator>>(Vec256 v, const Vec256 bits) { v.v0 = operator>>(v.v0, bits.v0); v.v1 = operator>>(v.v1, bits.v1); return v; } // ------------------------------ BroadcastSignBit (compare, VecFromMask) template HWY_API Vec256 BroadcastSignBit(const Vec256 v) { return ShiftRight(v); } HWY_API Vec256 BroadcastSignBit(const Vec256 v) { const Full256 d; return VecFromMask(d, v < Zero(d)); } // ================================================== MEMORY // ------------------------------ Load template HWY_API Vec256 Load(Full256 d, const T* HWY_RESTRICT aligned) { const Half dh; Vec256 ret; ret.v0 = Load(dh, aligned); ret.v1 = Load(dh, aligned + Lanes(dh)); return ret; } template HWY_API Vec256 MaskedLoad(Mask256 m, Full256 d, const T* HWY_RESTRICT aligned) { return IfThenElseZero(m, Load(d, aligned)); } // LoadU == Load. template HWY_API Vec256 LoadU(Full256 d, const T* HWY_RESTRICT p) { return Load(d, p); } template HWY_API Vec256 LoadDup128(Full256 d, const T* HWY_RESTRICT p) { const Half dh; Vec256 ret; ret.v0 = ret.v1 = Load(dh, p); return ret; } // ------------------------------ Store template HWY_API void Store(Vec256 v, Full256 d, T* HWY_RESTRICT aligned) { const Half dh; Store(v.v0, dh, aligned); Store(v.v1, dh, aligned + Lanes(dh)); } // StoreU == Store. template HWY_API void StoreU(Vec256 v, Full256 d, T* HWY_RESTRICT p) { Store(v, d, p); } template HWY_API void BlendedStore(Vec256 v, Mask256 m, Full256 d, T* HWY_RESTRICT p) { StoreU(IfThenElse(m, v, LoadU(d, p)), d, p); } // ------------------------------ Stream template HWY_API void Stream(Vec256 v, Full256 d, T* HWY_RESTRICT aligned) { // Same as aligned stores. 
Store(v, d, aligned); } // ------------------------------ Scatter (Store) template HWY_API void ScatterOffset(Vec256 v, Full256 d, T* HWY_RESTRICT base, const Vec256 offset) { constexpr size_t N = 32 / sizeof(T); static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); alignas(32) T lanes[N]; Store(v, d, lanes); alignas(32) Offset offset_lanes[N]; Store(offset, Full256(), offset_lanes); uint8_t* base_bytes = reinterpret_cast(base); for (size_t i = 0; i < N; ++i) { CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); } } template HWY_API void ScatterIndex(Vec256 v, Full256 d, T* HWY_RESTRICT base, const Vec256 index) { constexpr size_t N = 32 / sizeof(T); static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); alignas(32) T lanes[N]; Store(v, d, lanes); alignas(32) Index index_lanes[N]; Store(index, Full256(), index_lanes); for (size_t i = 0; i < N; ++i) { base[index_lanes[i]] = lanes[i]; } } // ------------------------------ Gather (Load/Store) template HWY_API Vec256 GatherOffset(const Full256 d, const T* HWY_RESTRICT base, const Vec256 offset) { constexpr size_t N = 32 / sizeof(T); static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); alignas(32) Offset offset_lanes[N]; Store(offset, Full256(), offset_lanes); alignas(32) T lanes[N]; const uint8_t* base_bytes = reinterpret_cast(base); for (size_t i = 0; i < N; ++i) { CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); } return Load(d, lanes); } template HWY_API Vec256 GatherIndex(const Full256 d, const T* HWY_RESTRICT base, const Vec256 index) { constexpr size_t N = 32 / sizeof(T); static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); alignas(32) Index index_lanes[N]; Store(index, Full256(), index_lanes); alignas(32) T lanes[N]; for (size_t i = 0; i < N; ++i) { lanes[i] = base[index_lanes[i]]; } return Load(d, lanes); } // ================================================== SWIZZLE // ------------------------------ ExtractLane template HWY_API 
T ExtractLane(const Vec256 v, size_t i) { alignas(32) T lanes[32 / sizeof(T)]; Store(v, Full256(), lanes); return lanes[i]; } // ------------------------------ InsertLane template HWY_API Vec256 InsertLane(const Vec256 v, size_t i, T t) { Full256 d; alignas(32) T lanes[32 / sizeof(T)]; Store(v, d, lanes); lanes[i] = t; return Load(d, lanes); } // ------------------------------ LowerHalf template HWY_API Vec128 LowerHalf(Full128 /* tag */, Vec256 v) { return v.v0; } template HWY_API Vec128 LowerHalf(Vec256 v) { return v.v0; } // ------------------------------ GetLane (LowerHalf) template HWY_API T GetLane(const Vec256 v) { return GetLane(LowerHalf(v)); } // ------------------------------ ShiftLeftBytes template HWY_API Vec256 ShiftLeftBytes(Full256 d, Vec256 v) { const Half dh; v.v0 = ShiftLeftBytes(dh, v.v0); v.v1 = ShiftLeftBytes(dh, v.v1); return v; } template HWY_API Vec256 ShiftLeftBytes(Vec256 v) { return ShiftLeftBytes(Full256(), v); } // ------------------------------ ShiftLeftLanes template HWY_API Vec256 ShiftLeftLanes(Full256 d, const Vec256 v) { const Repartition d8; return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); } template HWY_API Vec256 ShiftLeftLanes(const Vec256 v) { return ShiftLeftLanes(Full256(), v); } // ------------------------------ ShiftRightBytes template HWY_API Vec256 ShiftRightBytes(Full256 d, Vec256 v) { const Half dh; v.v0 = ShiftRightBytes(dh, v.v0); v.v1 = ShiftRightBytes(dh, v.v1); return v; } // ------------------------------ ShiftRightLanes template HWY_API Vec256 ShiftRightLanes(Full256 d, const Vec256 v) { const Repartition d8; return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); } // ------------------------------ UpperHalf (ShiftRightBytes) template HWY_API Vec128 UpperHalf(Full128 /* tag */, const Vec256 v) { return v.v1; } // ------------------------------ CombineShiftRightBytes template > HWY_API V CombineShiftRightBytes(Full256 d, V hi, V lo) { const Half dh; hi.v0 = CombineShiftRightBytes(dh, hi.v0, lo.v0); hi.v1 = 
CombineShiftRightBytes(dh, hi.v1, lo.v1); return hi; } // ------------------------------ Broadcast/splat any lane template HWY_API Vec256 Broadcast(const Vec256 v) { Vec256 ret; ret.v0 = Broadcast(v.v0); ret.v1 = Broadcast(v.v1); return ret; } // ------------------------------ TableLookupBytes // Both full template HWY_API Vec256 TableLookupBytes(const Vec256 bytes, Vec256 from) { from.v0 = TableLookupBytes(bytes.v0, from.v0); from.v1 = TableLookupBytes(bytes.v1, from.v1); return from; } // Partial index vector template HWY_API Vec128 TableLookupBytes(const Vec256 bytes, const Vec128 from) { // First expand to full 128, then 256. const auto from_256 = ZeroExtendVector(Full256(), Vec128{from.raw}); const auto tbl_full = TableLookupBytes(bytes, from_256); // Shrink to 128, then partial. return Vec128{LowerHalf(Full128(), tbl_full).raw}; } // Partial table vector template HWY_API Vec256 TableLookupBytes(const Vec128 bytes, const Vec256 from) { // First expand to full 128, then 256. const auto bytes_256 = ZeroExtendVector(Full256(), Vec128{bytes.raw}); return TableLookupBytes(bytes_256, from); } // Partial both are handled by wasm_128. template HWY_API VI TableLookupBytesOr0(const V bytes, VI from) { // wasm out-of-bounds policy already zeros, so TableLookupBytes is fine. 
return TableLookupBytes(bytes, from); } // ------------------------------ Hard-coded shuffles template HWY_API Vec256 Shuffle01(Vec256 v) { v.v0 = Shuffle01(v.v0); v.v1 = Shuffle01(v.v1); return v; } template HWY_API Vec256 Shuffle2301(Vec256 v) { v.v0 = Shuffle2301(v.v0); v.v1 = Shuffle2301(v.v1); return v; } template HWY_API Vec256 Shuffle1032(Vec256 v) { v.v0 = Shuffle1032(v.v0); v.v1 = Shuffle1032(v.v1); return v; } template HWY_API Vec256 Shuffle0321(Vec256 v) { v.v0 = Shuffle0321(v.v0); v.v1 = Shuffle0321(v.v1); return v; } template HWY_API Vec256 Shuffle2103(Vec256 v) { v.v0 = Shuffle2103(v.v0); v.v1 = Shuffle2103(v.v1); return v; } template HWY_API Vec256 Shuffle0123(Vec256 v) { v.v0 = Shuffle0123(v.v0); v.v1 = Shuffle0123(v.v1); return v; } // Used by generic_ops-inl.h namespace detail { template HWY_API Vec256 Shuffle2301(Vec256 a, const Vec256 b) { a.v0 = Shuffle2301(a.v0, b.v0); a.v1 = Shuffle2301(a.v1, b.v1); return a; } template HWY_API Vec256 Shuffle1230(Vec256 a, const Vec256 b) { a.v0 = Shuffle1230(a.v0, b.v0); a.v1 = Shuffle1230(a.v1, b.v1); return a; } template HWY_API Vec256 Shuffle3012(Vec256 a, const Vec256 b) { a.v0 = Shuffle3012(a.v0, b.v0); a.v1 = Shuffle3012(a.v1, b.v1); return a; } } // namespace detail // ------------------------------ TableLookupLanes // Returned by SetTableIndices for use by TableLookupLanes. 
template struct Indices256 { __v128_u i0; __v128_u i1; }; template HWY_API Indices256 IndicesFromVec(Full256 /* tag */, Vec256 vec) { static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); Indices256 ret; ret.i0 = vec.v0.raw; ret.i1 = vec.v1.raw; return ret; } template HWY_API Indices256 SetTableIndices(Full256 d, const TI* idx) { const Rebind di; return IndicesFromVec(d, LoadU(di, idx)); } template HWY_API Vec256 TableLookupLanes(const Vec256 v, Indices256 idx) { using TU = MakeUnsigned; const Full128 dh; const Full128 duh; constexpr size_t kLanesPerHalf = 16 / sizeof(TU); const Vec128 vi0{idx.i0}; const Vec128 vi1{idx.i1}; const Vec128 mask = Set(duh, static_cast(kLanesPerHalf - 1)); const Vec128 vmod0 = vi0 & mask; const Vec128 vmod1 = vi1 & mask; // If ANDing did not change the index, it is for the lower half. const Mask128 is_lo0 = RebindMask(dh, vi0 == vmod0); const Mask128 is_lo1 = RebindMask(dh, vi1 == vmod1); const Indices128 mod0 = IndicesFromVec(dh, vmod0); const Indices128 mod1 = IndicesFromVec(dh, vmod1); Vec256 ret; ret.v0 = IfThenElse(is_lo0, TableLookupLanes(v.v0, mod0), TableLookupLanes(v.v1, mod0)); ret.v1 = IfThenElse(is_lo1, TableLookupLanes(v.v0, mod1), TableLookupLanes(v.v1, mod1)); return ret; } template HWY_API Vec256 TableLookupLanesOr0(Vec256 v, Indices256 idx) { // The out of bounds behavior will already zero lanes. return TableLookupLanesOr0(v, idx); } // ------------------------------ Reverse template HWY_API Vec256 Reverse(Full256 d, const Vec256 v) { const Half dh; Vec256 ret; ret.v1 = Reverse(dh, v.v0); // note reversed v1 member order ret.v0 = Reverse(dh, v.v1); return ret; } // ------------------------------ Reverse2 template HWY_API Vec256 Reverse2(Full256 d, Vec256 v) { const Half dh; v.v0 = Reverse2(dh, v.v0); v.v1 = Reverse2(dh, v.v1); return v; } // ------------------------------ Reverse4 // Each block has only 2 lanes, so swap blocks and their lanes. 
template HWY_API Vec256 Reverse4(Full256 d, const Vec256 v) { const Half dh; Vec256 ret; ret.v0 = Reverse2(dh, v.v1); // swapped ret.v1 = Reverse2(dh, v.v0); return ret; } template HWY_API Vec256 Reverse4(Full256 d, Vec256 v) { const Half dh; v.v0 = Reverse4(dh, v.v0); v.v1 = Reverse4(dh, v.v1); return v; } // ------------------------------ Reverse8 template HWY_API Vec256 Reverse8(Full256 /* tag */, Vec256 /* v */) { HWY_ASSERT(0); // don't have 8 u64 lanes } // Each block has only 4 lanes, so swap blocks and their lanes. template HWY_API Vec256 Reverse8(Full256 d, const Vec256 v) { const Half dh; Vec256 ret; ret.v0 = Reverse4(dh, v.v1); // swapped ret.v1 = Reverse4(dh, v.v0); return ret; } template // 1 or 2 bytes HWY_API Vec256 Reverse8(Full256 d, Vec256 v) { const Half dh; v.v0 = Reverse8(dh, v.v0); v.v1 = Reverse8(dh, v.v1); return v; } // ------------------------------ InterleaveLower template HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { a.v0 = InterleaveLower(a.v0, b.v0); a.v1 = InterleaveLower(a.v1, b.v1); return a; } // wasm_128 already defines a template with D, V, V args. // ------------------------------ InterleaveUpper (UpperHalf) template > HWY_API V InterleaveUpper(Full256 d, V a, V b) { const Half dh; a.v0 = InterleaveUpper(dh, a.v0, b.v0); a.v1 = InterleaveUpper(dh, a.v1, b.v1); return a; } // ------------------------------ ZipLower/ZipUpper (InterleaveLower) // Same as Interleave*, except that the return lanes are double-width integers; // this is necessary because the single-lane scalar cannot return two values. 
template >> HWY_API VFromD ZipLower(Vec256 a, Vec256 b) { return BitCast(DW(), InterleaveLower(a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipLower(DW dw, Vec256 a, Vec256 b) { return BitCast(dw, InterleaveLower(D(), a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipUpper(DW dw, Vec256 a, Vec256 b) { return BitCast(dw, InterleaveUpper(D(), a, b)); } // ================================================== COMBINE // ------------------------------ Combine (InterleaveLower) template HWY_API Vec256 Combine(Full256 /* d */, Vec128 hi, Vec128 lo) { Vec256 ret; ret.v1 = hi; ret.v0 = lo; return ret; } // ------------------------------ ZeroExtendVector (Combine) template HWY_API Vec256 ZeroExtendVector(Full256 d, Vec128 lo) { const Half dh; return Combine(d, Zero(dh), lo); } // ------------------------------ ConcatLowerLower template HWY_API Vec256 ConcatLowerLower(Full256 /* tag */, const Vec256 hi, const Vec256 lo) { Vec256 ret; ret.v1 = hi.v0; ret.v0 = lo.v0; return ret; } // ------------------------------ ConcatUpperUpper template HWY_API Vec256 ConcatUpperUpper(Full256 /* tag */, const Vec256 hi, const Vec256 lo) { Vec256 ret; ret.v1 = hi.v1; ret.v0 = lo.v1; return ret; } // ------------------------------ ConcatLowerUpper template HWY_API Vec256 ConcatLowerUpper(Full256 /* tag */, const Vec256 hi, const Vec256 lo) { Vec256 ret; ret.v1 = hi.v0; ret.v0 = lo.v1; return ret; } // ------------------------------ ConcatUpperLower template HWY_API Vec256 ConcatUpperLower(Full256 /* tag */, const Vec256 hi, const Vec256 lo) { Vec256 ret; ret.v1 = hi.v1; ret.v0 = lo.v0; return ret; } // ------------------------------ ConcatOdd template HWY_API Vec256 ConcatOdd(Full256 d, const Vec256 hi, const Vec256 lo) { const Half dh; Vec256 ret; ret.v0 = ConcatOdd(dh, lo.v1, lo.v0); ret.v1 = ConcatOdd(dh, hi.v1, hi.v0); return ret; } // ------------------------------ ConcatEven template HWY_API Vec256 ConcatEven(Full256 d, const Vec256 hi, const Vec256 
lo) { const Half dh; Vec256 ret; ret.v0 = ConcatEven(dh, lo.v1, lo.v0); ret.v1 = ConcatEven(dh, hi.v1, hi.v0); return ret; } // ------------------------------ DupEven template HWY_API Vec256 DupEven(Vec256 v) { v.v0 = DupEven(v.v0); v.v1 = DupEven(v.v1); return v; } // ------------------------------ DupOdd template HWY_API Vec256 DupOdd(Vec256 v) { v.v0 = DupOdd(v.v0); v.v1 = DupOdd(v.v1); return v; } // ------------------------------ OddEven template HWY_API Vec256 OddEven(Vec256 a, const Vec256 b) { a.v0 = OddEven(a.v0, b.v0); a.v1 = OddEven(a.v1, b.v1); return a; } // ------------------------------ OddEvenBlocks template HWY_API Vec256 OddEvenBlocks(Vec256 odd, Vec256 even) { odd.v0 = even.v0; return odd; } // ------------------------------ SwapAdjacentBlocks template HWY_API Vec256 SwapAdjacentBlocks(Vec256 v) { Vec256 ret; ret.v0 = v.v1; // swapped order ret.v1 = v.v0; return ret; } // ------------------------------ ReverseBlocks template HWY_API Vec256 ReverseBlocks(Full256 /* tag */, const Vec256 v) { return SwapAdjacentBlocks(v); // 2 blocks, so Swap = Reverse } // ================================================== CONVERT // ------------------------------ Promotions (part w/ narrow lanes -> full) namespace detail { // Unsigned: zero-extend. 
HWY_API Vec128 PromoteUpperTo(Full128 /* tag */, const Vec128 v) { return Vec128{wasm_u16x8_extend_high_u8x16(v.raw)}; } HWY_API Vec128 PromoteUpperTo(Full128 /* tag */, const Vec128 v) { return Vec128{ wasm_u32x4_extend_high_u16x8(wasm_u16x8_extend_high_u8x16(v.raw))}; } HWY_API Vec128 PromoteUpperTo(Full128 /* tag */, const Vec128 v) { return Vec128{wasm_u16x8_extend_high_u8x16(v.raw)}; } HWY_API Vec128 PromoteUpperTo(Full128 /* tag */, const Vec128 v) { return Vec128{ wasm_u32x4_extend_high_u16x8(wasm_u16x8_extend_high_u8x16(v.raw))}; } HWY_API Vec128 PromoteUpperTo(Full128 /* tag */, const Vec128 v) { return Vec128{wasm_u32x4_extend_high_u16x8(v.raw)}; } HWY_API Vec128 PromoteUpperTo(Full128 /* tag */, const Vec128 v) { return Vec128{wasm_u64x2_extend_high_u32x4(v.raw)}; } HWY_API Vec128 PromoteUpperTo(Full128 /* tag */, const Vec128 v) { return Vec128{wasm_u32x4_extend_high_u16x8(v.raw)}; } // Signed: replicate sign bit. HWY_API Vec128 PromoteUpperTo(Full128 /* tag */, const Vec128 v) { return Vec128{wasm_i16x8_extend_high_i8x16(v.raw)}; } HWY_API Vec128 PromoteUpperTo(Full128 /* tag */, const Vec128 v) { return Vec128{ wasm_i32x4_extend_high_i16x8(wasm_i16x8_extend_high_i8x16(v.raw))}; } HWY_API Vec128 PromoteUpperTo(Full128 /* tag */, const Vec128 v) { return Vec128{wasm_i32x4_extend_high_i16x8(v.raw)}; } HWY_API Vec128 PromoteUpperTo(Full128 /* tag */, const Vec128 v) { return Vec128{wasm_i64x2_extend_high_i32x4(v.raw)}; } HWY_API Vec128 PromoteUpperTo(Full128 dd, const Vec128 v) { // There is no wasm_f64x2_convert_high_i32x4. const Full64 di32h; return PromoteTo(dd, UpperHalf(di32h, v)); } HWY_API Vec128 PromoteUpperTo(Full128 df32, const Vec128 v) { const RebindToSigned di32; const RebindToUnsigned du32; // Expand to u32 so we can shift. 
const auto bits16 = PromoteUpperTo(du32, Vec128{v.raw}); const auto sign = ShiftRight<15>(bits16); const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F); const auto mantissa = bits16 & Set(du32, 0x3FF); const auto subnormal = BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) * Set(df32, 1.0f / 16384 / 1024)); const auto biased_exp32 = biased_exp + Set(du32, 127 - 15); const auto mantissa32 = ShiftLeft<23 - 10>(mantissa); const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32; const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal); return BitCast(df32, ShiftLeft<31>(sign) | bits32); } HWY_API Vec128 PromoteUpperTo(Full128 df32, const Vec128 v) { const Full128 du16; const RebindToSigned di32; return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v)))); } } // namespace detail template HWY_API Vec256 PromoteTo(Full256 d, const Vec128 v) { const Half dh; Vec256 ret; ret.v0 = PromoteTo(dh, LowerHalf(v)); ret.v1 = detail::PromoteUpperTo(dh, v); return ret; } // This is the only 4x promotion from 8 to 32-bit. 
template HWY_API Vec256 PromoteTo(Full256 d, const Vec64 v) { const Half dh; const Rebind, decltype(d)> d2; // 16-bit lanes const auto v16 = PromoteTo(d2, v); Vec256 ret; ret.v0 = PromoteTo(dh, LowerHalf(v16)); ret.v1 = detail::PromoteUpperTo(dh, v16); return ret; } // ------------------------------ DemoteTo HWY_API Vec128 DemoteTo(Full128 /* tag */, const Vec256 v) { return Vec128{wasm_u16x8_narrow_i32x4(v.v0.raw, v.v1.raw)}; } HWY_API Vec128 DemoteTo(Full128 /* tag */, const Vec256 v) { return Vec128{wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw)}; } HWY_API Vec64 DemoteTo(Full64 /* tag */, const Vec256 v) { const auto intermediate = wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw); return Vec64{wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; } HWY_API Vec128 DemoteTo(Full128 /* tag */, const Vec256 v) { return Vec128{wasm_u8x16_narrow_i16x8(v.v0.raw, v.v1.raw)}; } HWY_API Vec64 DemoteTo(Full64 /* tag */, const Vec256 v) { const auto intermediate = wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw); return Vec64{wasm_i8x16_narrow_i16x8(intermediate, intermediate)}; } HWY_API Vec128 DemoteTo(Full128 /* tag */, const Vec256 v) { return Vec128{wasm_i8x16_narrow_i16x8(v.v0.raw, v.v1.raw)}; } HWY_API Vec128 DemoteTo(Full128 di, const Vec256 v) { const Vec64 lo{wasm_i32x4_trunc_sat_f64x2_zero(v.v0.raw)}; const Vec64 hi{wasm_i32x4_trunc_sat_f64x2_zero(v.v1.raw)}; return Combine(di, hi, lo); } HWY_API Vec128 DemoteTo(Full128 d16, const Vec256 v) { const Half d16h; const Vec64 lo = DemoteTo(d16h, v.v0); const Vec64 hi = DemoteTo(d16h, v.v1); return Combine(d16, hi, lo); } HWY_API Vec128 DemoteTo(Full128 dbf16, const Vec256 v) { const Half dbf16h; const Vec64 lo = DemoteTo(dbf16h, v.v0); const Vec64 hi = DemoteTo(dbf16h, v.v1); return Combine(dbf16, hi, lo); } // For already range-limited input [0, 255]. 
HWY_API Vec64<uint8_t> U8FromU32(const Vec256<uint32_t> v) {
  const Full64<uint8_t> du8;
  const Full256<int32_t> di32;  // no unsigned DemoteTo
  return DemoteTo(du8, BitCast(di32, v));
}

// ------------------------------ Truncations

// Keep only the least-significant bytes of each lane via a byte shuffle of
// the two halves; extra index repetitions fill the unused upper lanes.
HWY_API Vec32<uint8_t> TruncateTo(Full32<uint8_t> /* tag */,
                                  const Vec256<uint64_t> v) {
  return Vec32<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 8, 16, 24, 0,
                                           8, 16, 24, 0, 8, 16, 24, 0, 8, 16,
                                           24)};
}

HWY_API Vec64<uint16_t> TruncateTo(Full64<uint16_t> /* tag */,
                                   const Vec256<uint64_t> v) {
  return Vec64<uint16_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 8, 9, 16,
                                            17, 24, 25, 0, 1, 8, 9, 16, 17, 24,
                                            25)};
}

HWY_API Vec128<uint32_t> TruncateTo(Full128<uint32_t> /* tag */,
                                    const Vec256<uint64_t> v) {
  return Vec128<uint32_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 2, 3, 8,
                                             9, 10, 11, 16, 17, 18, 19, 24, 25,
                                             26, 27)};
}

HWY_API Vec64<uint8_t> TruncateTo(Full64<uint8_t> /* tag */,
                                  const Vec256<uint32_t> v) {
  return Vec64<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 4, 8, 12, 16,
                                           20, 24, 28, 0, 4, 8, 12, 16, 20, 24,
                                           28)};
}

HWY_API Vec128<uint16_t> TruncateTo(Full128<uint16_t> /* tag */,
                                    const Vec256<uint32_t> v) {
  return Vec128<uint16_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 4, 5, 8,
                                             9, 12, 13, 16, 17, 20, 21, 24, 25,
                                             28, 29)};
}

HWY_API Vec128<uint8_t> TruncateTo(Full128<uint8_t> /* tag */,
                                   const Vec256<uint16_t> v) {
  return Vec128<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 2, 4, 6, 8,
                                            10, 12, 14, 16, 18, 20, 22, 24, 26,
                                            28, 30)};
}

// ------------------------------ ReorderDemote2To

// bf16 demotion keeps the upper 16 bits of each f32; interleaved order is
// acceptable per the ReorderDemote2To contract.
HWY_API Vec256<bfloat16_t> ReorderDemote2To(Full256<bfloat16_t> dbf16,
                                            Vec256<float> a, Vec256<float> b) {
  const RebindToUnsigned<decltype(dbf16)> du16;
  return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a)));
}

HWY_API Vec256<int16_t> ReorderDemote2To(Full256<int16_t> d16,
                                         Vec256<int32_t> a, Vec256<int32_t> b) {
  const Half<decltype(d16)> d16h;
  Vec256<int16_t> demoted;
  demoted.v0 = DemoteTo(d16h, a);
  demoted.v1 = DemoteTo(d16h, b);
  return demoted;
}

// ------------------------------ Convert i32 <=> f32 (Round)

template <typename TTo, typename TFrom>
HWY_API Vec256<TTo> ConvertTo(Full256<TTo> d, const Vec256<TFrom> v) {
  const Half<decltype(d)> dh;
  Vec256<TTo> ret;
  ret.v0 = ConvertTo(dh, v.v0);
  ret.v1 = ConvertTo(dh, v.v1);
  return ret;
}

HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) {
  return ConvertTo(Full256<int32_t>(), Round(v));
}
================================================== MISC // ------------------------------ LoadMaskBits (TestBit) // `p` points to at least 8 readable bytes, not all of which need be valid. template // 4 or 8 bytes HWY_API Mask256 LoadMaskBits(Full256 d, const uint8_t* HWY_RESTRICT bits) { const Half dh; Mask256 ret; ret.m0 = LoadMaskBits(dh, bits); // If size=4, one 128-bit vector has 4 mask bits; otherwise 2 for size=8. // Both halves fit in one byte's worth of mask bits. constexpr size_t kBitsPerHalf = 16 / sizeof(T); const uint8_t bits_upper[8] = {static_cast(bits[0] >> kBitsPerHalf)}; ret.m1 = LoadMaskBits(dh, bits_upper); return ret; } template // 1 or 2 bytes HWY_API Mask256 LoadMaskBits(Full256 d, const uint8_t* HWY_RESTRICT bits) { const Half dh; Mask256 ret; ret.m0 = LoadMaskBits(dh, bits); constexpr size_t kLanesPerHalf = 16 / sizeof(T); constexpr size_t kBytesPerHalf = kLanesPerHalf / 8; static_assert(kBytesPerHalf != 0, "Lane size <= 16 bits => at least 8 lanes"); ret.m1 = LoadMaskBits(dh, bits + kBytesPerHalf); return ret; } // ------------------------------ Mask // `p` points to at least 8 writable bytes. template // 4 or 8 bytes HWY_API size_t StoreMaskBits(const Full256 d, const Mask256 mask, uint8_t* bits) { const Half dh; StoreMaskBits(dh, mask.m0, bits); const uint8_t lo = bits[0]; StoreMaskBits(dh, mask.m1, bits); // If size=4, one 128-bit vector has 4 mask bits; otherwise 2 for size=8. // Both halves fit in one byte's worth of mask bits. 
constexpr size_t kBitsPerHalf = 16 / sizeof(T); bits[0] = static_cast(lo | (bits[0] << kBitsPerHalf)); return (kBitsPerHalf * 2 + 7) / 8; } template // 1 or 2 bytes HWY_API size_t StoreMaskBits(const Full256 d, const Mask256 mask, uint8_t* bits) { const Half dh; constexpr size_t kLanesPerHalf = 16 / sizeof(T); constexpr size_t kBytesPerHalf = kLanesPerHalf / 8; static_assert(kBytesPerHalf != 0, "Lane size <= 16 bits => at least 8 lanes"); StoreMaskBits(dh, mask.m0, bits); StoreMaskBits(dh, mask.m1, bits + kBytesPerHalf); return kBytesPerHalf * 2; } template HWY_API size_t CountTrue(const Full256 d, const Mask256 m) { const Half dh; return CountTrue(dh, m.m0) + CountTrue(dh, m.m1); } template HWY_API bool AllFalse(const Full256 d, const Mask256 m) { const Half dh; return AllFalse(dh, m.m0) && AllFalse(dh, m.m1); } template HWY_API bool AllTrue(const Full256 d, const Mask256 m) { const Half dh; return AllTrue(dh, m.m0) && AllTrue(dh, m.m1); } template HWY_API size_t FindKnownFirstTrue(const Full256 d, const Mask256 mask) { const Half dh; const intptr_t lo = FindFirstTrue(dh, mask.m0); // not known constexpr size_t kLanesPerHalf = 16 / sizeof(T); return lo >= 0 ? static_cast(lo) : kLanesPerHalf + FindKnownFirstTrue(dh, mask.m1); } template HWY_API intptr_t FindFirstTrue(const Full256 d, const Mask256 mask) { const Half dh; const intptr_t lo = FindFirstTrue(dh, mask.m0); const intptr_t hi = FindFirstTrue(dh, mask.m1); if (lo < 0 && hi < 0) return lo; constexpr int kLanesPerHalf = 16 / sizeof(T); return lo >= 0 ? 
lo : hi + kLanesPerHalf; } // ------------------------------ CompressStore template HWY_API size_t CompressStore(const Vec256 v, const Mask256 mask, Full256 d, T* HWY_RESTRICT unaligned) { const Half dh; const size_t count = CompressStore(v.v0, mask.m0, dh, unaligned); const size_t count2 = CompressStore(v.v1, mask.m1, dh, unaligned + count); return count + count2; } // ------------------------------ CompressBlendedStore template HWY_API size_t CompressBlendedStore(const Vec256 v, const Mask256 m, Full256 d, T* HWY_RESTRICT unaligned) { const Half dh; const size_t count = CompressBlendedStore(v.v0, m.m0, dh, unaligned); const size_t count2 = CompressBlendedStore(v.v1, m.m1, dh, unaligned + count); return count + count2; } // ------------------------------ CompressBitsStore template HWY_API size_t CompressBitsStore(const Vec256 v, const uint8_t* HWY_RESTRICT bits, Full256 d, T* HWY_RESTRICT unaligned) { const Mask256 m = LoadMaskBits(d, bits); return CompressStore(v, m, d, unaligned); } // ------------------------------ Compress template HWY_API Vec256 Compress(const Vec256 v, const Mask256 mask) { const Full256 d; alignas(32) T lanes[32 / sizeof(T)] = {}; (void)CompressStore(v, mask, d, lanes); return Load(d, lanes); } // ------------------------------ CompressNot template HWY_API Vec256 CompressNot(Vec256 v, const Mask256 mask) { return Compress(v, Not(mask)); } // ------------------------------ CompressBlocksNot HWY_API Vec256 CompressBlocksNot(Vec256 v, Mask256 mask) { const Full128 dh; // Because the non-selected (mask=1) blocks are undefined, we can return the // input unless mask = 01, in which case we must bring down the upper block. return AllTrue(dh, AndNot(mask.m1, mask.m0)) ? 
SwapAdjacentBlocks(v) : v; } // ------------------------------ CompressBits template HWY_API Vec256 CompressBits(Vec256 v, const uint8_t* HWY_RESTRICT bits) { const Mask256 m = LoadMaskBits(Full256(), bits); return Compress(v, m); } // ------------------------------ LoadInterleaved3/4 // Implemented in generic_ops, we just overload LoadTransposedBlocks3/4. namespace detail { // Input: // 1 0 (<- first block of unaligned) // 3 2 // 5 4 // Output: // 3 0 // 4 1 // 5 2 template HWY_API void LoadTransposedBlocks3(Full256 d, const T* HWY_RESTRICT unaligned, Vec256& A, Vec256& B, Vec256& C) { constexpr size_t N = 32 / sizeof(T); const Vec256 v10 = LoadU(d, unaligned + 0 * N); // 1 0 const Vec256 v32 = LoadU(d, unaligned + 1 * N); const Vec256 v54 = LoadU(d, unaligned + 2 * N); A = ConcatUpperLower(d, v32, v10); B = ConcatLowerUpper(d, v54, v10); C = ConcatUpperLower(d, v54, v32); } // Input (128-bit blocks): // 1 0 (first block of unaligned) // 3 2 // 5 4 // 7 6 // Output: // 4 0 (LSB of A) // 5 1 // 6 2 // 7 3 template HWY_API void LoadTransposedBlocks4(Full256 d, const T* HWY_RESTRICT unaligned, Vec256& A, Vec256& B, Vec256& C, Vec256& D) { constexpr size_t N = 32 / sizeof(T); const Vec256 v10 = LoadU(d, unaligned + 0 * N); const Vec256 v32 = LoadU(d, unaligned + 1 * N); const Vec256 v54 = LoadU(d, unaligned + 2 * N); const Vec256 v76 = LoadU(d, unaligned + 3 * N); A = ConcatLowerLower(d, v54, v10); B = ConcatUpperUpper(d, v54, v10); C = ConcatLowerLower(d, v76, v32); D = ConcatUpperUpper(d, v76, v32); } } // namespace detail // ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower) // Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4. 
namespace detail { // Input (128-bit blocks): // 2 0 (LSB of i) // 3 1 // Output: // 1 0 // 3 2 template HWY_API void StoreTransposedBlocks2(const Vec256 i, const Vec256 j, const Full256 d, T* HWY_RESTRICT unaligned) { constexpr size_t N = 32 / sizeof(T); const auto out0 = ConcatLowerLower(d, j, i); const auto out1 = ConcatUpperUpper(d, j, i); StoreU(out0, d, unaligned + 0 * N); StoreU(out1, d, unaligned + 1 * N); } // Input (128-bit blocks): // 3 0 (LSB of i) // 4 1 // 5 2 // Output: // 1 0 // 3 2 // 5 4 template HWY_API void StoreTransposedBlocks3(const Vec256 i, const Vec256 j, const Vec256 k, Full256 d, T* HWY_RESTRICT unaligned) { constexpr size_t N = 32 / sizeof(T); const auto out0 = ConcatLowerLower(d, j, i); const auto out1 = ConcatUpperLower(d, i, k); const auto out2 = ConcatUpperUpper(d, k, j); StoreU(out0, d, unaligned + 0 * N); StoreU(out1, d, unaligned + 1 * N); StoreU(out2, d, unaligned + 2 * N); } // Input (128-bit blocks): // 4 0 (LSB of i) // 5 1 // 6 2 // 7 3 // Output: // 1 0 // 3 2 // 5 4 // 7 6 template HWY_API void StoreTransposedBlocks4(const Vec256 i, const Vec256 j, const Vec256 k, const Vec256 l, Full256 d, T* HWY_RESTRICT unaligned) { constexpr size_t N = 32 / sizeof(T); // Write lower halves, then upper. 
const auto out0 = ConcatLowerLower(d, j, i); const auto out1 = ConcatLowerLower(d, l, k); StoreU(out0, d, unaligned + 0 * N); StoreU(out1, d, unaligned + 1 * N); const auto out2 = ConcatUpperUpper(d, j, i); const auto out3 = ConcatUpperUpper(d, l, k); StoreU(out2, d, unaligned + 2 * N); StoreU(out3, d, unaligned + 3 * N); } } // namespace detail // ------------------------------ ReorderWidenMulAccumulate template HWY_API Vec256 ReorderWidenMulAccumulate(Full256 d, Vec256 a, Vec256 b, Vec256 sum0, Vec256& sum1) { const Half dh; sum0.v0 = ReorderWidenMulAccumulate(dh, a.v0, b.v0, sum0.v0, sum1.v0); sum0.v1 = ReorderWidenMulAccumulate(dh, a.v1, b.v1, sum0.v1, sum1.v1); return sum0; } // ------------------------------ RearrangeToOddPlusEven template HWY_API Vec256 RearrangeToOddPlusEven(Vec256 sum0, Vec256 sum1) { sum0.v0 = RearrangeToOddPlusEven(sum0.v0, sum1.v0); sum0.v1 = RearrangeToOddPlusEven(sum0.v1, sum1.v1); return sum0; } // ------------------------------ Reductions template HWY_API Vec256 SumOfLanes(Full256 d, const Vec256 v) { const Half dh; const Vec128 lo = SumOfLanes(dh, Add(v.v0, v.v1)); return Combine(d, lo, lo); } template HWY_API Vec256 MinOfLanes(Full256 d, const Vec256 v) { const Half dh; const Vec128 lo = MinOfLanes(dh, Min(v.v0, v.v1)); return Combine(d, lo, lo); } template HWY_API Vec256 MaxOfLanes(Full256 d, const Vec256 v) { const Half dh; const Vec128 lo = MaxOfLanes(dh, Max(v.v0, v.v1)); return Combine(d, lo, lo); } // ------------------------------ Lt128 template HWY_INLINE Mask256 Lt128(Full256 d, Vec256 a, Vec256 b) { const Half dh; Mask256 ret; ret.m0 = Lt128(dh, a.v0, b.v0); ret.m1 = Lt128(dh, a.v1, b.v1); return ret; } template HWY_INLINE Mask256 Lt128Upper(Full256 d, Vec256 a, Vec256 b) { const Half dh; Mask256 ret; ret.m0 = Lt128Upper(dh, a.v0, b.v0); ret.m1 = Lt128Upper(dh, a.v1, b.v1); return ret; } template HWY_INLINE Mask256 Eq128(Full256 d, Vec256 a, Vec256 b) { const Half dh; Mask256 ret; ret.m0 = Eq128(dh, a.v0, b.v0); ret.m1 = 
Eq128(dh, a.v1, b.v1); return ret; } template HWY_INLINE Mask256 Eq128Upper(Full256 d, Vec256 a, Vec256 b) { const Half dh; Mask256 ret; ret.m0 = Eq128Upper(dh, a.v0, b.v0); ret.m1 = Eq128Upper(dh, a.v1, b.v1); return ret; } template HWY_INLINE Mask256 Ne128(Full256 d, Vec256 a, Vec256 b) { const Half dh; Mask256 ret; ret.m0 = Ne128(dh, a.v0, b.v0); ret.m1 = Ne128(dh, a.v1, b.v1); return ret; } template HWY_INLINE Mask256 Ne128Upper(Full256 d, Vec256 a, Vec256 b) { const Half dh; Mask256 ret; ret.m0 = Ne128Upper(dh, a.v0, b.v0); ret.m1 = Ne128Upper(dh, a.v1, b.v1); return ret; } template HWY_INLINE Vec256 Min128(Full256 d, Vec256 a, Vec256 b) { const Half dh; Vec256 ret; ret.v0 = Min128(dh, a.v0, b.v0); ret.v1 = Min128(dh, a.v1, b.v1); return ret; } template HWY_INLINE Vec256 Max128(Full256 d, Vec256 a, Vec256 b) { const Half dh; Vec256 ret; ret.v0 = Max128(dh, a.v0, b.v0); ret.v1 = Max128(dh, a.v1, b.v1); return ret; } template HWY_INLINE Vec256 Min128Upper(Full256 d, Vec256 a, Vec256 b) { const Half dh; Vec256 ret; ret.v0 = Min128Upper(dh, a.v0, b.v0); ret.v1 = Min128Upper(dh, a.v1, b.v1); return ret; } template HWY_INLINE Vec256 Max128Upper(Full256 d, Vec256 a, Vec256 b) { const Half dh; Vec256 ret; ret.v0 = Max128Upper(dh, a.v0, b.v0); ret.v1 = Max128Upper(dh, a.v1, b.v1); return ret; } // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy HWY_AFTER_NAMESPACE();