// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit WASM vectors and operations.
// External include guard in highway.h - see comment there.

#include <stddef.h>
#include <stdint.h>
#include <wasm_simd128.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

#ifdef HWY_WASM_OLD_NAMES
#define wasm_i8x16_shuffle wasm_v8x16_shuffle
#define wasm_i16x8_shuffle wasm_v16x8_shuffle
#define wasm_i32x4_shuffle wasm_v32x4_shuffle
#define wasm_i64x2_shuffle wasm_v64x2_shuffle
#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
#define wasm_u8x16_add_sat wasm_u8x16_add_saturate
#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
#define wasm_u16x8_add_sat wasm_u16x8_add_saturate
#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
#define wasm_i8x16_add_sat wasm_i8x16_add_saturate
#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
#define wasm_i16x8_add_sat wasm_i16x8_add_saturate
#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

#if HWY_TARGET == HWY_WASM_EMU256
template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;
#endif

namespace detail {

template <typename T>
struct Raw128 {
  using type = __v128_u;
};
template <>
struct Raw128<float> {
  using type = __f32x4;
};

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;

// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  typename detail::Raw128<T>::type raw;
};

template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;

// ------------------------------ BitCast

namespace detail {

HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; }
HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) {
  return static_cast<__v128_u>(v);
}
HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) {
  return static_cast<__v128_u>(v);
}

template <typename T, size_t N>
HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
}

// Cannot rely on function overloading because return types differ.
template <typename T>
struct BitCastFromInteger128 {
  HWY_INLINE __v128_u operator()(__v128_u v) { return v; }
};
template <>
struct BitCastFromInteger128<float> {
  HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); }
};

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N, 0> /* tag */,
                                        Vec128<uint8_t, N * sizeof(T)> v) {
  return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
}

}  // namespace detail

template <typename T, size_t N, typename FromT>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
                             Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

// ------------------------------ Zero

// Returns an all-zero vector/part.
template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
  return Vec128<T, N>{wasm_i32x4_splat(0)};
}
template <size_t N>
HWY_API Vec128<float, N> Zero(Simd<float, N, 0> /* tag */) {
  return Vec128<float, N>{wasm_f32x4_splat(0.0f)};
}

template <class D>
using VFromD = decltype(Zero(D()));

// ------------------------------ Set

// Returns a vector/part with all lanes set to "t".
template <size_t N>
HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N, 0> /* tag */,
                               const uint8_t t) {
  return Vec128<uint8_t, N>{wasm_i8x16_splat(static_cast<int8_t>(t))};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N, 0> /* tag */,
                                const uint16_t t) {
  return Vec128<uint16_t, N>{wasm_i16x8_splat(static_cast<int16_t>(t))};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N, 0> /* tag */,
                                const uint32_t t) {
  return Vec128<uint32_t, N>{wasm_i32x4_splat(static_cast<int32_t>(t))};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N, 0> /* tag */,
                                const uint64_t t) {
  return Vec128<uint64_t, N>{wasm_i64x2_splat(static_cast<int64_t>(t))};
}

template <size_t N>
HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N, 0> /* tag */, const int8_t t) {
  return Vec128<int8_t, N>{wasm_i8x16_splat(t)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N, 0> /* tag */,
                               const int16_t t) {
  return Vec128<int16_t, N>{wasm_i16x8_splat(t)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N, 0> /* tag */,
                               const int32_t t) {
  return Vec128<int32_t, N>{wasm_i32x4_splat(t)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N, 0> /* tag */,
                               const int64_t t) {
  return Vec128<int64_t, N>{wasm_i64x2_splat(t)};
}

template <size_t N>
HWY_API Vec128<float, N> Set(Simd<float, N, 0> /* tag */, const float t) {
  return Vec128<float, N>{wasm_f32x4_splat(t)};
}

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")

// Returns a vector with uninitialized elements.
template <typename T, size_t N>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
  return Zero(d);
}

HWY_DIAGNOSTICS(pop)

// Returns a vector with lane i=[0, N) set to "first" + i.
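// For example (illustrative): Iota(d, 5) with four int32_t lanes yields
// {5, 6, 7, 8}, counting up from lane 0.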
template Vec128 Iota(const Simd d, const T2 first) { HWY_ALIGN T lanes[16 / sizeof(T)]; for (size_t i = 0; i < 16 / sizeof(T); ++i) { lanes[i] = AddWithWraparound(hwy::IsFloatTag(), static_cast(first), i); } return Load(d, lanes); } // ================================================== ARITHMETIC // ------------------------------ Addition // Unsigned template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i64x2_add(a.raw, b.raw)}; } // Signed template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i64x2_add(a.raw, b.raw)}; } // Float template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_f32x4_add(a.raw, b.raw)}; } // ------------------------------ Subtraction // Unsigned template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(Vec128 a, Vec128 b) { return Vec128{wasm_i16x8_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i64x2_sub(a.raw, b.raw)}; } // Signed template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i64x2_sub(a.raw, b.raw)}; } // Float template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_f32x4_sub(a.raw, b.raw)}; } // ------------------------------ SaturatedAdd // Returns a + b clamped to the destination range. // Unsigned template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{wasm_u8x16_add_sat(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{wasm_u16x8_add_sat(a.raw, b.raw)}; } // Signed template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_add_sat(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_add_sat(a.raw, b.raw)}; } // ------------------------------ SaturatedSub // Returns a - b clamped to the destination range. 
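// For example (illustrative): with uint8_t lanes, SaturatedSub(Set(d, 10),
// Set(d, 20)) yields 0 in every lane rather than wrapping around, and the
// int8_t overload clamps results to [-128, 127].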
// Unsigned template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{wasm_u8x16_sub_sat(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{wasm_u16x8_sub_sat(a.raw, b.raw)}; } // Signed template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_sub_sat(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_sub_sat(a.raw, b.raw)}; } // ------------------------------ Average // Returns (a + b + 1) / 2 // Unsigned template HWY_API Vec128 AverageRound(const Vec128 a, const Vec128 b) { return Vec128{wasm_u8x16_avgr(a.raw, b.raw)}; } template HWY_API Vec128 AverageRound(const Vec128 a, const Vec128 b) { return Vec128{wasm_u16x8_avgr(a.raw, b.raw)}; } // ------------------------------ Absolute value // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_i8x16_abs(v.raw)}; } template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_i16x8_abs(v.raw)}; } template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_i32x4_abs(v.raw)}; } template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_i64x2_abs(v.raw)}; } template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_f32x4_abs(v.raw)}; } // ------------------------------ Shift lanes by constant #bits // Unsigned template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i16x8_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_u16x8_shr(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i32x4_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i64x2_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_u32x4_shr(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_u64x2_shr(v.raw, kBits)}; } // Signed template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i16x8_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_i16x8_shr(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i32x4_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i64x2_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_i32x4_shr(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_i64x2_shr(v.raw, kBits)}; } // 8-bit template HWY_API Vec128 ShiftLeft(const Vec128 v) { const DFromV d8; // Use raw instead of BitCast to support N=1. const Vec128 shifted{ShiftLeft(Vec128>{v.raw}).raw}; return kBits == 1 ? (v + v) : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); } template HWY_API Vec128 ShiftRight(const Vec128 v) { const DFromV d8; // Use raw instead of BitCast to support N=1. 
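  // The shift below operates on 16-bit lanes; the subsequent mask clears the
  // upper kBits of every byte, removing the bits that crossed over from the
  // neighboring (higher) byte, so each byte shifts independently.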
const Vec128 shifted{ ShiftRight(Vec128{v.raw}).raw}; return shifted & Set(d8, 0xFF >> kBits); } template HWY_API Vec128 ShiftRight(const Vec128 v) { const DFromV di; const RebindToUnsigned du; const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); return (shifted ^ shifted_sign) - shifted_sign; } // ------------------------------ RotateRight (ShiftRight, Or) template HWY_API Vec128 RotateRight(const Vec128 v) { constexpr size_t kSizeInBits = sizeof(T) * 8; static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); if (kBits == 0) return v; return Or(ShiftRight(v), ShiftLeft(v)); } // ------------------------------ Shift lanes by same variable #bits // After https://reviews.llvm.org/D108415 shift argument became unsigned. HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") // Unsigned template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i16x8_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_u16x8_shr(v.raw, bits)}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i32x4_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_u32x4_shr(v.raw, bits)}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i64x2_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_u64x2_shr(v.raw, bits)}; } // Signed template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i16x8_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_i16x8_shr(v.raw, bits)}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i32x4_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_i32x4_shr(v.raw, bits)}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i64x2_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_i64x2_shr(v.raw, bits)}; } // 8-bit template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { const DFromV d8; // Use raw instead of BitCast to support N=1. const Vec128 shifted{ ShiftLeftSame(Vec128>{v.raw}, bits).raw}; return shifted & Set(d8, static_cast((0xFF << bits) & 0xFF)); } template HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { const DFromV d8; // Use raw instead of BitCast to support N=1. 
  const Vec128<uint8_t, N> shifted{
      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
  return shifted & Set(d8, 0xFF >> bits);
}

template <size_t N>
HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
  return (shifted ^ shifted_sign) - shifted_sign;
}

// ignore Wsign-conversion
HWY_DIAGNOSTICS(pop)

// ------------------------------ Minimum

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_u32x4_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
  // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
  const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
  const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
  const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
  const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
  alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)};
  return Vec128<uint64_t, N>{wasm_v128_load(min)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
  alignas(16) int64_t min[2];
  min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec128<int64_t, N>{wasm_v128_load(min)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) {
  // Equivalent to a < b ? a : b (taking into account our swapped arg order,
  // so that Min(NaN, x) is x to match x86).
  return Vec128<float, N>{wasm_f32x4_pmin(b.raw, a.raw)};
}

// ------------------------------ Maximum

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_u32x4_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
  // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
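  // The signed extract_lane preserves the bit pattern; casting the result
  // back to uint64_t therefore makes the scalar HWY_MAX comparison correct
  // for unsigned values.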
const uint64_t a0 = static_cast(wasm_i64x2_extract_lane(a.raw, 0)); const uint64_t b0 = static_cast(wasm_i64x2_extract_lane(b.raw, 0)); const uint64_t a1 = static_cast(wasm_i64x2_extract_lane(a.raw, 1)); const uint64_t b1 = static_cast(wasm_i64x2_extract_lane(b.raw, 1)); alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)}; return Vec128{wasm_v128_load(max)}; } // Signed template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{wasm_i8x16_max(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{wasm_i16x8_max(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_max(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { alignas(16) int64_t max[2]; max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0), wasm_i64x2_extract_lane(b.raw, 0)); max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1), wasm_i64x2_extract_lane(b.raw, 1)); return Vec128{wasm_v128_load(max)}; } // Float template HWY_API Vec128 Max(Vec128 a, Vec128 b) { // Equivalent to b < a ? a : b (taking into account our swapped arg order, // so that Max(NaN, x) is x to match x86). return Vec128{wasm_f32x4_pmax(b.raw, a.raw)}; } // ------------------------------ Integer multiplication // Unsigned template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_mul(a.raw, b.raw)}; } template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_mul(a.raw, b.raw)}; } // Signed template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_mul(a.raw, b.raw)}; } template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_mul(a.raw, b.raw)}; } // Returns the upper 16 bits of a * b in each lane. template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { const auto l = wasm_u32x4_extmul_low_u16x8(a.raw, b.raw); const auto h = wasm_u32x4_extmul_high_u16x8(a.raw, b.raw); // TODO(eustas): shift-right + narrow? return Vec128{ wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; } template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { const auto l = wasm_i32x4_extmul_low_i16x8(a.raw, b.raw); const auto h = wasm_i32x4_extmul_high_i16x8(a.raw, b.raw); // TODO(eustas): shift-right + narrow? return Vec128{ wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; } template HWY_API Vec128 MulFixedPoint15(Vec128 a, Vec128 b) { return Vec128{wasm_i16x8_q15mulr_sat(a.raw, b.raw)}; } // Multiplies even lanes (0, 2 ..) and returns the double-width result. 
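// For example (illustrative): given 32-bit lanes {a0, a1, a2, a3} and
// {b0, b1, b2, b3}, the result holds the two double-width products
// {a0*b0, a2*b2}; odd input lanes are ignored.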
template HWY_API Vec128 MulEven(const Vec128 a, const Vec128 b) { const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0); const auto ae = wasm_v128_and(a.raw, kEvenMask); const auto be = wasm_v128_and(b.raw, kEvenMask); return Vec128{wasm_i64x2_mul(ae, be)}; } template HWY_API Vec128 MulEven(const Vec128 a, const Vec128 b) { const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0); const auto ae = wasm_v128_and(a.raw, kEvenMask); const auto be = wasm_v128_and(b.raw, kEvenMask); return Vec128{wasm_i64x2_mul(ae, be)}; } // ------------------------------ Negate template HWY_API Vec128 Neg(const Vec128 v) { return Xor(v, SignBit(DFromV())); } template HWY_API Vec128 Neg(const Vec128 v) { return Vec128{wasm_i8x16_neg(v.raw)}; } template HWY_API Vec128 Neg(const Vec128 v) { return Vec128{wasm_i16x8_neg(v.raw)}; } template HWY_API Vec128 Neg(const Vec128 v) { return Vec128{wasm_i32x4_neg(v.raw)}; } template HWY_API Vec128 Neg(const Vec128 v) { return Vec128{wasm_i64x2_neg(v.raw)}; } // ------------------------------ Floating-point mul / div template HWY_API Vec128 operator*(Vec128 a, Vec128 b) { return Vec128{wasm_f32x4_mul(a.raw, b.raw)}; } template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { return Vec128{wasm_f32x4_div(a.raw, b.raw)}; } // Approximate reciprocal template HWY_API Vec128 ApproximateReciprocal(const Vec128 v) { const Vec128 one = Vec128{wasm_f32x4_splat(1.0f)}; return one / v; } // Absolute value of difference. template HWY_API Vec128 AbsDiff(const Vec128 a, const Vec128 b) { return Abs(a - b); } // ------------------------------ Floating-point multiply-add variants // Returns mul * x + add template HWY_API Vec128 MulAdd(const Vec128 mul, const Vec128 x, const Vec128 add) { return mul * x + add; } // Returns add - mul * x template HWY_API Vec128 NegMulAdd(const Vec128 mul, const Vec128 x, const Vec128 add) { return add - mul * x; } // Returns mul * x - sub template HWY_API Vec128 MulSub(const Vec128 mul, const Vec128 x, const Vec128 sub) { return mul * x - sub; } // Returns -mul * x - sub template HWY_API Vec128 NegMulSub(const Vec128 mul, const Vec128 x, const Vec128 sub) { return Neg(mul) * x - sub; } // ------------------------------ Floating-point square root // Full precision square root template HWY_API Vec128 Sqrt(const Vec128 v) { return Vec128{wasm_f32x4_sqrt(v.raw)}; } // Approximate reciprocal square root template HWY_API Vec128 ApproximateReciprocalSqrt(const Vec128 v) { // TODO(eustas): find cheaper a way to calculate this. const Vec128 one = Vec128{wasm_f32x4_splat(1.0f)}; return one / Sqrt(v); } // ------------------------------ Floating-point rounding // Toward nearest integer, ties to even template HWY_API Vec128 Round(const Vec128 v) { return Vec128{wasm_f32x4_nearest(v.raw)}; } // Toward zero, aka truncate template HWY_API Vec128 Trunc(const Vec128 v) { return Vec128{wasm_f32x4_trunc(v.raw)}; } // Toward +infinity, aka ceiling template HWY_API Vec128 Ceil(const Vec128 v) { return Vec128{wasm_f32x4_ceil(v.raw)}; } // Toward -infinity, aka floor template HWY_API Vec128 Floor(const Vec128 v) { return Vec128{wasm_f32x4_floor(v.raw)}; } // ------------------------------ Floating-point classification template HWY_API Mask128 IsNaN(const Vec128 v) { return v != v; } template HWY_API Mask128 IsInf(const Vec128 v) { const Simd d; const RebindToSigned di; const VFromD vi = BitCast(di, v); // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. 
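  // Add(vi, vi) doubles the integer representation, i.e. shifts it left by
  // one bit and discards the sign. For +/-inf the result is exactly
  // MaxExponentTimes2() (all exponent bits set, mantissa zero); NaN lanes
  // compare unequal because their shifted mantissa is nonzero, and finite
  // values have a smaller exponent.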
return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2()))); } // Returns whether normal/subnormal/zero. template HWY_API Mask128 IsFinite(const Vec128 v) { const Simd d; const RebindToUnsigned du; const RebindToSigned di; // cheaper than unsigned comparison const VFromD vu = BitCast(du, v); // 'Shift left' to clear the sign bit, then right so we can compare with the // max exponent (cannot compare with MaxExponentTimes2 directly because it is // negative and non-negative floats would be greater). const VFromD exp = BitCast(di, ShiftRight() + 1>(Add(vu, vu))); return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); } // ================================================== COMPARE // Comparisons fill a lane with 1-bits if the condition is true, else 0. template HWY_API Mask128 RebindMask(Simd /*tag*/, Mask128 m) { static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); return Mask128{m.raw}; } template HWY_API Mask128 TestBit(Vec128 v, Vec128 bit) { static_assert(!hwy::IsFloat(), "Only integer vectors supported"); return (v & bit) == bit; } // ------------------------------ Equality // Unsigned template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i8x16_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i16x8_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i32x4_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i64x2_eq(a.raw, b.raw)}; } // Signed template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i8x16_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{wasm_i16x8_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i32x4_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i64x2_eq(a.raw, b.raw)}; } // Float template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_f32x4_eq(a.raw, b.raw)}; } // ------------------------------ Inequality // Unsigned template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i8x16_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i16x8_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i32x4_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i64x2_ne(a.raw, b.raw)}; } // Signed template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i8x16_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i16x8_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i32x4_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i64x2_ne(a.raw, b.raw)}; } // Float template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_f32x4_ne(a.raw, b.raw)}; } // ------------------------------ Strict inequality template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_i8x16_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, 
const Vec128 b) { return Mask128{wasm_i16x8_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_i32x4_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_i64x2_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_u8x16_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_u16x8_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_u32x4_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { const DFromV d; const Repartition d32; const auto a32 = BitCast(d32, a); const auto b32 = BitCast(d32, b); // If the upper halves are not equal, this is the answer. const auto m_gt = a32 > b32; // Otherwise, the lower half decides. const auto m_eq = a32 == b32; const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2); const auto lo_gt = And(m_eq, MaskFromVec(VFromD{lo_in_hi})); const auto gt = Or(lo_gt, m_gt); // Copy result in upper 32 bits to lower 32 bits. return Mask128{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_f32x4_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator<(const Vec128 a, const Vec128 b) { return operator>(b, a); } // ------------------------------ Weak inequality // Float <= >= template HWY_API Mask128 operator<=(const Vec128 a, const Vec128 b) { return Mask128{wasm_f32x4_le(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(const Vec128 a, const Vec128 b) { return Mask128{wasm_f32x4_ge(a.raw, b.raw)}; } // ------------------------------ FirstN (Iota, Lt) template HWY_API Mask128 FirstN(const Simd d, size_t num) { const RebindToSigned di; // Signed comparisons may be cheaper. return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); } // ================================================== LOGICAL // ------------------------------ Not template HWY_API Vec128 Not(Vec128 v) { return Vec128{wasm_v128_not(v.raw)}; } // ------------------------------ And template HWY_API Vec128 And(Vec128 a, Vec128 b) { return Vec128{wasm_v128_and(a.raw, b.raw)}; } // ------------------------------ AndNot // Returns ~not_mask & mask. 
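// For example (illustrative): AndNot(a, b) computes b & ~a, so
// AndNot(SignBit(d), v) clears the sign bit of every lane of v.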
template HWY_API Vec128 AndNot(Vec128 not_mask, Vec128 mask) { return Vec128{wasm_v128_andnot(mask.raw, not_mask.raw)}; } // ------------------------------ Or template HWY_API Vec128 Or(Vec128 a, Vec128 b) { return Vec128{wasm_v128_or(a.raw, b.raw)}; } // ------------------------------ Xor template HWY_API Vec128 Xor(Vec128 a, Vec128 b) { return Vec128{wasm_v128_xor(a.raw, b.raw)}; } // ------------------------------ Xor3 template HWY_API Vec128 Xor3(Vec128 x1, Vec128 x2, Vec128 x3) { return Xor(x1, Xor(x2, x3)); } // ------------------------------ Or3 template HWY_API Vec128 Or3(Vec128 o1, Vec128 o2, Vec128 o3) { return Or(o1, Or(o2, o3)); } // ------------------------------ OrAnd template HWY_API Vec128 OrAnd(Vec128 o, Vec128 a1, Vec128 a2) { return Or(o, And(a1, a2)); } // ------------------------------ IfVecThenElse template HWY_API Vec128 IfVecThenElse(Vec128 mask, Vec128 yes, Vec128 no) { return IfThenElse(MaskFromVec(mask), yes, no); } // ------------------------------ Operator overloads (internal-only if float) template HWY_API Vec128 operator&(const Vec128 a, const Vec128 b) { return And(a, b); } template HWY_API Vec128 operator|(const Vec128 a, const Vec128 b) { return Or(a, b); } template HWY_API Vec128 operator^(const Vec128 a, const Vec128 b) { return Xor(a, b); } // ------------------------------ CopySign template HWY_API Vec128 CopySign(const Vec128 magn, const Vec128 sign) { static_assert(IsFloat(), "Only makes sense for floating-point"); const auto msb = SignBit(DFromV()); return Or(AndNot(msb, magn), And(msb, sign)); } template HWY_API Vec128 CopySignToAbs(const Vec128 abs, const Vec128 sign) { static_assert(IsFloat(), "Only makes sense for floating-point"); return Or(abs, And(SignBit(DFromV()), sign)); } // ------------------------------ BroadcastSignBit (compare) template HWY_API Vec128 BroadcastSignBit(const Vec128 v) { return ShiftRight(v); } template HWY_API Vec128 BroadcastSignBit(const Vec128 v) { const DFromV d; return VecFromMask(d, v < Zero(d)); } // ------------------------------ Mask // Mask and Vec are the same (true = FF..FF). template HWY_API Mask128 MaskFromVec(const Vec128 v) { return Mask128{v.raw}; } template HWY_API Vec128 VecFromMask(Simd /* tag */, Mask128 v) { return Vec128{v.raw}; } // mask ? yes : no template HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, Vec128 no) { return Vec128{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)}; } // mask ? yes : 0 template HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { return yes & VecFromMask(DFromV(), mask); } // mask ? 
0 : no template HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { return AndNot(VecFromMask(DFromV(), mask), no); } template HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, Vec128 no) { static_assert(IsSigned(), "Only works for signed/float"); const DFromV d; const RebindToSigned di; v = BitCast(d, BroadcastSignBit(BitCast(di, v))); return IfThenElse(MaskFromVec(v), yes, no); } template HWY_API Vec128 ZeroIfNegative(Vec128 v) { const DFromV d; const auto zero = Zero(d); return IfThenElse(Mask128{(v > zero).raw}, v, zero); } // ------------------------------ Mask logical template HWY_API Mask128 Not(const Mask128 m) { return MaskFromVec(Not(VecFromMask(Simd(), m))); } template HWY_API Mask128 And(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 Or(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 ExclusiveNeither(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); } // ------------------------------ Shl (BroadcastSignBit, IfThenElse) // The x86 multiply-by-Pow2() trick will not work because WASM saturates // float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a // scalar count operand, per-lane shift instructions would require extract_lane // for each lane, and hoping that shuffle is correctly mapped to a native // instruction. Using non-vector shifts would incur a store-load forwarding // stall when loading the result vector. We instead test bits of the shift // count to "predicate" a shift of the entire vector by a constant. template HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { const DFromV d; Mask128 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. test = ShiftLeft<12>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<8>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftLeft<1>(v), v); } template HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { const DFromV d; Mask128 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. 
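  // For 32-bit lanes the count occupies bits [4:0]. Shifting left by 27 moves
  // bit 4 (worth 16) into the sign bit; each subsequent ShiftLeft<1> exposes
  // the next lower bit, predicating shifts by 16, 8, 4, 2 and 1. For example,
  // a count of 5 (0b00101) applies ShiftLeft<4> and then ShiftLeft<1>.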
test = ShiftLeft<27>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<16>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<8>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftLeft<1>(v), v); } template HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { const DFromV d; alignas(16) T lanes[2]; alignas(16) T bits_lanes[2]; Store(v, d, lanes); Store(bits, d, bits_lanes); lanes[0] <<= bits_lanes[0]; lanes[1] <<= bits_lanes[1]; return Load(d, lanes); } // ------------------------------ Shr (BroadcastSignBit, IfThenElse) template HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { const DFromV d; Mask128 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. test = ShiftLeft<12>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<8>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftRight<1>(v), v); } template HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { const DFromV d; Mask128 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. test = ShiftLeft<27>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<16>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<8>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftRight<1>(v), v); } // ================================================== MEMORY // ------------------------------ Load template HWY_API Vec128 Load(Full128 /* tag */, const T* HWY_RESTRICT aligned) { return Vec128{wasm_v128_load(aligned)}; } template HWY_API Vec128 MaskedLoad(Mask128 m, Simd d, const T* HWY_RESTRICT aligned) { return IfThenElseZero(m, Load(d, aligned)); } // Partial load. template HWY_API Vec128 Load(Simd /* tag */, const T* HWY_RESTRICT p) { Vec128 v; CopyBytes(p, &v); return v; } // LoadU == Load. 
template HWY_API Vec128 LoadU(Simd d, const T* HWY_RESTRICT p) { return Load(d, p); } // 128-bit SIMD => nothing to duplicate, same as an unaligned load. template HWY_API Vec128 LoadDup128(Simd d, const T* HWY_RESTRICT p) { return Load(d, p); } // ------------------------------ Store template HWY_API void Store(Vec128 v, Full128 /* tag */, T* HWY_RESTRICT aligned) { wasm_v128_store(aligned, v.raw); } // Partial store. template HWY_API void Store(Vec128 v, Simd /* tag */, T* HWY_RESTRICT p) { CopyBytes(&v, p); } HWY_API void Store(const Vec128 v, Simd /* tag */, float* HWY_RESTRICT p) { *p = wasm_f32x4_extract_lane(v.raw, 0); } // StoreU == Store. template HWY_API void StoreU(Vec128 v, Simd d, T* HWY_RESTRICT p) { Store(v, d, p); } template HWY_API void BlendedStore(Vec128 v, Mask128 m, Simd d, T* HWY_RESTRICT p) { StoreU(IfThenElse(m, v, LoadU(d, p)), d, p); } // ------------------------------ Non-temporal stores // Same as aligned stores on non-x86. template HWY_API void Stream(Vec128 v, Simd /* tag */, T* HWY_RESTRICT aligned) { wasm_v128_store(aligned, v.raw); } // ------------------------------ Scatter (Store) template HWY_API void ScatterOffset(Vec128 v, Simd d, T* HWY_RESTRICT base, const Vec128 offset) { static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); alignas(16) T lanes[N]; Store(v, d, lanes); alignas(16) Offset offset_lanes[N]; Store(offset, Rebind(), offset_lanes); uint8_t* base_bytes = reinterpret_cast(base); for (size_t i = 0; i < N; ++i) { CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); } } template HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, const Vec128 index) { static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); alignas(16) T lanes[N]; Store(v, d, lanes); alignas(16) Index index_lanes[N]; Store(index, Rebind(), index_lanes); for (size_t i = 0; i < N; ++i) { base[index_lanes[i]] = lanes[i]; } } // ------------------------------ Gather (Load/Store) template HWY_API Vec128 GatherOffset(const Simd d, const T* HWY_RESTRICT base, const Vec128 offset) { static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); alignas(16) Offset offset_lanes[N]; Store(offset, Rebind(), offset_lanes); alignas(16) T lanes[N]; const uint8_t* base_bytes = reinterpret_cast(base); for (size_t i = 0; i < N; ++i) { CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); } return Load(d, lanes); } template HWY_API Vec128 GatherIndex(const Simd d, const T* HWY_RESTRICT base, const Vec128 index) { static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); alignas(16) Index index_lanes[N]; Store(index, Rebind(), index_lanes); alignas(16) T lanes[N]; for (size_t i = 0; i < N; ++i) { lanes[i] = base[index_lanes[i]]; } return Load(d, lanes); } // ================================================== SWIZZLE // ------------------------------ ExtractLane namespace detail { template HWY_INLINE T ExtractLane(const Vec128 v) { return static_cast(wasm_i8x16_extract_lane(v.raw, kLane)); } template HWY_INLINE T ExtractLane(const Vec128 v) { return static_cast(wasm_i16x8_extract_lane(v.raw, kLane)); } template HWY_INLINE T ExtractLane(const Vec128 v) { return static_cast(wasm_i32x4_extract_lane(v.raw, kLane)); } template HWY_INLINE T ExtractLane(const Vec128 v) { return static_cast(wasm_i64x2_extract_lane(v.raw, kLane)); } template HWY_INLINE float ExtractLane(const Vec128 v) { return wasm_f32x4_extract_lane(v.raw, kLane); } } // namespace detail // One overload per vector length just in case *_extract_lane raise compile // 
errors if their argument is out of bounds (even if that would never be // reached at runtime). template HWY_API T ExtractLane(const Vec128 v, size_t i) { HWY_DASSERT(i == 0); (void)i; return GetLane(v); } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::ExtractLane<0>(v); case 1: return detail::ExtractLane<1>(v); } } #endif alignas(16) T lanes[2]; Store(v, DFromV(), lanes); return lanes[i]; } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::ExtractLane<0>(v); case 1: return detail::ExtractLane<1>(v); case 2: return detail::ExtractLane<2>(v); case 3: return detail::ExtractLane<3>(v); } } #endif alignas(16) T lanes[4]; Store(v, DFromV(), lanes); return lanes[i]; } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::ExtractLane<0>(v); case 1: return detail::ExtractLane<1>(v); case 2: return detail::ExtractLane<2>(v); case 3: return detail::ExtractLane<3>(v); case 4: return detail::ExtractLane<4>(v); case 5: return detail::ExtractLane<5>(v); case 6: return detail::ExtractLane<6>(v); case 7: return detail::ExtractLane<7>(v); } } #endif alignas(16) T lanes[8]; Store(v, DFromV(), lanes); return lanes[i]; } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::ExtractLane<0>(v); case 1: return detail::ExtractLane<1>(v); case 2: return detail::ExtractLane<2>(v); case 3: return detail::ExtractLane<3>(v); case 4: return detail::ExtractLane<4>(v); case 5: return detail::ExtractLane<5>(v); case 6: return detail::ExtractLane<6>(v); case 7: return detail::ExtractLane<7>(v); case 8: return detail::ExtractLane<8>(v); case 9: return detail::ExtractLane<9>(v); case 10: return detail::ExtractLane<10>(v); case 11: return detail::ExtractLane<11>(v); case 12: return detail::ExtractLane<12>(v); case 13: return detail::ExtractLane<13>(v); case 14: return detail::ExtractLane<14>(v); case 15: return detail::ExtractLane<15>(v); } } #endif alignas(16) T lanes[16]; Store(v, DFromV(), lanes); return lanes[i]; } // ------------------------------ GetLane template HWY_API T GetLane(const Vec128 v) { return detail::ExtractLane<0>(v); } // ------------------------------ InsertLane namespace detail { template HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { static_assert(kLane < N, "Lane index out of bounds"); return Vec128{ wasm_i8x16_replace_lane(v.raw, kLane, static_cast(t))}; } template HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { static_assert(kLane < N, "Lane index out of bounds"); return Vec128{ wasm_i16x8_replace_lane(v.raw, kLane, static_cast(t))}; } template HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { static_assert(kLane < N, "Lane index out of bounds"); return Vec128{ wasm_i32x4_replace_lane(v.raw, kLane, static_cast(t))}; } template HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { static_assert(kLane < N, "Lane index out of bounds"); return Vec128{ wasm_i64x2_replace_lane(v.raw, kLane, static_cast(t))}; } template HWY_INLINE Vec128 InsertLane(const Vec128 v, float t) { static_assert(kLane < N, "Lane index out of bounds"); return 
Vec128{wasm_f32x4_replace_lane(v.raw, kLane, t)}; } template HWY_INLINE Vec128 InsertLane(const Vec128 v, double t) { static_assert(kLane < 2, "Lane index out of bounds"); return Vec128{wasm_f64x2_replace_lane(v.raw, kLane, t)}; } } // namespace detail // Requires one overload per vector length because InsertLane<3> may be a // compile error if it calls wasm_f64x2_replace_lane. template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { HWY_DASSERT(i == 0); (void)i; return Set(DFromV(), t); } template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::InsertLane<0>(v, t); case 1: return detail::InsertLane<1>(v, t); } } #endif const DFromV d; alignas(16) T lanes[2]; Store(v, d, lanes); lanes[i] = t; return Load(d, lanes); } template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::InsertLane<0>(v, t); case 1: return detail::InsertLane<1>(v, t); case 2: return detail::InsertLane<2>(v, t); case 3: return detail::InsertLane<3>(v, t); } } #endif const DFromV d; alignas(16) T lanes[4]; Store(v, d, lanes); lanes[i] = t; return Load(d, lanes); } template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::InsertLane<0>(v, t); case 1: return detail::InsertLane<1>(v, t); case 2: return detail::InsertLane<2>(v, t); case 3: return detail::InsertLane<3>(v, t); case 4: return detail::InsertLane<4>(v, t); case 5: return detail::InsertLane<5>(v, t); case 6: return detail::InsertLane<6>(v, t); case 7: return detail::InsertLane<7>(v, t); } } #endif const DFromV d; alignas(16) T lanes[8]; Store(v, d, lanes); lanes[i] = t; return Load(d, lanes); } template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::InsertLane<0>(v, t); case 1: return detail::InsertLane<1>(v, t); case 2: return detail::InsertLane<2>(v, t); case 3: return detail::InsertLane<3>(v, t); case 4: return detail::InsertLane<4>(v, t); case 5: return detail::InsertLane<5>(v, t); case 6: return detail::InsertLane<6>(v, t); case 7: return detail::InsertLane<7>(v, t); case 8: return detail::InsertLane<8>(v, t); case 9: return detail::InsertLane<9>(v, t); case 10: return detail::InsertLane<10>(v, t); case 11: return detail::InsertLane<11>(v, t); case 12: return detail::InsertLane<12>(v, t); case 13: return detail::InsertLane<13>(v, t); case 14: return detail::InsertLane<14>(v, t); case 15: return detail::InsertLane<15>(v, t); } } #endif const DFromV d; alignas(16) T lanes[16]; Store(v, d, lanes); lanes[i] = t; return Load(d, lanes); } // ------------------------------ LowerHalf template HWY_API Vec128 LowerHalf(Simd /* tag */, Vec128 v) { return Vec128{v.raw}; } template HWY_API Vec128 LowerHalf(Vec128 v) { return LowerHalf(Simd(), v); } // ------------------------------ ShiftLeftBytes // 0x01..0F, kBytes = 1 => 0x02..0F00 template HWY_API Vec128 ShiftLeftBytes(Simd /* tag */, Vec128 v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); const __i8x16 zero = wasm_i8x16_splat(0); switch (kBytes) { case 0: return v; case 1: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 13, 14)}; case 2: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)}; case 3: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)}; case 4: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)}; case 5: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}; case 6: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)}; case 7: return Vec128{wasm_i8x16_shuffle( v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)}; case 8: return Vec128{wasm_i8x16_shuffle( v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)}; case 9: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6)}; case 10: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5)}; case 11: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4)}; case 12: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3)}; case 13: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2)}; case 14: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1)}; case 15: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0)}; } return Vec128{zero}; } template HWY_API Vec128 ShiftLeftBytes(Vec128 v) { return ShiftLeftBytes(Simd(), v); } // ------------------------------ ShiftLeftLanes template HWY_API Vec128 ShiftLeftLanes(Simd d, const Vec128 v) { const Repartition d8; return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); } template HWY_API Vec128 ShiftLeftLanes(const Vec128 v) { return ShiftLeftLanes(DFromV(), v); } // ------------------------------ ShiftRightBytes namespace detail { // Helper function allows zeroing invalid lanes in caller. 
template HWY_API __i8x16 ShrBytes(const Vec128 v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); const __i8x16 zero = wasm_i8x16_splat(0); switch (kBytes) { case 0: return v.raw; case 1: return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); case 2: return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16); case 3: return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16); case 4: return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16); case 5: return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16); case 6: return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16); case 7: return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16); case 8: return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16); case 9: return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 10: return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 11: return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 12: return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 13: return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 14: return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 15: return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 16: return zero; } } } // namespace detail // 0x01..0F, kBytes = 1 => 0x0001..0E template HWY_API Vec128 ShiftRightBytes(Simd /* tag */, Vec128 v) { // For partial vectors, clear upper lanes so we shift in zeros. if (N != 16 / sizeof(T)) { const Vec128 vfull{v.raw}; v = Vec128{IfThenElseZero(FirstN(Full128(), N), vfull).raw}; } return Vec128{detail::ShrBytes(v)}; } // ------------------------------ ShiftRightLanes template HWY_API Vec128 ShiftRightLanes(Simd d, const Vec128 v) { const Repartition d8; return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); } // ------------------------------ UpperHalf (ShiftRightBytes) // Full input: copy hi into lo (smaller instruction encoding than shifts). 
template HWY_API Vec64 UpperHalf(Full64 /* tag */, const Vec128 v) { return Vec64{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; } HWY_API Vec64 UpperHalf(Full64 /* tag */, const Vec128 v) { return Vec64{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; } // Partial template HWY_API Vec128 UpperHalf(Half> /* tag */, Vec128 v) { const DFromV d; const RebindToUnsigned du; const auto vu = BitCast(du, v); const auto upper = BitCast(d, ShiftRightBytes(du, vu)); return Vec128{upper.raw}; } // ------------------------------ CombineShiftRightBytes template > HWY_API V CombineShiftRightBytes(Full128 /* tag */, V hi, V lo) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); switch (kBytes) { case 0: return lo; case 1: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)}; case 2: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)}; case 3: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)}; case 4: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)}; case 5: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)}; case 6: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21)}; case 7: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22)}; case 8: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23)}; case 9: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24)}; case 10: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)}; case 11: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)}; case 12: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27)}; case 13: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28)}; case 14: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29)}; case 15: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30)}; } return hi; } template > HWY_API V CombineShiftRightBytes(Simd d, V hi, V lo) { constexpr size_t kSize = N * sizeof(T); static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); const Repartition d8; const Full128 d_full8; using V8 = VFromD; const V8 hi8{BitCast(d8, hi).raw}; // Move into most-significant bytes const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw}); const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8); return V{BitCast(Full128(), r).raw}; } // ------------------------------ Broadcast/splat any lane template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)}; } template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; } template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return 
Vec128{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)}; } // ------------------------------ TableLookupBytes // Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e. // lane indices in [0, 16). template HWY_API Vec128 TableLookupBytes(const Vec128 bytes, const Vec128 from) { // Not yet available in all engines, see // https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md // V8 implementation of this had a bug, fixed on 2021-04-03: // https://chromium-review.googlesource.com/c/v8/v8/+/2822951 #if 0 return Vec128{wasm_i8x16_swizzle(bytes.raw, from.raw)}; #else alignas(16) uint8_t control[16]; alignas(16) uint8_t input[16]; alignas(16) uint8_t output[16]; wasm_v128_store(control, from.raw); wasm_v128_store(input, bytes.raw); for (size_t i = 0; i < 16; ++i) { output[i] = control[i] < 16 ? input[control[i]] : 0; } return Vec128{wasm_v128_load(output)}; #endif } template HWY_API Vec128 TableLookupBytesOr0(const Vec128 bytes, const Vec128 from) { const Simd d; // Mask size must match vector type, so cast everything to this type. Repartition di8; Repartition> d_bytes8; const auto msb = BitCast(di8, from) < Zero(di8); const auto lookup = TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from)); return BitCast(d, IfThenZeroElse(msb, lookup)); } // ------------------------------ Hard-coded shuffles // Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). // Shuffle0321 rotates one lane to the right (the previous least-significant // lane is now most-significant). These could also be implemented via // CombineShiftRightBytes but the shuffle_abcd notation is more convenient. // Swap 32-bit halves in 64-bit halves. template HWY_API Vec128 Shuffle2301(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; } // These are used by generic_ops-inl to implement LoadInterleaved3. 
namespace detail { template HWY_API Vec128 Shuffle2301(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 1, 0, 3 + 16, 2 + 16, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; } template HWY_API Vec128 Shuffle2301(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i16x8_shuffle(a.raw, b.raw, 1, 0, 3 + 8, 2 + 8, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; } template HWY_API Vec128 Shuffle2301(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)}; } template HWY_API Vec128 Shuffle1230(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 0, 3, 2 + 16, 1 + 16, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; } template HWY_API Vec128 Shuffle1230(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i16x8_shuffle(a.raw, b.raw, 0, 3, 2 + 8, 1 + 8, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; } template HWY_API Vec128 Shuffle1230(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)}; } template HWY_API Vec128 Shuffle3012(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 2, 1, 0 + 16, 3 + 16, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; } template HWY_API Vec128 Shuffle3012(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i16x8_shuffle(a.raw, b.raw, 2, 1, 0 + 8, 3 + 8, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; } template HWY_API Vec128 Shuffle3012(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)}; } } // namespace detail // Swap 64-bit halves template HWY_API Vec128 Shuffle01(const Vec128 v) { static_assert(sizeof(T) == 8, "Only for 64-bit lanes"); return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; } template HWY_API Vec128 Shuffle1032(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; } // Rotate right 32 bits template HWY_API Vec128 Shuffle0321(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; } // Rotate left 32 bits template HWY_API Vec128 Shuffle2103(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; } // Reverse template HWY_API Vec128 Shuffle0123(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; } // ------------------------------ TableLookupLanes // Returned by SetTableIndices for use by TableLookupLanes. 
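// A minimal usage sketch (illustrative only; the tag type and index values
// are example assumptions, not part of this header):
//   const Full128<int32_t> d;
//   alignas(16) constexpr int32_t kIdx[4] = {3, 1, 2, 0};
//   const auto permuted = TableLookupLanes(v, SetTableIndices(d, kIdx));
// Indices must be in [0, N); debug builds verify this in IndicesFromVec.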
template struct Indices128 { __v128_u raw; }; template HWY_API Indices128 IndicesFromVec(Simd d, Vec128 vec) { static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); #if HWY_IS_DEBUG_BUILD const Rebind di; HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && AllTrue(di, Lt(vec, Set(di, static_cast(N))))); #endif const Repartition d8; using V8 = VFromD; const Repartition d16; // Broadcast each lane index to all bytes of T and shift to bytes static_assert(sizeof(T) == 4 || sizeof(T) == 8, ""); if (sizeof(T) == 4) { alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = { 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; const V8 lane_indices = TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes)); const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices))); alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; return Indices128{Add(byte_indices, Load(d8, kByteOffsets)).raw}; } else { alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; const V8 lane_indices = TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes)); const V8 byte_indices = BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices))); alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}; return Indices128{Add(byte_indices, Load(d8, kByteOffsets)).raw}; } } template HWY_API Indices128 SetTableIndices(Simd d, const TI* idx) { const Rebind di; return IndicesFromVec(d, LoadU(di, idx)); } template HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { using TI = MakeSigned; const DFromV d; const Rebind di; return BitCast(d, TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); } // ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01) // Single lane: no change template HWY_API Vec128 Reverse(Simd /* tag */, const Vec128 v) { return v; } // Two lanes: shuffle template HWY_API Vec128 Reverse(Simd /* tag */, const Vec128 v) { return Vec128{Shuffle2301(Vec128{v.raw}).raw}; } template HWY_API Vec128 Reverse(Full128 /* tag */, const Vec128 v) { return Shuffle01(v); } // Four lanes: shuffle template HWY_API Vec128 Reverse(Full128 /* tag */, const Vec128 v) { return Shuffle0123(v); } // 16-bit template HWY_API Vec128 Reverse(Simd d, const Vec128 v) { const RepartitionToWide> du32; return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v)))); } // ------------------------------ Reverse2 template HWY_API Vec128 Reverse2(Simd d, const Vec128 v) { const RepartitionToWide> du32; return BitCast(d, RotateRight<16>(BitCast(du32, v))); } template HWY_API Vec128 Reverse2(Simd /* tag */, const Vec128 v) { return Shuffle2301(v); } template HWY_API Vec128 Reverse2(Simd /* tag */, const Vec128 v) { return Shuffle01(v); } // ------------------------------ Reverse4 template HWY_API Vec128 Reverse4(Simd d, const Vec128 v) { return BitCast(d, Vec128{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2, 1, 0, 7, 6, 5, 4)}); } template HWY_API Vec128 Reverse4(Simd /* tag */, const Vec128 v) { return Shuffle0123(v); } template HWY_API Vec128 Reverse4(Simd /* tag */, const Vec128) { HWY_ASSERT(0); // don't have 8 u64 lanes } // ------------------------------ Reverse8 template HWY_API Vec128 Reverse8(Simd d, const Vec128 v) { return Reverse(d, v); } template HWY_API Vec128 Reverse8(Simd, const Vec128) { HWY_ASSERT(0); // don't have 8 lanes unless 16-bit } // ------------------------------ InterleaveLower template HWY_API Vec128 InterleaveLower(Vec128 a, 
Vec128 b) { return Vec128{wasm_i8x16_shuffle( a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i8x16_shuffle( a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; } // Additional overload for the optional tag. template HWY_API V InterleaveLower(DFromV /* tag */, V a, V b) { return InterleaveLower(a, b); } // ------------------------------ InterleaveUpper (UpperHalf) // All functions inside detail lack the required D parameter. namespace detail { template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; } } // namespace detail // Full template > HWY_API V InterleaveUpper(Full128 /* tag */, V a, V b) { return detail::InterleaveUpper(a, b); } // Partial template > HWY_API V InterleaveUpper(Simd d, V a, V b) { const Half d2; return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw}); } // ------------------------------ ZipLower/ZipUpper (InterleaveLower) // Same as Interleave*, except that the return lanes are double-width integers; // this is necessary because the single-lane scalar cannot return two values. 
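// For example (descriptive only): zipping u8 vectors a = {a0, a1, ...} and
// b = {b0, b1, ...} yields u16 lanes whose values are (b[i] << 8) | a[i],
// because InterleaveLower places a[i] in the lower (little-endian) byte.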
template >> HWY_API VFromD ZipLower(V a, V b) { return BitCast(DW(), InterleaveLower(a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipLower(DW dw, V a, V b) { return BitCast(dw, InterleaveLower(D(), a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipUpper(DW dw, V a, V b) { return BitCast(dw, InterleaveUpper(D(), a, b)); } // ================================================== COMBINE // ------------------------------ Combine (InterleaveLower) // N = N/2 + N/2 (upper half undefined) template HWY_API Vec128 Combine(Simd d, Vec128 hi_half, Vec128 lo_half) { const Half d2; const RebindToUnsigned du2; // Treat half-width input as one lane, and expand to two lanes. using VU = Vec128, 2>; const VU lo{BitCast(du2, lo_half).raw}; const VU hi{BitCast(du2, hi_half).raw}; return BitCast(d, InterleaveLower(lo, hi)); } // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero) template HWY_API Vec128 ZeroExtendVector(Simd d, Vec128 lo) { return IfThenElseZero(FirstN(d, N / 2), Vec128{lo.raw}); } // ------------------------------ ConcatLowerLower // hiH,hiL loH,loL |-> hiL,loL (= lower halves) template HWY_API Vec128 ConcatLowerLower(Full128 /* tag */, const Vec128 hi, const Vec128 lo) { return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)}; } template HWY_API Vec128 ConcatLowerLower(Simd d, const Vec128 hi, const Vec128 lo) { const Half d2; return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); } // ------------------------------ ConcatUpperUpper template HWY_API Vec128 ConcatUpperUpper(Full128 /* tag */, const Vec128 hi, const Vec128 lo) { return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)}; } template HWY_API Vec128 ConcatUpperUpper(Simd d, const Vec128 hi, const Vec128 lo) { const Half d2; return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); } // ------------------------------ ConcatLowerUpper template HWY_API Vec128 ConcatLowerUpper(Full128 d, const Vec128 hi, const Vec128 lo) { return CombineShiftRightBytes<8>(d, hi, lo); } template HWY_API Vec128 ConcatLowerUpper(Simd d, const Vec128 hi, const Vec128 lo) { const Half d2; return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); } // ------------------------------ ConcatUpperLower template HWY_API Vec128 ConcatUpperLower(Simd d, const Vec128 hi, const Vec128 lo) { return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi); } // ------------------------------ ConcatOdd // 8-bit full template HWY_API Vec128 ConcatOdd(Full128 /* tag */, Vec128 hi, Vec128 lo) { return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31)}; } // 8-bit x8 template HWY_API Vec128 ConcatOdd(Simd /* tag */, Vec128 hi, Vec128 lo) { // Don't care about upper half. return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 17, 19, 21, 23, 1, 3, 5, 7, 17, 19, 21, 23)}; } // 8-bit x4 template HWY_API Vec128 ConcatOdd(Simd /* tag */, Vec128 hi, Vec128 lo) { // Don't care about upper 3/4. return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17, 19, 1, 3, 17, 19, 1, 3, 17, 19)}; } // 16-bit full template HWY_API Vec128 ConcatOdd(Full128 /* tag */, Vec128 hi, Vec128 lo) { return Vec128{ wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)}; } // 16-bit x4 template HWY_API Vec128 ConcatOdd(Simd /* tag */, Vec128 hi, Vec128 lo) { // Don't care about upper half. 
return Vec128{ wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)}; } // 32-bit full template HWY_API Vec128 ConcatOdd(Full128 /* tag */, Vec128 hi, Vec128 lo) { return Vec128{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)}; } // Any T x2 template HWY_API Vec128 ConcatOdd(Simd d, Vec128 hi, Vec128 lo) { return InterleaveUpper(d, lo, hi); } // ------------------------------ ConcatEven (InterleaveLower) // 8-bit full template HWY_API Vec128 ConcatEven(Full128 /* tag */, Vec128 hi, Vec128 lo) { return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30)}; } // 8-bit x8 template HWY_API Vec128 ConcatEven(Simd /* tag */, Vec128 hi, Vec128 lo) { // Don't care about upper half. return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 16, 18, 20, 22, 0, 2, 4, 6, 16, 18, 20, 22)}; } // 8-bit x4 template HWY_API Vec128 ConcatEven(Simd /* tag */, Vec128 hi, Vec128 lo) { // Don't care about upper 3/4. return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16, 18, 0, 2, 16, 18, 0, 2, 16, 18)}; } // 16-bit full template HWY_API Vec128 ConcatEven(Full128 /* tag */, Vec128 hi, Vec128 lo) { return Vec128{ wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)}; } // 16-bit x4 template HWY_API Vec128 ConcatEven(Simd /* tag */, Vec128 hi, Vec128 lo) { // Don't care about upper half. return Vec128{ wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)}; } // 32-bit full template HWY_API Vec128 ConcatEven(Full128 /* tag */, Vec128 hi, Vec128 lo) { return Vec128{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)}; } // Any T x2 template HWY_API Vec128 ConcatEven(Simd d, Vec128 hi, Vec128 lo) { return InterleaveLower(d, lo, hi); } // ------------------------------ DupEven (InterleaveLower) template HWY_API Vec128 DupEven(Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)}; } template HWY_API Vec128 DupEven(const Vec128 v) { return InterleaveLower(DFromV(), v, v); } // ------------------------------ DupOdd (InterleaveUpper) template HWY_API Vec128 DupOdd(Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)}; } template HWY_API Vec128 DupOdd(const Vec128 v) { return InterleaveUpper(DFromV(), v, v); } // ------------------------------ OddEven namespace detail { template HWY_INLINE Vec128 OddEven(hwy::SizeTag<1> /* tag */, const Vec128 a, const Vec128 b) { const DFromV d; const Repartition d8; alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); } template HWY_INLINE Vec128 OddEven(hwy::SizeTag<2> /* tag */, const Vec128 a, const Vec128 b) { return Vec128{ wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)}; } template HWY_INLINE Vec128 OddEven(hwy::SizeTag<4> /* tag */, const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; } template HWY_INLINE Vec128 OddEven(hwy::SizeTag<8> /* tag */, const Vec128 a, const Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)}; } } // namespace detail template HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { return detail::OddEven(hwy::SizeTag(), a, b); } template HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; } // ------------------------------ OddEvenBlocks template HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) { return even; } // ------------------------------ SwapAdjacentBlocks 
template HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) { return v; } // ------------------------------ ReverseBlocks // Single block: no change template HWY_API Vec128 ReverseBlocks(Full128 /* tag */, const Vec128 v) { return v; } // ================================================== CONVERT // ------------------------------ Promotions (part w/ narrow lanes -> full) // Unsigned: zero-extend. template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_u16x8_extend_low_u8x16(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_u16x8_extend_low_u8x16(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_u32x4_extend_low_u16x8(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_u64x2_extend_low_u32x4(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_u32x4_extend_low_u16x8(v.raw)}; } // Signed: replicate sign bit. template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_i16x8_extend_low_i8x16(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{ wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_i32x4_extend_low_i16x8(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_i64x2_extend_low_i32x4(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_f64x2_convert_low_i32x4(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd df32, const Vec128 v) { const RebindToSigned di32; const RebindToUnsigned du32; // Expand to u32 so we can shift. 
const auto bits16 = PromoteTo(du32, Vec128{v.raw}); const auto sign = ShiftRight<15>(bits16); const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F); const auto mantissa = bits16 & Set(du32, 0x3FF); const auto subnormal = BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) * Set(df32, 1.0f / 16384 / 1024)); const auto biased_exp32 = biased_exp + Set(du32, 127 - 15); const auto mantissa32 = ShiftLeft<23 - 10>(mantissa); const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32; const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal); return BitCast(df32, ShiftLeft<31>(sign) | bits32); } template HWY_API Vec128 PromoteTo(Simd df32, const Vec128 v) { const Rebind du16; const RebindToSigned di32; return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); } // ------------------------------ Demotions (full -> part w/ narrow lanes) template HWY_API Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_u16x8_narrow_i32x4(v.raw, v.raw)}; } template HWY_API Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_i16x8_narrow_i32x4(v.raw, v.raw)}; } template HWY_API Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); return Vec128{ wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; } template HWY_API Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_u8x16_narrow_i16x8(v.raw, v.raw)}; } template HWY_API Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); return Vec128{wasm_i8x16_narrow_i16x8(intermediate, intermediate)}; } template HWY_API Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_i8x16_narrow_i16x8(v.raw, v.raw)}; } template HWY_API Vec128 DemoteTo(Simd /* di */, const Vec128 v) { return Vec128{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)}; } template HWY_API Vec128 DemoteTo(Simd df16, const Vec128 v) { const RebindToUnsigned du16; const Rebind du; const RebindToSigned di; const auto bits32 = BitCast(du, v); const auto sign = ShiftRight<31>(bits32); const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF); const auto mantissa32 = bits32 & Set(du, 0x7FFFFF); const auto k15 = Set(di, 15); const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15); const auto is_tiny = exp < Set(di, -24); const auto is_subnormal = exp < Set(di, -14); const auto biased_exp16 = BitCast(du, IfThenZeroElse(is_subnormal, exp + k15)); const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11) const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) + (mantissa32 >> (Set(du, 13) + sub_exp)); const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m, ShiftRight<13>(mantissa32)); // <1024 const auto sign16 = ShiftLeft<15>(sign); const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16; const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16)); return Vec128{DemoteTo(du16, bits16).raw}; } template HWY_API Vec128 DemoteTo(Simd dbf16, const Vec128 v) { const Rebind di32; const Rebind du32; // for logical shift right const Rebind du16; const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); return BitCast(dbf16, DemoteTo(du16, bits_in_32)); } template HWY_API Vec128 ReorderDemote2To( Simd dbf16, Vec128 a, Vec128 b) { const RebindToUnsigned du16; const Repartition du32; const Vec128 b_in_even = ShiftRight<16>(BitCast(du32, b)); const auto u16 = OddEven(BitCast(du16, a), BitCast(du16, b_in_even)); return 
BitCast(dbf16, u16); } // Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes // above 2*N. HWY_API Vec128 ReorderDemote2To(Simd dn, Vec128 a, Vec128 b) { const Half dnh; // Pretend the result has twice as many lanes so we can InterleaveLower. const Vec128 an{DemoteTo(dnh, a).raw}; const Vec128 bn{DemoteTo(dnh, b).raw}; return InterleaveLower(an, bn); } HWY_API Vec128 ReorderDemote2To(Simd dn, Vec128 a, Vec128 b) { const Half dnh; // Pretend the result has twice as many lanes so we can InterleaveLower. const Vec128 an{DemoteTo(dnh, a).raw}; const Vec128 bn{DemoteTo(dnh, b).raw}; return InterleaveLower(an, bn); } HWY_API Vec128 ReorderDemote2To(Full128 /*d16*/, Vec128 a, Vec128 b) { return Vec128{wasm_i16x8_narrow_i32x4(a.raw, b.raw)}; } // For already range-limited input [0, 255]. template HWY_API Vec128 U8FromU32(const Vec128 v) { const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); return Vec128{ wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; } // ------------------------------ Truncations template * = nullptr> HWY_API Vec128 TruncateTo(Simd /* tag */, const Vec128 v) { const Repartition> d; const auto v1 = BitCast(d, v); return Vec128{v1.raw}; } HWY_API Vec16 TruncateTo(Full16 /* tag */, const Vec128 v) { const Full128 d; const auto v1 = BitCast(d, v); const auto v2 = ConcatEven(d, v1, v1); const auto v4 = ConcatEven(d, v2, v2); return LowerHalf(LowerHalf(LowerHalf(ConcatEven(d, v4, v4)))); } HWY_API Vec32 TruncateTo(Full32 /* tag */, const Vec128 v) { const Full128 d; const auto v1 = BitCast(d, v); const auto v2 = ConcatEven(d, v1, v1); return LowerHalf(LowerHalf(ConcatEven(d, v2, v2))); } HWY_API Vec64 TruncateTo(Full64 /* tag */, const Vec128 v) { const Full128 d; const auto v1 = BitCast(d, v); return LowerHalf(ConcatEven(d, v1, v1)); } template = 2>* = nullptr> HWY_API Vec128 TruncateTo(Simd /* tag */, const Vec128 v) { const Full128 d; const auto v1 = Vec128{v.raw}; const auto v2 = ConcatEven(d, v1, v1); const auto v3 = ConcatEven(d, v2, v2); return Vec128{v3.raw}; } template = 2>* = nullptr> HWY_API Vec128 TruncateTo(Simd /* tag */, const Vec128 v) { const Full128 d; const auto v1 = Vec128{v.raw}; const auto v2 = ConcatEven(d, v1, v1); return Vec128{v2.raw}; } template = 2>* = nullptr> HWY_API Vec128 TruncateTo(Simd /* tag */, const Vec128 v) { const Full128 d; const auto v1 = Vec128{v.raw}; const auto v2 = ConcatEven(d, v1, v1); return Vec128{v2.raw}; } // ------------------------------ Convert i32 <=> f32 (Round) template HWY_API Vec128 ConvertTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_f32x4_convert_i32x4(v.raw)}; } template HWY_API Vec128 ConvertTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_f32x4_convert_u32x4(v.raw)}; } // Truncates (rounds toward zero). 
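// Out-of-range values saturate to the i32 limits and NaN becomes 0, per the
// WASM i32x4.trunc_sat_f32x4_s semantics.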
template HWY_API Vec128 ConvertTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_i32x4_trunc_sat_f32x4(v.raw)}; } template HWY_API Vec128 NearestInt(const Vec128 v) { return ConvertTo(Simd(), Round(v)); } // ================================================== MISC // ------------------------------ SumsOf8 (ShiftRight, Add) template HWY_API Vec128 SumsOf8(const Vec128 v) { const DFromV du8; const RepartitionToWide du16; const RepartitionToWide du32; const RepartitionToWide du64; using VU16 = VFromD; const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v)); const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF)); const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420); const VU16 szz_FE_zz_BA_zz_76_zz_32 = BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10))); const VU16 sxx_FC_xx_B8_xx_74_xx_30 = Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32); const VU16 szz_zz_xx_FC_zz_zz_xx_74 = BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30))); const VU16 sxx_xx_xx_F8_xx_xx_xx_70 = Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74); return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF)); } // ------------------------------ LoadMaskBits (TestBit) namespace detail { template HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t bits) { const RebindToUnsigned du; // Easier than Set(), which would require an >8-bit type, which would not // compile for T=uint8_t, N=1. const Vec128 vbits{wasm_i32x4_splat(static_cast(bits))}; // Replicate bytes 8x such that each byte contains the bit that governs it. alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}; const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8)); alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128}; return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); } template HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t bits) { const RebindToUnsigned du; alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; return RebindMask( d, TestBit(Set(du, static_cast(bits)), Load(du, kBit))); } template HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t bits) { const RebindToUnsigned du; alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8}; return RebindMask( d, TestBit(Set(du, static_cast(bits)), Load(du, kBit))); } template HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t bits) { const RebindToUnsigned du; alignas(16) constexpr uint64_t kBit[8] = {1, 2}; return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit))); } } // namespace detail // `p` points to at least 8 readable bytes, not all of which need be valid. 
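// Layout note (implied by kBit above, not additional API): bit i of the first
// byte controls lane i, with the LSB mapping to lane 0; for example,
// bits[0] = 5 (0b101) activates lanes 0 and 2.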
template HWY_API Mask128 LoadMaskBits(Simd d, const uint8_t* HWY_RESTRICT bits) { uint64_t mask_bits = 0; CopyBytes<(N + 7) / 8>(bits, &mask_bits); return detail::LoadMaskBits(d, mask_bits); } // ------------------------------ Mask namespace detail { // Full template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, const Mask128 mask) { alignas(16) uint64_t lanes[2]; wasm_v128_store(lanes, mask.raw); constexpr uint64_t kMagic = 0x103070F1F3F80ULL; const uint64_t lo = ((lanes[0] * kMagic) >> 56); const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00; return (hi + lo); } // 64-bit template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, const Mask128 mask) { constexpr uint64_t kMagic = 0x103070F1F3F80ULL; return (static_cast(wasm_i64x2_extract_lane(mask.raw, 0)) * kMagic) >> 56; } // 32-bit or less: need masking template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, const Mask128 mask) { uint64_t bytes = static_cast(wasm_i64x2_extract_lane(mask.raw, 0)); // Clear potentially undefined bytes. bytes &= (1ULL << (N * 8)) - 1; constexpr uint64_t kMagic = 0x103070F1F3F80ULL; return (bytes * kMagic) >> 56; } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, const Mask128 mask) { // Remove useless lower half of each u16 while preserving the sign bit. const __i16x8 zero = wasm_i16x8_splat(0); const Mask128 mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)}; return BitsFromMask(hwy::SizeTag<1>(), mask8); } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, const Mask128 mask) { const __i32x4 mask_i = static_cast<__i32x4>(mask.raw); const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8); const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice); alignas(16) uint32_t lanes[4]; wasm_v128_store(lanes, sliced_mask); return lanes[0] | lanes[1] | lanes[2] | lanes[3]; } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, const Mask128 mask) { const __i64x2 mask_i = static_cast<__i64x2>(mask.raw); const __i64x2 slice = wasm_i64x2_make(1, 2); const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice); alignas(16) uint64_t lanes[2]; wasm_v128_store(lanes, sliced_mask); return lanes[0] | lanes[1]; } // Returns the lowest N bits for the BitsFromMask result. template constexpr uint64_t OnlyActive(uint64_t bits) { return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1); } // Returns 0xFF for bytes with index >= N, otherwise 0. template constexpr __i8x16 BytesAbove() { return /**/ (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1) : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1) : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1) : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1) : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0) : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1) : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1) : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1) : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1) : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1) : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1) : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1) : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1) : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1) : (N == 11) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1) : (N == 13) ? 
wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1) : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1); } template HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); } template HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128 m) { return PopCount(BitsFromMask(tag, m)); } template HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128 m) { return PopCount(BitsFromMask(tag, m)); } template HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8); const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift); alignas(16) uint64_t lanes[2]; wasm_v128_store(lanes, shifted_bits); return PopCount(lanes[0] | lanes[1]); } template HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128 m) { alignas(16) int64_t lanes[2]; wasm_v128_store(lanes, m.raw); return static_cast(-(lanes[0] + lanes[1])); } } // namespace detail // `p` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(const Simd /* tag */, const Mask128 mask, uint8_t* bits) { const uint64_t mask_bits = detail::BitsFromMask(mask); const size_t kNumBytes = (N + 7) / 8; CopyBytes(&mask_bits, bits); return kNumBytes; } template HWY_API size_t CountTrue(const Simd /* tag */, const Mask128 m) { return detail::CountTrue(hwy::SizeTag(), m); } // Partial vector template HWY_API size_t CountTrue(const Simd d, const Mask128 m) { // Ensure all undefined bytes are 0. const Mask128 mask{detail::BytesAbove()}; return CountTrue(d, Mask128{AndNot(mask, m).raw}); } // Full vector template HWY_API bool AllFalse(const Full128 d, const Mask128 m) { #if 0 // Casting followed by wasm_i8x16_any_true results in wasm error: // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128 const auto v8 = BitCast(Full128(), VecFromMask(d, m)); return !wasm_i8x16_any_true(v8.raw); #else (void)d; return (wasm_i64x2_extract_lane(m.raw, 0) | wasm_i64x2_extract_lane(m.raw, 1)) == 0; #endif } // Full vector namespace detail { template HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128 m) { return wasm_i8x16_all_true(m.raw); } template HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128 m) { return wasm_i16x8_all_true(m.raw); } template HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { return wasm_i32x4_all_true(m.raw); } template HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask128 m) { return wasm_i64x2_all_true(m.raw); } } // namespace detail template HWY_API bool AllTrue(const Simd /* tag */, const Mask128 m) { return detail::AllTrue(hwy::SizeTag(), m); } // Partial vectors template HWY_API bool AllFalse(Simd /* tag */, const Mask128 m) { // Ensure all undefined bytes are 0. const Mask128 mask{detail::BytesAbove()}; return AllFalse(Full128(), Mask128{AndNot(mask, m).raw}); } template HWY_API bool AllTrue(const Simd /* d */, const Mask128 m) { // Ensure all undefined bytes are FF. const Mask128 mask{detail::BytesAbove()}; return AllTrue(Full128(), Mask128{Or(mask, m).raw}); } template HWY_API size_t FindKnownFirstTrue(const Simd /* tag */, const Mask128 mask) { const uint64_t bits = detail::BitsFromMask(mask); return Num0BitsBelowLS1Bit_Nonzero64(bits); } template HWY_API intptr_t FindFirstTrue(const Simd /* tag */, const Mask128 mask) { const uint64_t bits = detail::BitsFromMask(mask); return bits ? 
static_cast(Num0BitsBelowLS1Bit_Nonzero64(bits)) : -1; } // ------------------------------ Compress namespace detail { template HWY_INLINE Vec128 IdxFromBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 256); const Simd d; const Rebind d8; const Simd du; // We need byte indices for TableLookupBytes (one vector's worth for each of // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We // can instead store lane indices and convert to byte indices (2*lane + 0..1), // with the doubling baked into the table. Unpacking nibbles is likely more // costly than the higher cache footprint from storing bytes. alignas(16) constexpr uint8_t table[256 * 8] = { // PrintCompress16x8Tables 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 
14, // 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // 2, 4, 6, 12, 
14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; const Vec128 pairs = ZipLower(byte_idx, byte_idx); return BitCast(d, pairs + Set(du, 0x0100)); } template HWY_INLINE Vec128 IdxFromNotBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 256); const Simd d; const Rebind d8; const Simd du; // We need byte indices for TableLookupBytes (one vector's worth for each of // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We // can instead store lane indices and convert to byte indices (2*lane + 0..1), // with the doubling baked into the table. Unpacking nibbles is likely more // costly than the higher cache footprint from storing bytes. 
alignas(16) constexpr uint8_t table[256 * 8] = { // PrintCompressNot16x8Tables 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // 0, 
2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // 0, 4, 6, 8, 2, 10, 12, 
14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; const Vec128 pairs = ZipLower(byte_idx, byte_idx); return BitCast(d, pairs + Set(du, 0x0100)); } template HWY_INLINE Vec128 IdxFromBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 16); // There are only 4 lanes, so we can afford to load the index vector directly. alignas(16) constexpr uint8_t u8_indices[16 * 16] = { // PrintCompress32x4Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Simd d; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_INLINE Vec128 IdxFromNotBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 16); // There are only 4 lanes, so we can afford to load the index vector directly. 
alignas(16) constexpr uint8_t u8_indices[16 * 16] = { // PrintCompressNot32x4Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Simd d; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_INLINE Vec128 IdxFromBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 4); // There are only 2 lanes, so we can afford to load the index vector directly. alignas(16) constexpr uint8_t u8_indices[4 * 16] = { // PrintCompress64x2Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Simd d; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_INLINE Vec128 IdxFromNotBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 4); // There are only 2 lanes, so we can afford to load the index vector directly. alignas(16) constexpr uint8_t u8_indices[4 * 16] = { // PrintCompressNot64x2Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Simd d; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } // Helper functions called by both Compress and CompressStore - avoids a // redundant BitsFromMask in the latter. template HWY_INLINE Vec128 Compress(Vec128 v, const uint64_t mask_bits) { const auto idx = detail::IdxFromBits(mask_bits); const DFromV d; const RebindToSigned di; return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx))); } template HWY_INLINE Vec128 CompressNot(Vec128 v, const uint64_t mask_bits) { const auto idx = detail::IdxFromNotBits(mask_bits); const DFromV d; const RebindToSigned di; return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx))); } } // namespace detail template struct CompressIsPartition { #if HWY_TARGET == HWY_WASM_EMU256 enum { value = 0 }; #else enum { value = (sizeof(T) != 1) }; #endif }; // Single lane: no-op template HWY_API Vec128 Compress(Vec128 v, Mask128 /*m*/) { return v; } // Two lanes: conditional swap template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. 
const Full128 d; const Vec128 m = VecFromMask(d, mask); const Vec128 maskL = DupEven(m); const Vec128 maskH = DupOdd(m); const Vec128 swap = AndNot(maskL, maskH); return IfVecThenElse(swap, Shuffle01(v), v); } // General case, 2 or 4 byte lanes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { return detail::Compress(v, detail::BitsFromMask(mask)); } // Single lane: no-op template HWY_API Vec128 CompressNot(Vec128 v, Mask128 /*m*/) { return v; } // Two lanes: conditional swap template HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. const Full128 d; const Vec128 m = VecFromMask(d, mask); const Vec128 maskL = DupEven(m); const Vec128 maskH = DupOdd(m); const Vec128 swap = AndNot(maskH, maskL); return IfVecThenElse(swap, Shuffle01(v), v); } // General case, 2 or 4 byte lanes template HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. if (N < 16 / sizeof(T)) { return detail::Compress(v, detail::BitsFromMask(Not(mask))); } return detail::CompressNot(v, detail::BitsFromMask(mask)); } // ------------------------------ CompressBlocksNot HWY_API Vec128 CompressBlocksNot(Vec128 v, Mask128 /* m */) { return v; } // ------------------------------ CompressBits template HWY_API Vec128 CompressBits(Vec128 v, const uint8_t* HWY_RESTRICT bits) { uint64_t mask_bits = 0; constexpr size_t kNumBytes = (N + 7) / 8; CopyBytes(bits, &mask_bits); if (N < 8) { mask_bits &= (1ull << N) - 1; } return detail::Compress(v, mask_bits); } // ------------------------------ CompressStore template HWY_API size_t CompressStore(Vec128 v, const Mask128 mask, Simd d, T* HWY_RESTRICT unaligned) { const uint64_t mask_bits = detail::BitsFromMask(mask); const auto c = detail::Compress(v, mask_bits); StoreU(c, d, unaligned); return PopCount(mask_bits); } // ------------------------------ CompressBlendedStore template HWY_API size_t CompressBlendedStore(Vec128 v, Mask128 m, Simd d, T* HWY_RESTRICT unaligned) { const RebindToUnsigned du; // so we can support fp16/bf16 using TU = TFromD; const uint64_t mask_bits = detail::BitsFromMask(m); const size_t count = PopCount(mask_bits); const Vec128 compressed = detail::Compress(BitCast(du, v), mask_bits); const Mask128 store_mask = RebindMask(d, FirstN(du, count)); BlendedStore(BitCast(d, compressed), store_mask, d, unaligned); return count; } // ------------------------------ CompressBitsStore template HWY_API size_t CompressBitsStore(Vec128 v, const uint8_t* HWY_RESTRICT bits, Simd d, T* HWY_RESTRICT unaligned) { uint64_t mask_bits = 0; constexpr size_t kNumBytes = (N + 7) / 8; CopyBytes(bits, &mask_bits); if (N < 8) { mask_bits &= (1ull << N) - 1; } const auto c = detail::Compress(v, mask_bits); StoreU(c, d, unaligned); return PopCount(mask_bits); } // ------------------------------ StoreInterleaved2/3/4 // HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in // generic_ops-inl.h. 
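// Illustrative usage (a sketch, not part of this header's API; shown as a
// comment so the header exports no extra symbols): the Compress* functions
// above left-pack the lanes selected by a mask. A hypothetical filter loop
// could use CompressStore to copy only positive lanes to the front of an
// output buffer. The name CopyPositive is invented for illustration, and the
// loop assumes `count` is a multiple of Lanes(d); a real implementation would
// also handle the remainder.
//
//   size_t CopyPositive(const float* HWY_RESTRICT in, size_t count,
//                       float* HWY_RESTRICT out) {
//     const Full128<float> d;
//     size_t num_written = 0;
//     for (size_t i = 0; i < count; i += Lanes(d)) {
//       const auto v = LoadU(d, in + i);
//       const auto keep = Lt(Zero(d), v);  // lanes with v > 0
//       // CompressStore packs the selected lanes into contiguous memory and
//       // returns how many lanes were stored.
//       num_written += CompressStore(v, keep, d, out + num_written);
//     }
//     return num_written;
//   }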
// ------------------------------ MulEven/Odd (Load) HWY_INLINE Vec128 MulEven(const Vec128 a, const Vec128 b) { alignas(16) uint64_t mul[2]; mul[0] = Mul128(static_cast(wasm_i64x2_extract_lane(a.raw, 0)), static_cast(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]); return Load(Full128(), mul); } HWY_INLINE Vec128 MulOdd(const Vec128 a, const Vec128 b) { alignas(16) uint64_t mul[2]; mul[0] = Mul128(static_cast(wasm_i64x2_extract_lane(a.raw, 1)), static_cast(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]); return Load(Full128(), mul); } // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) template HWY_API Vec128 ReorderWidenMulAccumulate(Simd df32, Vec128 a, Vec128 b, const Vec128 sum0, Vec128& sum1) { const Rebind du32; using VU32 = VFromD; const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32 // Using shift/and instead of Zip leads to the odd/even order that // RearrangeToOddPlusEven prefers. const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); const VU32 ao = And(BitCast(du32, a), odd); const VU32 be = ShiftLeft<16>(BitCast(du32, b)); const VU32 bo = And(BitCast(du32, b), odd); sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); } // Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is // safe. template HWY_API Vec128 ReorderWidenMulAccumulate( Simd /*d32*/, Vec128 a, Vec128 b, const Vec128 sum0, Vec128& /*sum1*/) { return sum0 + Vec128{wasm_i32x4_dot_i16x8(a.raw, b.raw)}; } // ------------------------------ RearrangeToOddPlusEven template HWY_API Vec128 RearrangeToOddPlusEven( const Vec128 sum0, const Vec128 /*sum1*/) { return sum0; // invariant already holds } template HWY_API Vec128 RearrangeToOddPlusEven(const Vec128 sum0, const Vec128 sum1) { return Add(sum0, sum1); } // ------------------------------ Reductions namespace detail { // N=1 for any T: no-op template HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag /* tag */, const Vec128 v) { return v; } template HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag /* tag */, const Vec128 v) { return v; } template HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag /* tag */, const Vec128 v) { return v; } // u32/i32/f32: // N=2 template HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v10) { return v10 + Vec128{Shuffle2301(Vec128{v10.raw}).raw}; } template HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v10) { return Min(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); } template HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v10) { return Max(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); } // N=4 (full) template HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { const Vec128 v1032 = Shuffle1032(v3210); const Vec128 v31_20_31_20 = v3210 + v1032; const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); return v20_31_20_31 + v31_20_31_20; } template HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { const Vec128 v1032 = Shuffle1032(v3210); const Vec128 v31_20_31_20 = Min(v3210, v1032); const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); return Min(v20_31_20_31, v31_20_31_20); } template HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { const Vec128 v1032 = Shuffle1032(v3210); const Vec128 v31_20_31_20 = Max(v3210, v1032); const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); return Max(v20_31_20_31, v31_20_31_20); } // u64/i64/f64: // N=2 (full) template HWY_INLINE Vec128 
SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { const Vec128 v01 = Shuffle01(v10); return v10 + v01; } template HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { const Vec128 v01 = Shuffle01(v10); return Min(v10, v01); } template HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { const Vec128 v01 = Shuffle01(v10); return Max(v10, v01); } template HWY_API Vec128 SumOfLanes(hwy::SizeTag<2> /* tag */, Vec128 v) { const Simd d; const RepartitionToWide d32; const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); const auto odd = ShiftRight<16>(BitCast(d32, v)); const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd); // Also broadcast into odd lanes. return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum)); } template HWY_API Vec128 SumOfLanes(hwy::SizeTag<2> /* tag */, Vec128 v) { const Simd d; const RepartitionToWide d32; // Sign-extend const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); const auto odd = ShiftRight<16>(BitCast(d32, v)); const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd); // Also broadcast into odd lanes. return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum)); } template HWY_API Vec128 MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128 v) { const Simd d; const RepartitionToWide d32; const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); const auto odd = ShiftRight<16>(BitCast(d32, v)); const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd)); // Also broadcast into odd lanes. return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); } template HWY_API Vec128 MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128 v) { const Simd d; const RepartitionToWide d32; // Sign-extend const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); const auto odd = ShiftRight<16>(BitCast(d32, v)); const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd)); // Also broadcast into odd lanes. return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); } template HWY_API Vec128 MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128 v) { const Simd d; const RepartitionToWide d32; const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); const auto odd = ShiftRight<16>(BitCast(d32, v)); const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd)); // Also broadcast into odd lanes. return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); } template HWY_API Vec128 MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128 v) { const Simd d; const RepartitionToWide d32; // Sign-extend const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); const auto odd = ShiftRight<16>(BitCast(d32, v)); const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd)); // Also broadcast into odd lanes. return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); } } // namespace detail // Supported for u/i/f 32/64. Returns the same value in each lane. template HWY_API Vec128 SumOfLanes(Simd /* tag */, const Vec128 v) { return detail::SumOfLanes(hwy::SizeTag(), v); } template HWY_API Vec128 MinOfLanes(Simd /* tag */, const Vec128 v) { return detail::MinOfLanes(hwy::SizeTag(), v); } template HWY_API Vec128 MaxOfLanes(Simd /* tag */, const Vec128 v) { return detail::MaxOfLanes(hwy::SizeTag(), v); } // ------------------------------ Lt128 template HWY_INLINE Mask128 Lt128(Simd d, Vec128 a, Vec128 b) { static_assert(!IsSigned() && sizeof(T) == 8, "T must be u64"); // Truth table of Eq and Lt for Hi and Lo u64. 
  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
  // =H =L cH cL  | out = cH | (=H & cL)
  //  0  0  0  0  |  0
  //  0  0  0  1  |  0
  //  0  0  1  0  |  1
  //  0  0  1  1  |  1
  //  0  1  0  0  |  0
  //  0  1  0  1  |  0
  //  0  1  1  0  |  1
  //  1  0  0  0  |  0
  //  1  0  0  1  |  1
  //  1  1  0  0  |  0
  const Mask128<T, N> eqHL = Eq(a, b);
  const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
  // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
  // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
  // comparison result leftwards requires only 4. IfThenElse compiles to the
  // same code as OrAnd().
  const Vec128<T, N> ltLx = DupEven(ltHL);
  const Vec128<T, N> outHx = IfThenElse(eqHL, ltLx, ltHL);
  return MaskFromVec(DupOdd(outHx));
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
                                    Vec128<T, N> b) {
  const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
  return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
}

// ------------------------------ Eq128

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Eq128(Simd<T, N, 0> d, Vec128<T, N> a,
                               Vec128<T, N> b) {
  static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
  const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
  return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Eq128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
                                    Vec128<T, N> b) {
  const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
  return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
}

// ------------------------------ Ne128

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Ne128(Simd<T, N, 0> d, Vec128<T, N> a,
                               Vec128<T, N> b) {
  static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
  const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
  return MaskFromVec(Or(Reverse2(d, neHL), neHL));
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Ne128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
                                    Vec128<T, N> b) {
  const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
  return MaskFromVec(InterleaveUpper(d, neHL, neHL));
}

// ------------------------------ Min128, Max128 (Lt128)

// Without a native OddEven, it seems infeasible to go faster than Lt128.
template <class D>
HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
  return IfThenElse(Lt128(d, a, b), a, b);
}

template <class D>
HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
  return IfThenElse(Lt128(d, b, a), a, b);
}

template <class D>
HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  return IfThenElse(Lt128Upper(d, a, b), a, b);
}

template <class D>
HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  return IfThenElse(Lt128Upper(d, b, a), a, b);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
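
// Illustrative usage (a sketch, not part of this header; shown as a comment):
// Lt128 compares each 128-bit block as a single unsigned integer, with the low
// 64 bits in lane 0 and the high 64 bits in lane 1. Min128/Max128 then select
// whole blocks, which is the basic compare-exchange step for 128-bit keys. The
// arrays key_a/key_b below are hypothetical aligned uint64_t[2] buffers.
//
//   const Full128<uint64_t> d;
//   const auto a = Load(d, key_a);        // {lo, hi} of key A
//   const auto b = Load(d, key_b);        // {lo, hi} of key B
//   const auto lo128 = Min128(d, a, b);   // smaller 128-bit key
//   const auto hi128 = Max128(d, a, b);   // larger 128-bit key
//   Store(lo128, d, key_a);
//   Store(hi128, d, key_b);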