summaryrefslogtreecommitdiffstats
path: root/third_party/highway/hwy/ops/wasm_128-inl.h
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/highway/hwy/ops/wasm_128-inl.h')
-rw-r--r--third_party/highway/hwy/ops/wasm_128-inl.h4591
1 files changed, 4591 insertions, 0 deletions
diff --git a/third_party/highway/hwy/ops/wasm_128-inl.h b/third_party/highway/hwy/ops/wasm_128-inl.h
new file mode 100644
index 0000000000..095fd4f1f0
--- /dev/null
+++ b/third_party/highway/hwy/ops/wasm_128-inl.h
@@ -0,0 +1,4591 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// 128-bit WASM vectors and operations.
+// External include guard in highway.h - see comment there.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <wasm_simd128.h>
+
+#include "hwy/base.h"
+#include "hwy/ops/shared-inl.h"
+
+#ifdef HWY_WASM_OLD_NAMES
+#define wasm_i8x16_shuffle wasm_v8x16_shuffle
+#define wasm_i16x8_shuffle wasm_v16x8_shuffle
+#define wasm_i32x4_shuffle wasm_v32x4_shuffle
+#define wasm_i64x2_shuffle wasm_v64x2_shuffle
+#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
+#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
+#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
+#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
+#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
+#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
+#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
+#define wasm_u8x16_add_sat wasm_u8x16_add_saturate
+#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
+#define wasm_u16x8_add_sat wasm_u16x8_add_saturate
+#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
+#define wasm_i8x16_add_sat wasm_i8x16_add_saturate
+#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
+#define wasm_i16x8_add_sat wasm_i16x8_add_saturate
+#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+#if HWY_TARGET == HWY_WASM_EMU256
+template <typename T>
+using Full256 = Simd<T, 32 / sizeof(T), 0>;
+#endif
+
+namespace detail {
+
+template <typename T>
+struct Raw128 {
+ using type = __v128_u;
+};
+template <>
+struct Raw128<float> {
+ using type = __f32x4;
+};
+
+} // namespace detail
+
+template <typename T, size_t N = 16 / sizeof(T)>
+class Vec128 {
+ using Raw = typename detail::Raw128<T>::type;
+
+ public:
+ using PrivateT = T; // only for DFromV
+ static constexpr size_t kPrivateN = N; // only for DFromV
+
+ // Compound assignment. Only usable if there is a corresponding non-member
+ // binary operator overload. For example, only f32 and f64 support division.
+ HWY_INLINE Vec128& operator*=(const Vec128 other) {
+ return *this = (*this * other);
+ }
+ HWY_INLINE Vec128& operator/=(const Vec128 other) {
+ return *this = (*this / other);
+ }
+ HWY_INLINE Vec128& operator+=(const Vec128 other) {
+ return *this = (*this + other);
+ }
+ HWY_INLINE Vec128& operator-=(const Vec128 other) {
+ return *this = (*this - other);
+ }
+ HWY_INLINE Vec128& operator&=(const Vec128 other) {
+ return *this = (*this & other);
+ }
+ HWY_INLINE Vec128& operator|=(const Vec128 other) {
+ return *this = (*this | other);
+ }
+ HWY_INLINE Vec128& operator^=(const Vec128 other) {
+ return *this = (*this ^ other);
+ }
+
+ Raw raw;
+};
+
+template <typename T>
+using Vec64 = Vec128<T, 8 / sizeof(T)>;
+
+template <typename T>
+using Vec32 = Vec128<T, 4 / sizeof(T)>;
+
+template <typename T>
+using Vec16 = Vec128<T, 2 / sizeof(T)>;
+
+// FF..FF or 0.
+template <typename T, size_t N = 16 / sizeof(T)>
+struct Mask128 {
+ typename detail::Raw128<T>::type raw;
+};
+
+template <class V>
+using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
+
+template <class V>
+using TFromV = typename V::PrivateT;
+
+// ------------------------------ BitCast
+
+namespace detail {
+
+HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; }
+HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) {
+ return static_cast<__v128_u>(v);
+}
+HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) {
+ return static_cast<__v128_u>(v);
+}
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
+ return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
+}
+
+// Cannot rely on function overloading because return types differ.
+template <typename T>
+struct BitCastFromInteger128 {
+ HWY_INLINE __v128_u operator()(__v128_u v) { return v; }
+};
+template <>
+struct BitCastFromInteger128<float> {
+ HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); }
+};
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N, 0> /* tag */,
+ Vec128<uint8_t, N * sizeof(T)> v) {
+ return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
+}
+
+} // namespace detail
+
+template <typename T, size_t N, typename FromT>
+HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
+ Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
+ return detail::BitCastFromByte(d, detail::BitCastToByte(v));
+}
+
+// ------------------------------ Zero
+
+// Returns an all-zero vector/part.
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
+ return Vec128<T, N>{wasm_i32x4_splat(0)};
+}
+template <size_t N, HWY_IF_LE128(float, N)>
+HWY_API Vec128<float, N> Zero(Simd<float, N, 0> /* tag */) {
+ return Vec128<float, N>{wasm_f32x4_splat(0.0f)};
+}
+
+template <class D>
+using VFromD = decltype(Zero(D()));
+
+// ------------------------------ Set
+
+// Returns a vector/part with all lanes set to "t".
+template <size_t N, HWY_IF_LE128(uint8_t, N)>
+HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N, 0> /* tag */, const uint8_t t) {
+ return Vec128<uint8_t, N>{wasm_i8x16_splat(static_cast<int8_t>(t))};
+}
+template <size_t N, HWY_IF_LE128(uint16_t, N)>
+HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N, 0> /* tag */,
+ const uint16_t t) {
+ return Vec128<uint16_t, N>{wasm_i16x8_splat(static_cast<int16_t>(t))};
+}
+template <size_t N, HWY_IF_LE128(uint32_t, N)>
+HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N, 0> /* tag */,
+ const uint32_t t) {
+ return Vec128<uint32_t, N>{wasm_i32x4_splat(static_cast<int32_t>(t))};
+}
+template <size_t N, HWY_IF_LE128(uint64_t, N)>
+HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N, 0> /* tag */,
+ const uint64_t t) {
+ return Vec128<uint64_t, N>{wasm_i64x2_splat(static_cast<int64_t>(t))};
+}
+
+template <size_t N, HWY_IF_LE128(int8_t, N)>
+HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N, 0> /* tag */, const int8_t t) {
+ return Vec128<int8_t, N>{wasm_i8x16_splat(t)};
+}
+template <size_t N, HWY_IF_LE128(int16_t, N)>
+HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N, 0> /* tag */, const int16_t t) {
+ return Vec128<int16_t, N>{wasm_i16x8_splat(t)};
+}
+template <size_t N, HWY_IF_LE128(int32_t, N)>
+HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N, 0> /* tag */, const int32_t t) {
+ return Vec128<int32_t, N>{wasm_i32x4_splat(t)};
+}
+template <size_t N, HWY_IF_LE128(int64_t, N)>
+HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N, 0> /* tag */, const int64_t t) {
+ return Vec128<int64_t, N>{wasm_i64x2_splat(t)};
+}
+
+template <size_t N, HWY_IF_LE128(float, N)>
+HWY_API Vec128<float, N> Set(Simd<float, N, 0> /* tag */, const float t) {
+ return Vec128<float, N>{wasm_f32x4_splat(t)};
+}
+
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
+
+// Returns a vector with uninitialized elements.
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
+ return Zero(d);
+}
+
+HWY_DIAGNOSTICS(pop)
+
+// Returns a vector with lane i=[0, N) set to "first" + i.
+template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)>
+Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
+ HWY_ALIGN T lanes[16 / sizeof(T)];
+ for (size_t i = 0; i < 16 / sizeof(T); ++i) {
+ lanes[i] =
+ AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i);
+ }
+ return Load(d, lanes);
+}
+
+// ================================================== ARITHMETIC
+
+// ------------------------------ Addition
+
+// Unsigned
+template <size_t N>
+HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
+ const Vec128<uint8_t, N> b) {
+ return Vec128<uint8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
+ const Vec128<uint16_t, N> b) {
+ return Vec128<uint16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
+ const Vec128<uint32_t, N> b) {
+ return Vec128<uint32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
+ const Vec128<uint64_t, N> b) {
+ return Vec128<uint64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
+}
+
+// Signed
+template <size_t N>
+HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
+ const Vec128<int8_t, N> b) {
+ return Vec128<int8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
+ const Vec128<int16_t, N> b) {
+ return Vec128<int16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
+ const Vec128<int32_t, N> b) {
+ return Vec128<int32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
+ const Vec128<int64_t, N> b) {
+ return Vec128<int64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
+}
+
+// Float
+template <size_t N>
+HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
+ const Vec128<float, N> b) {
+ return Vec128<float, N>{wasm_f32x4_add(a.raw, b.raw)};
+}
+
+// ------------------------------ Subtraction
+
+// Unsigned
+template <size_t N>
+HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
+ const Vec128<uint8_t, N> b) {
+ return Vec128<uint8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
+ Vec128<uint16_t, N> b) {
+ return Vec128<uint16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
+ const Vec128<uint32_t, N> b) {
+ return Vec128<uint32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
+ const Vec128<uint64_t, N> b) {
+ return Vec128<uint64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
+}
+
+// Signed
+template <size_t N>
+HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
+ const Vec128<int8_t, N> b) {
+ return Vec128<int8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
+ const Vec128<int16_t, N> b) {
+ return Vec128<int16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
+ const Vec128<int32_t, N> b) {
+ return Vec128<int32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
+ const Vec128<int64_t, N> b) {
+ return Vec128<int64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
+}
+
+// Float
+template <size_t N>
+HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
+ const Vec128<float, N> b) {
+ return Vec128<float, N>{wasm_f32x4_sub(a.raw, b.raw)};
+}
+
+// ------------------------------ SaturatedAdd
+
+// Returns a + b clamped to the destination range.
+
+// Unsigned
+template <size_t N>
+HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
+ const Vec128<uint8_t, N> b) {
+ return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
+ const Vec128<uint16_t, N> b) {
+ return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)};
+}
+
+// Signed
+template <size_t N>
+HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
+ const Vec128<int8_t, N> b) {
+ return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
+ const Vec128<int16_t, N> b) {
+ return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)};
+}
+
+// ------------------------------ SaturatedSub
+
+// Returns a - b clamped to the destination range.
+
+// Unsigned
+template <size_t N>
+HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
+ const Vec128<uint8_t, N> b) {
+ return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
+ const Vec128<uint16_t, N> b) {
+ return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)};
+}
+
+// Signed
+template <size_t N>
+HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
+ const Vec128<int8_t, N> b) {
+ return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
+ const Vec128<int16_t, N> b) {
+ return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)};
+}
+
+// ------------------------------ Average
+
+// Returns (a + b + 1) / 2
+
+// Unsigned
+template <size_t N>
+HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
+ const Vec128<uint8_t, N> b) {
+ return Vec128<uint8_t, N>{wasm_u8x16_avgr(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
+ const Vec128<uint16_t, N> b) {
+ return Vec128<uint16_t, N>{wasm_u16x8_avgr(a.raw, b.raw)};
+}
+
+// ------------------------------ Absolute value
+
+// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
+template <size_t N>
+HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
+ return Vec128<int8_t, N>{wasm_i8x16_abs(v.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
+ return Vec128<int16_t, N>{wasm_i16x8_abs(v.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
+ return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
+ return Vec128<int64_t, N>{wasm_i64x2_abs(v.raw)};
+}
+
+template <size_t N>
+HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
+ return Vec128<float, N>{wasm_f32x4_abs(v.raw)};
+}
+
+// ------------------------------ Shift lanes by constant #bits
+
+// Unsigned
+template <int kBits, size_t N>
+HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
+ return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
+}
+template <int kBits, size_t N>
+HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
+ return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, kBits)};
+}
+template <int kBits, size_t N>
+HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
+ return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
+}
+template <int kBits, size_t N>
+HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
+ return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
+}
+template <int kBits, size_t N>
+HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
+ return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, kBits)};
+}
+template <int kBits, size_t N>
+HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
+ return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, kBits)};
+}
+
+// Signed
+template <int kBits, size_t N>
+HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
+ return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
+}
+template <int kBits, size_t N>
+HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
+ return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, kBits)};
+}
+template <int kBits, size_t N>
+HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
+ return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
+}
+template <int kBits, size_t N>
+HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
+ return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
+}
+template <int kBits, size_t N>
+HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
+ return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)};
+}
+template <int kBits, size_t N>
+HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
+ return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, kBits)};
+}
+
+// 8-bit
+template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
+ const DFromV<decltype(v)> d8;
+ // Use raw instead of BitCast to support N=1.
+ const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
+ return kBits == 1
+ ? (v + v)
+ : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
+}
+
+template <int kBits, size_t N>
+HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
+ const DFromV<decltype(v)> d8;
+ // Use raw instead of BitCast to support N=1.
+ const Vec128<uint8_t, N> shifted{
+ ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
+ return shifted & Set(d8, 0xFF >> kBits);
+}
+
+template <int kBits, size_t N>
+HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
+ const DFromV<decltype(v)> di;
+ const RebindToUnsigned<decltype(di)> du;
+ const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
+ return (shifted ^ shifted_sign) - shifted_sign;
+}
+
+// ------------------------------ RotateRight (ShiftRight, Or)
+template <int kBits, typename T, size_t N>
+HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
+ constexpr size_t kSizeInBits = sizeof(T) * 8;
+ static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
+ if (kBits == 0) return v;
+ return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
+}
+
+// ------------------------------ Shift lanes by same variable #bits
+
+// After https://reviews.llvm.org/D108415 shift argument became unsigned.
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
+
+// Unsigned
+template <size_t N>
+HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
+ const int bits) {
+ return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, bits)};
+}
+template <size_t N>
+HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
+ const int bits) {
+ return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, bits)};
+}
+template <size_t N>
+HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
+ const int bits) {
+ return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, bits)};
+}
+template <size_t N>
+HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
+ const int bits) {
+ return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)};
+}
+template <size_t N>
+HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
+ const int bits) {
+ return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, bits)};
+}
+template <size_t N>
+HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
+ const int bits) {
+ return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, bits)};
+}
+
+// Signed
+template <size_t N>
+HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
+ const int bits) {
+ return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, bits)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
+ const int bits) {
+ return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, bits)};
+}
+template <size_t N>
+HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
+ const int bits) {
+ return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, bits)};
+}
+template <size_t N>
+HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
+ const int bits) {
+ return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, bits)};
+}
+template <size_t N>
+HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
+ const int bits) {
+ return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, bits)};
+}
+template <size_t N>
+HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
+ const int bits) {
+ return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, bits)};
+}
+
+// 8-bit
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
+ const DFromV<decltype(v)> d8;
+ // Use raw instead of BitCast to support N=1.
+ const Vec128<T, N> shifted{
+ ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
+ return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
+}
+
+template <size_t N>
+HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
+ const int bits) {
+ const DFromV<decltype(v)> d8;
+ // Use raw instead of BitCast to support N=1.
+ const Vec128<uint8_t, N> shifted{
+ ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
+ return shifted & Set(d8, 0xFF >> bits);
+}
+
+template <size_t N>
+HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
+ const DFromV<decltype(v)> di;
+ const RebindToUnsigned<decltype(di)> du;
+ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
+ return (shifted ^ shifted_sign) - shifted_sign;
+}
+
+// ignore Wsign-conversion
+HWY_DIAGNOSTICS(pop)
+
+// ------------------------------ Minimum
+
+// Unsigned
+template <size_t N>
+HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
+ return Vec128<uint8_t, N>{wasm_u8x16_min(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
+ return Vec128<uint16_t, N>{wasm_u16x8_min(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
+ return Vec128<uint32_t, N>{wasm_u32x4_min(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
+ // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
+ const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
+ const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
+ const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
+ const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
+ alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)};
+ return Vec128<uint64_t, N>{wasm_v128_load(min)};
+}
+
+// Signed
+template <size_t N>
+HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
+ return Vec128<int8_t, N>{wasm_i8x16_min(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
+ return Vec128<int16_t, N>{wasm_i16x8_min(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
+ return Vec128<int32_t, N>{wasm_i32x4_min(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
+ alignas(16) int64_t min[4];
+ min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
+ wasm_i64x2_extract_lane(b.raw, 0));
+ min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
+ wasm_i64x2_extract_lane(b.raw, 1));
+ return Vec128<int64_t, N>{wasm_v128_load(min)};
+}
+
+// Float
+template <size_t N>
+HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) {
+ // Equivalent to a < b ? a : b (taking into account our swapped arg order,
+ // so that Min(NaN, x) is x to match x86).
+ return Vec128<float, N>{wasm_f32x4_pmin(b.raw, a.raw)};
+}
+
+// ------------------------------ Maximum
+
+// Unsigned
+template <size_t N>
+HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
+ return Vec128<uint8_t, N>{wasm_u8x16_max(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
+ return Vec128<uint16_t, N>{wasm_u16x8_max(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
+ return Vec128<uint32_t, N>{wasm_u32x4_max(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
+ // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
+ const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
+ const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
+ const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
+ const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
+ alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)};
+ return Vec128<uint64_t, N>{wasm_v128_load(max)};
+}
+
+// Signed
+template <size_t N>
+HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
+ return Vec128<int8_t, N>{wasm_i8x16_max(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
+ return Vec128<int16_t, N>{wasm_i16x8_max(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
+ return Vec128<int32_t, N>{wasm_i32x4_max(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
+ alignas(16) int64_t max[2];
+ max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
+ wasm_i64x2_extract_lane(b.raw, 0));
+ max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
+ wasm_i64x2_extract_lane(b.raw, 1));
+ return Vec128<int64_t, N>{wasm_v128_load(max)};
+}
+
+// Float
+template <size_t N>
+HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) {
+ // Equivalent to b < a ? a : b (taking into account our swapped arg order,
+ // so that Max(NaN, x) is x to match x86).
+ return Vec128<float, N>{wasm_f32x4_pmax(b.raw, a.raw)};
+}
+
+// ------------------------------ Integer multiplication
+
+// Unsigned
+template <size_t N>
+HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
+ const Vec128<uint16_t, N> b) {
+ return Vec128<uint16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
+ const Vec128<uint32_t, N> b) {
+ return Vec128<uint32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
+}
+
+// Signed
+template <size_t N>
+HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
+ const Vec128<int16_t, N> b) {
+ return Vec128<int16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
+ const Vec128<int32_t, N> b) {
+ return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
+}
+
+// Returns the upper 16 bits of a * b in each lane.
+template <size_t N>
+HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
+ const Vec128<uint16_t, N> b) {
+ const auto l = wasm_u32x4_extmul_low_u16x8(a.raw, b.raw);
+ const auto h = wasm_u32x4_extmul_high_u16x8(a.raw, b.raw);
+ // TODO(eustas): shift-right + narrow?
+ return Vec128<uint16_t, N>{
+ wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
+ const Vec128<int16_t, N> b) {
+ const auto l = wasm_i32x4_extmul_low_i16x8(a.raw, b.raw);
+ const auto h = wasm_i32x4_extmul_high_i16x8(a.raw, b.raw);
+ // TODO(eustas): shift-right + narrow?
+ return Vec128<int16_t, N>{
+ wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
+}
+
+template <size_t N>
+HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
+ Vec128<int16_t, N> b) {
+ return Vec128<int16_t, N>{wasm_i16x8_q15mulr_sat(a.raw, b.raw)};
+}
+
+// Multiplies even lanes (0, 2 ..) and returns the double-width result.
+template <size_t N>
+HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
+ const Vec128<int32_t, N> b) {
+ const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
+ const auto ae = wasm_v128_and(a.raw, kEvenMask);
+ const auto be = wasm_v128_and(b.raw, kEvenMask);
+ return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
+}
+template <size_t N>
+HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
+ const Vec128<uint32_t, N> b) {
+ const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
+ const auto ae = wasm_v128_and(a.raw, kEvenMask);
+ const auto be = wasm_v128_and(b.raw, kEvenMask);
+ return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
+}
+
+// ------------------------------ Negate
+
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
+HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
+ return Xor(v, SignBit(DFromV<decltype(v)>()));
+}
+
+template <size_t N>
+HWY_API Vec128<int8_t, N> Neg(const Vec128<int8_t, N> v) {
+ return Vec128<int8_t, N>{wasm_i8x16_neg(v.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> Neg(const Vec128<int16_t, N> v) {
+ return Vec128<int16_t, N>{wasm_i16x8_neg(v.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int32_t, N> Neg(const Vec128<int32_t, N> v) {
+ return Vec128<int32_t, N>{wasm_i32x4_neg(v.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int64_t, N> Neg(const Vec128<int64_t, N> v) {
+ return Vec128<int64_t, N>{wasm_i64x2_neg(v.raw)};
+}
+
+// ------------------------------ Floating-point mul / div
+
+template <size_t N>
+HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
+ return Vec128<float, N>{wasm_f32x4_mul(a.raw, b.raw)};
+}
+
+template <size_t N>
+HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
+ const Vec128<float, N> b) {
+ return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)};
+}
+
+// Approximate reciprocal
+template <size_t N>
+HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
+ const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
+ return one / v;
+}
+
+// Absolute value of difference.
+template <size_t N>
+HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
+ const Vec128<float, N> b) {
+ return Abs(a - b);
+}
+
+// ------------------------------ Floating-point multiply-add variants
+
+// Returns mul * x + add
+template <size_t N>
+HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
+ const Vec128<float, N> x,
+ const Vec128<float, N> add) {
+ return mul * x + add;
+}
+
+// Returns add - mul * x
+template <size_t N>
+HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
+ const Vec128<float, N> x,
+ const Vec128<float, N> add) {
+ return add - mul * x;
+}
+
+// Returns mul * x - sub
+template <size_t N>
+HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
+ const Vec128<float, N> x,
+ const Vec128<float, N> sub) {
+ return mul * x - sub;
+}
+
+// Returns -mul * x - sub
+template <size_t N>
+HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
+ const Vec128<float, N> x,
+ const Vec128<float, N> sub) {
+ return Neg(mul) * x - sub;
+}
+
+// ------------------------------ Floating-point square root
+
+// Full precision square root
+template <size_t N>
+HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
+ return Vec128<float, N>{wasm_f32x4_sqrt(v.raw)};
+}
+
+// Approximate reciprocal square root
+template <size_t N>
+HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
+ // TODO(eustas): find cheaper a way to calculate this.
+ const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
+ return one / Sqrt(v);
+}
+
+// ------------------------------ Floating-point rounding
+
+// Toward nearest integer, ties to even
+template <size_t N>
+HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
+ return Vec128<float, N>{wasm_f32x4_nearest(v.raw)};
+}
+
+// Toward zero, aka truncate
+template <size_t N>
+HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
+ return Vec128<float, N>{wasm_f32x4_trunc(v.raw)};
+}
+
+// Toward +infinity, aka ceiling
+template <size_t N>
+HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
+ return Vec128<float, N>{wasm_f32x4_ceil(v.raw)};
+}
+
+// Toward -infinity, aka floor
+template <size_t N>
+HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
+ return Vec128<float, N>{wasm_f32x4_floor(v.raw)};
+}
+
+// ------------------------------ Floating-point classification
+template <typename T, size_t N>
+HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
+ return v != v;
+}
+
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
+HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
+ const Simd<T, N, 0> d;
+ const RebindToSigned<decltype(d)> di;
+ const VFromD<decltype(di)> vi = BitCast(di, v);
+ // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
+ return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
+}
+
+// Returns whether normal/subnormal/zero.
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
+HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
+ const Simd<T, N, 0> d;
+ const RebindToUnsigned<decltype(d)> du;
+ const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
+ const VFromD<decltype(du)> vu = BitCast(du, v);
+ // 'Shift left' to clear the sign bit, then right so we can compare with the
+ // max exponent (cannot compare with MaxExponentTimes2 directly because it is
+ // negative and non-negative floats would be greater).
+ const VFromD<decltype(di)> exp =
+ BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
+ return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
+}
+
+// ================================================== COMPARE
+
+// Comparisons fill a lane with 1-bits if the condition is true, else 0.
+
+template <typename TFrom, typename TTo, size_t N>
+HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
+ Mask128<TFrom, N> m) {
+ static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
+ return Mask128<TTo, N>{m.raw};
+}
+
+template <typename T, size_t N>
+HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
+ static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
+ return (v & bit) == bit;
+}
+
+// ------------------------------ Equality
+
+// Unsigned
+template <size_t N>
+HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
+ const Vec128<uint8_t, N> b) {
+ return Mask128<uint8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
+ const Vec128<uint16_t, N> b) {
+ return Mask128<uint16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
+ const Vec128<uint32_t, N> b) {
+ return Mask128<uint32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
+ const Vec128<uint64_t, N> b) {
+ return Mask128<uint64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
+}
+
+// Signed
+template <size_t N>
+HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
+ const Vec128<int8_t, N> b) {
+ return Mask128<int8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
+ Vec128<int16_t, N> b) {
+ return Mask128<int16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
+ const Vec128<int32_t, N> b) {
+ return Mask128<int32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
+ const Vec128<int64_t, N> b) {
+ return Mask128<int64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
+}
+
+// Float
+template <size_t N>
+HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
+ const Vec128<float, N> b) {
+ return Mask128<float, N>{wasm_f32x4_eq(a.raw, b.raw)};
+}
+
+// ------------------------------ Inequality
+
+// Unsigned
+template <size_t N>
+HWY_API Mask128<uint8_t, N> operator!=(const Vec128<uint8_t, N> a,
+ const Vec128<uint8_t, N> b) {
+ return Mask128<uint8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<uint16_t, N> operator!=(const Vec128<uint16_t, N> a,
+ const Vec128<uint16_t, N> b) {
+ return Mask128<uint16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<uint32_t, N> operator!=(const Vec128<uint32_t, N> a,
+ const Vec128<uint32_t, N> b) {
+ return Mask128<uint32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<uint64_t, N> operator!=(const Vec128<uint64_t, N> a,
+ const Vec128<uint64_t, N> b) {
+ return Mask128<uint64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
+}
+
+// Signed
+template <size_t N>
+HWY_API Mask128<int8_t, N> operator!=(const Vec128<int8_t, N> a,
+ const Vec128<int8_t, N> b) {
+ return Mask128<int8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<int16_t, N> operator!=(const Vec128<int16_t, N> a,
+ const Vec128<int16_t, N> b) {
+ return Mask128<int16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<int32_t, N> operator!=(const Vec128<int32_t, N> a,
+ const Vec128<int32_t, N> b) {
+ return Mask128<int32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<int64_t, N> operator!=(const Vec128<int64_t, N> a,
+ const Vec128<int64_t, N> b) {
+ return Mask128<int64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
+}
+
+// Float
+template <size_t N>
+HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
+ const Vec128<float, N> b) {
+ return Mask128<float, N>{wasm_f32x4_ne(a.raw, b.raw)};
+}
+
+// ------------------------------ Strict inequality
+
+template <size_t N>
+HWY_API Mask128<int8_t, N> operator>(const Vec128<int8_t, N> a,
+ const Vec128<int8_t, N> b) {
+ return Mask128<int8_t, N>{wasm_i8x16_gt(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<int16_t, N> operator>(const Vec128<int16_t, N> a,
+ const Vec128<int16_t, N> b) {
+ return Mask128<int16_t, N>{wasm_i16x8_gt(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<int32_t, N> operator>(const Vec128<int32_t, N> a,
+ const Vec128<int32_t, N> b) {
+ return Mask128<int32_t, N>{wasm_i32x4_gt(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
+ const Vec128<int64_t, N> b) {
+ return Mask128<int64_t, N>{wasm_i64x2_gt(a.raw, b.raw)};
+}
+
+template <size_t N>
+HWY_API Mask128<uint8_t, N> operator>(const Vec128<uint8_t, N> a,
+ const Vec128<uint8_t, N> b) {
+ return Mask128<uint8_t, N>{wasm_u8x16_gt(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<uint16_t, N> operator>(const Vec128<uint16_t, N> a,
+ const Vec128<uint16_t, N> b) {
+ return Mask128<uint16_t, N>{wasm_u16x8_gt(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<uint32_t, N> operator>(const Vec128<uint32_t, N> a,
+ const Vec128<uint32_t, N> b) {
+ return Mask128<uint32_t, N>{wasm_u32x4_gt(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<uint64_t, N> operator>(const Vec128<uint64_t, N> a,
+ const Vec128<uint64_t, N> b) {
+ const DFromV<decltype(a)> d;
+ const Repartition<uint32_t, decltype(d)> d32;
+ const auto a32 = BitCast(d32, a);
+ const auto b32 = BitCast(d32, b);
+ // If the upper halves are not equal, this is the answer.
+ const auto m_gt = a32 > b32;
+
+ // Otherwise, the lower half decides.
+ const auto m_eq = a32 == b32;
+ const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2);
+ const auto lo_gt = And(m_eq, MaskFromVec(VFromD<decltype(d32)>{lo_in_hi}));
+
+ const auto gt = Or(lo_gt, m_gt);
+ // Copy result in upper 32 bits to lower 32 bits.
+ return Mask128<uint64_t, N>{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)};
+}
+
+template <size_t N>
+HWY_API Mask128<float, N> operator>(const Vec128<float, N> a,
+ const Vec128<float, N> b) {
+ return Mask128<float, N>{wasm_f32x4_gt(a.raw, b.raw)};
+}
+
+template <typename T, size_t N>
+HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
+ return operator>(b, a);
+}
+
+// ------------------------------ Weak inequality
+
+// Float <= >=
+template <size_t N>
+HWY_API Mask128<float, N> operator<=(const Vec128<float, N> a,
+ const Vec128<float, N> b) {
+ return Mask128<float, N>{wasm_f32x4_le(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
+ const Vec128<float, N> b) {
+ return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)};
+}
+
+// ------------------------------ FirstN (Iota, Lt)
+
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
+ const RebindToSigned<decltype(d)> di; // Signed comparisons may be cheaper.
+ return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
+}
+
+// ================================================== LOGICAL
+
+// ------------------------------ Not
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
+ return Vec128<T, N>{wasm_v128_not(v.raw)};
+}
+
+// ------------------------------ And
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
+ return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)};
+}
+
+// ------------------------------ AndNot
+
+// Returns ~not_mask & mask.
+template <typename T, size_t N>
+HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
+ return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
+}
+
+// ------------------------------ Or
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
+ return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
+}
+
+// ------------------------------ Xor
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
+ return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
+}
+
+// ------------------------------ Xor3
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
+ return Xor(x1, Xor(x2, x3));
+}
+
+// ------------------------------ Or3
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
+ return Or(o1, Or(o2, o3));
+}
+
+// ------------------------------ OrAnd
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
+ return Or(o, And(a1, a2));
+}
+
+// ------------------------------ IfVecThenElse
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
+ Vec128<T, N> no) {
+ return IfThenElse(MaskFromVec(mask), yes, no);
+}
+
+// ------------------------------ Operator overloads (internal-only if float)
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
+ return And(a, b);
+}
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
+ return Or(a, b);
+}
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
+ return Xor(a, b);
+}
+
+// ------------------------------ CopySign
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
+ const Vec128<T, N> sign) {
+ static_assert(IsFloat<T>(), "Only makes sense for floating-point");
+ const auto msb = SignBit(DFromV<decltype(magn)>());
+ return Or(AndNot(msb, magn), And(msb, sign));
+}
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
+ const Vec128<T, N> sign) {
+ static_assert(IsFloat<T>(), "Only makes sense for floating-point");
+ return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign));
+}
+
+// ------------------------------ BroadcastSignBit (compare)
+
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
+ return ShiftRight<sizeof(T) * 8 - 1>(v);
+}
+template <size_t N>
+HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
+ const DFromV<decltype(v)> d;
+ return VecFromMask(d, v < Zero(d));
+}
+
+// ------------------------------ Mask
+
+// Mask and Vec are the same (true = FF..FF).
+template <typename T, size_t N>
+HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
+ return Mask128<T, N>{v.raw};
+}
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */, Mask128<T, N> v) {
+ return Vec128<T, N>{v.raw};
+}
+
+// mask ? yes : no
+template <typename T, size_t N>
+HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
+ Vec128<T, N> no) {
+ return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
+}
+
+// mask ? yes : 0
+template <typename T, size_t N>
+HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
+ return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
+}
+
+// mask ? 0 : no
+template <typename T, size_t N>
+HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
+ return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
+}
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
+ Vec128<T, N> no) {
+ static_assert(IsSigned<T>(), "Only works for signed/float");
+ const DFromV<decltype(v)> d;
+ const RebindToSigned<decltype(d)> di;
+
+ v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
+ return IfThenElse(MaskFromVec(v), yes, no);
+}
+
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
+HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
+ const DFromV<decltype(v)> d;
+ const auto zero = Zero(d);
+ return IfThenElse(Mask128<T, N>{(v > zero).raw}, v, zero);
+}
+
+// ------------------------------ Mask logical
+
+template <typename T, size_t N>
+HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
+ return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
+}
+
+template <typename T, size_t N>
+HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
+ const Simd<T, N, 0> d;
+ return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
+}
+
+template <typename T, size_t N>
+HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
+ const Simd<T, N, 0> d;
+ return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
+}
+
+template <typename T, size_t N>
+HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
+ const Simd<T, N, 0> d;
+ return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
+}
+
+template <typename T, size_t N>
+HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
+ const Simd<T, N, 0> d;
+ return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
+}
+
+template <typename T, size_t N>
+HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
+ const Simd<T, N, 0> d;
+ return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
+}
+
+// ------------------------------ Shl (BroadcastSignBit, IfThenElse)
+
+// The x86 multiply-by-Pow2() trick will not work because WASM saturates
+// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
+// scalar count operand, per-lane shift instructions would require extract_lane
+// for each lane, and hoping that shuffle is correctly mapped to a native
+// instruction. Using non-vector shifts would incur a store-load forwarding
+// stall when loading the result vector. We instead test bits of the shift
+// count to "predicate" a shift of the entire vector by a constant.
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
+ const DFromV<decltype(v)> d;
+ Mask128<T, N> mask;
+ // Need a signed type for BroadcastSignBit.
+ auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
+ // Move the highest valid bit of the shift count into the sign bit.
+ test = ShiftLeft<12>(test);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ test = ShiftLeft<1>(test); // next bit (descending order)
+ v = IfThenElse(mask, ShiftLeft<8>(v), v);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ test = ShiftLeft<1>(test); // next bit (descending order)
+ v = IfThenElse(mask, ShiftLeft<4>(v), v);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ test = ShiftLeft<1>(test); // next bit (descending order)
+ v = IfThenElse(mask, ShiftLeft<2>(v), v);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ return IfThenElse(mask, ShiftLeft<1>(v), v);
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
+ const DFromV<decltype(v)> d;
+ Mask128<T, N> mask;
+ // Need a signed type for BroadcastSignBit.
+ auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
+ // Move the highest valid bit of the shift count into the sign bit.
+ test = ShiftLeft<27>(test);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ test = ShiftLeft<1>(test); // next bit (descending order)
+ v = IfThenElse(mask, ShiftLeft<16>(v), v);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ test = ShiftLeft<1>(test); // next bit (descending order)
+ v = IfThenElse(mask, ShiftLeft<8>(v), v);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ test = ShiftLeft<1>(test); // next bit (descending order)
+ v = IfThenElse(mask, ShiftLeft<4>(v), v);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ test = ShiftLeft<1>(test); // next bit (descending order)
+ v = IfThenElse(mask, ShiftLeft<2>(v), v);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ return IfThenElse(mask, ShiftLeft<1>(v), v);
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
+ const DFromV<decltype(v)> d;
+ alignas(16) T lanes[2];
+ alignas(16) T bits_lanes[2];
+ Store(v, d, lanes);
+ Store(bits, d, bits_lanes);
+ lanes[0] <<= bits_lanes[0];
+ lanes[1] <<= bits_lanes[1];
+ return Load(d, lanes);
+}
+
+// ------------------------------ Shr (BroadcastSignBit, IfThenElse)
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
+ const DFromV<decltype(v)> d;
+ Mask128<T, N> mask;
+ // Need a signed type for BroadcastSignBit.
+ auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
+ // Move the highest valid bit of the shift count into the sign bit.
+ test = ShiftLeft<12>(test);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ test = ShiftLeft<1>(test); // next bit (descending order)
+ v = IfThenElse(mask, ShiftRight<8>(v), v);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ test = ShiftLeft<1>(test); // next bit (descending order)
+ v = IfThenElse(mask, ShiftRight<4>(v), v);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ test = ShiftLeft<1>(test); // next bit (descending order)
+ v = IfThenElse(mask, ShiftRight<2>(v), v);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ return IfThenElse(mask, ShiftRight<1>(v), v);
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
+ const DFromV<decltype(v)> d;
+ Mask128<T, N> mask;
+ // Need a signed type for BroadcastSignBit.
+ auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
+ // Move the highest valid bit of the shift count into the sign bit.
+ test = ShiftLeft<27>(test);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ test = ShiftLeft<1>(test); // next bit (descending order)
+ v = IfThenElse(mask, ShiftRight<16>(v), v);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ test = ShiftLeft<1>(test); // next bit (descending order)
+ v = IfThenElse(mask, ShiftRight<8>(v), v);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ test = ShiftLeft<1>(test); // next bit (descending order)
+ v = IfThenElse(mask, ShiftRight<4>(v), v);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ test = ShiftLeft<1>(test); // next bit (descending order)
+ v = IfThenElse(mask, ShiftRight<2>(v), v);
+
+ mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
+ return IfThenElse(mask, ShiftRight<1>(v), v);
+}
+
+// ================================================== MEMORY
+
+// ------------------------------ Load
+
+template <typename T>
+HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
+ return Vec128<T>{wasm_v128_load(aligned)};
+}
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
+ const T* HWY_RESTRICT aligned) {
+ return IfThenElseZero(m, Load(d, aligned));
+}
+
+// Partial load.
+template <typename T, size_t N, HWY_IF_LE64(T, N)>
+HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
+ Vec128<T, N> v;
+ CopyBytes<sizeof(T) * N>(p, &v);
+ return v;
+}
+
+// LoadU == Load.
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
+ return Load(d, p);
+}
+
+// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
+ return Load(d, p);
+}
+
+// ------------------------------ Store
+
+template <typename T>
+HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
+ wasm_v128_store(aligned, v.raw);
+}
+
+// Partial store.
+template <typename T, size_t N, HWY_IF_LE64(T, N)>
+HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
+ CopyBytes<sizeof(T) * N>(&v, p);
+}
+
+HWY_API void Store(const Vec128<float, 1> v, Simd<float, 1, 0> /* tag */,
+ float* HWY_RESTRICT p) {
+ *p = wasm_f32x4_extract_lane(v.raw, 0);
+}
+
+// StoreU == Store.
+template <typename T, size_t N>
+HWY_API void StoreU(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
+ Store(v, d, p);
+}
+
+template <typename T, size_t N>
+HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
+ T* HWY_RESTRICT p) {
+ StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
+}
+
+// ------------------------------ Non-temporal stores
+
+// Same as aligned stores on non-x86.
+
+template <typename T, size_t N>
+HWY_API void Stream(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
+ T* HWY_RESTRICT aligned) {
+ wasm_v128_store(aligned, v.raw);
+}
+
+// ------------------------------ Scatter (Store)
+
+template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
+HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
+ T* HWY_RESTRICT base,
+ const Vec128<Offset, N> offset) {
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+
+ alignas(16) T lanes[N];
+ Store(v, d, lanes);
+
+ alignas(16) Offset offset_lanes[N];
+ Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
+
+ uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
+ for (size_t i = 0; i < N; ++i) {
+ CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
+ }
+}
+
+template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
+HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
+ const Vec128<Index, N> index) {
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+
+ alignas(16) T lanes[N];
+ Store(v, d, lanes);
+
+ alignas(16) Index index_lanes[N];
+ Store(index, Rebind<Index, decltype(d)>(), index_lanes);
+
+ for (size_t i = 0; i < N; ++i) {
+ base[index_lanes[i]] = lanes[i];
+ }
+}
+
+// ------------------------------ Gather (Load/Store)
+
+template <typename T, size_t N, typename Offset>
+HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
+ const T* HWY_RESTRICT base,
+ const Vec128<Offset, N> offset) {
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+
+ alignas(16) Offset offset_lanes[N];
+ Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
+
+ alignas(16) T lanes[N];
+ const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
+ for (size_t i = 0; i < N; ++i) {
+ CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
+ }
+ return Load(d, lanes);
+}
+
+template <typename T, size_t N, typename Index>
+HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
+ const T* HWY_RESTRICT base,
+ const Vec128<Index, N> index) {
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+
+ alignas(16) Index index_lanes[N];
+ Store(index, Rebind<Index, decltype(d)>(), index_lanes);
+
+ alignas(16) T lanes[N];
+ for (size_t i = 0; i < N; ++i) {
+ lanes[i] = base[index_lanes[i]];
+ }
+ return Load(d, lanes);
+}
+
+// ================================================== SWIZZLE
+
+// ------------------------------ ExtractLane
+
+namespace detail {
+
+template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
+HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
+ return static_cast<T>(wasm_i8x16_extract_lane(v.raw, kLane));
+}
+template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
+ return static_cast<T>(wasm_i16x8_extract_lane(v.raw, kLane));
+}
+template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
+ return static_cast<T>(wasm_i32x4_extract_lane(v.raw, kLane));
+}
+template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
+ return static_cast<T>(wasm_i64x2_extract_lane(v.raw, kLane));
+}
+
+template <size_t kLane, size_t N>
+HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
+ return wasm_f32x4_extract_lane(v.raw, kLane);
+}
+
+} // namespace detail
+
+// One overload per vector length just in case *_extract_lane raise compile
+// errors if their argument is out of bounds (even if that would never be
+// reached at runtime).
+template <typename T>
+HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
+ HWY_DASSERT(i == 0);
+ (void)i;
+ return GetLane(v);
+}
+
+template <typename T>
+HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
+#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
+ if (__builtin_constant_p(i)) {
+ switch (i) {
+ case 0:
+ return detail::ExtractLane<0>(v);
+ case 1:
+ return detail::ExtractLane<1>(v);
+ }
+ }
+#endif
+ alignas(16) T lanes[2];
+ Store(v, DFromV<decltype(v)>(), lanes);
+ return lanes[i];
+}
+
+template <typename T>
+HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
+#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
+ if (__builtin_constant_p(i)) {
+ switch (i) {
+ case 0:
+ return detail::ExtractLane<0>(v);
+ case 1:
+ return detail::ExtractLane<1>(v);
+ case 2:
+ return detail::ExtractLane<2>(v);
+ case 3:
+ return detail::ExtractLane<3>(v);
+ }
+ }
+#endif
+ alignas(16) T lanes[4];
+ Store(v, DFromV<decltype(v)>(), lanes);
+ return lanes[i];
+}
+
+template <typename T>
+HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
+#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
+ if (__builtin_constant_p(i)) {
+ switch (i) {
+ case 0:
+ return detail::ExtractLane<0>(v);
+ case 1:
+ return detail::ExtractLane<1>(v);
+ case 2:
+ return detail::ExtractLane<2>(v);
+ case 3:
+ return detail::ExtractLane<3>(v);
+ case 4:
+ return detail::ExtractLane<4>(v);
+ case 5:
+ return detail::ExtractLane<5>(v);
+ case 6:
+ return detail::ExtractLane<6>(v);
+ case 7:
+ return detail::ExtractLane<7>(v);
+ }
+ }
+#endif
+ alignas(16) T lanes[8];
+ Store(v, DFromV<decltype(v)>(), lanes);
+ return lanes[i];
+}
+
+template <typename T>
+HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
+#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
+ if (__builtin_constant_p(i)) {
+ switch (i) {
+ case 0:
+ return detail::ExtractLane<0>(v);
+ case 1:
+ return detail::ExtractLane<1>(v);
+ case 2:
+ return detail::ExtractLane<2>(v);
+ case 3:
+ return detail::ExtractLane<3>(v);
+ case 4:
+ return detail::ExtractLane<4>(v);
+ case 5:
+ return detail::ExtractLane<5>(v);
+ case 6:
+ return detail::ExtractLane<6>(v);
+ case 7:
+ return detail::ExtractLane<7>(v);
+ case 8:
+ return detail::ExtractLane<8>(v);
+ case 9:
+ return detail::ExtractLane<9>(v);
+ case 10:
+ return detail::ExtractLane<10>(v);
+ case 11:
+ return detail::ExtractLane<11>(v);
+ case 12:
+ return detail::ExtractLane<12>(v);
+ case 13:
+ return detail::ExtractLane<13>(v);
+ case 14:
+ return detail::ExtractLane<14>(v);
+ case 15:
+ return detail::ExtractLane<15>(v);
+ }
+ }
+#endif
+ alignas(16) T lanes[16];
+ Store(v, DFromV<decltype(v)>(), lanes);
+ return lanes[i];
+}
+
+// ------------------------------ GetLane
+template <typename T, size_t N>
+HWY_API T GetLane(const Vec128<T, N> v) {
+ return detail::ExtractLane<0>(v);
+}
+
+// ------------------------------ InsertLane
+
+namespace detail {
+
+template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
+HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
+ static_assert(kLane < N, "Lane index out of bounds");
+ return Vec128<T, N>{
+ wasm_i8x16_replace_lane(v.raw, kLane, static_cast<int8_t>(t))};
+}
+
+template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
+ static_assert(kLane < N, "Lane index out of bounds");
+ return Vec128<T, N>{
+ wasm_i16x8_replace_lane(v.raw, kLane, static_cast<int16_t>(t))};
+}
+
+template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
+ static_assert(kLane < N, "Lane index out of bounds");
+ return Vec128<T, N>{
+ wasm_i32x4_replace_lane(v.raw, kLane, static_cast<int32_t>(t))};
+}
+
+template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
+ static_assert(kLane < N, "Lane index out of bounds");
+ return Vec128<T, N>{
+ wasm_i64x2_replace_lane(v.raw, kLane, static_cast<int64_t>(t))};
+}
+
+template <size_t kLane, size_t N>
+HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) {
+ static_assert(kLane < N, "Lane index out of bounds");
+ return Vec128<float, N>{wasm_f32x4_replace_lane(v.raw, kLane, t)};
+}
+
+template <size_t kLane, size_t N>
+HWY_INLINE Vec128<double, N> InsertLane(const Vec128<double, N> v, double t) {
+ static_assert(kLane < 2, "Lane index out of bounds");
+ return Vec128<double, N>{wasm_f64x2_replace_lane(v.raw, kLane, t)};
+}
+
+} // namespace detail
+
+// Requires one overload per vector length because InsertLane<3> may be a
+// compile error if it calls wasm_f64x2_replace_lane.
+
+template <typename T>
+HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
+ HWY_DASSERT(i == 0);
+ (void)i;
+ return Set(DFromV<decltype(v)>(), t);
+}
+
+template <typename T>
+HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
+#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
+ if (__builtin_constant_p(i)) {
+ switch (i) {
+ case 0:
+ return detail::InsertLane<0>(v, t);
+ case 1:
+ return detail::InsertLane<1>(v, t);
+ }
+ }
+#endif
+ const DFromV<decltype(v)> d;
+ alignas(16) T lanes[2];
+ Store(v, d, lanes);
+ lanes[i] = t;
+ return Load(d, lanes);
+}
+
+template <typename T>
+HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
+#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
+ if (__builtin_constant_p(i)) {
+ switch (i) {
+ case 0:
+ return detail::InsertLane<0>(v, t);
+ case 1:
+ return detail::InsertLane<1>(v, t);
+ case 2:
+ return detail::InsertLane<2>(v, t);
+ case 3:
+ return detail::InsertLane<3>(v, t);
+ }
+ }
+#endif
+ const DFromV<decltype(v)> d;
+ alignas(16) T lanes[4];
+ Store(v, d, lanes);
+ lanes[i] = t;
+ return Load(d, lanes);
+}
+
+template <typename T>
+HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
+#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
+ if (__builtin_constant_p(i)) {
+ switch (i) {
+ case 0:
+ return detail::InsertLane<0>(v, t);
+ case 1:
+ return detail::InsertLane<1>(v, t);
+ case 2:
+ return detail::InsertLane<2>(v, t);
+ case 3:
+ return detail::InsertLane<3>(v, t);
+ case 4:
+ return detail::InsertLane<4>(v, t);
+ case 5:
+ return detail::InsertLane<5>(v, t);
+ case 6:
+ return detail::InsertLane<6>(v, t);
+ case 7:
+ return detail::InsertLane<7>(v, t);
+ }
+ }
+#endif
+ const DFromV<decltype(v)> d;
+ alignas(16) T lanes[8];
+ Store(v, d, lanes);
+ lanes[i] = t;
+ return Load(d, lanes);
+}
+
+template <typename T>
+HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
+#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
+ if (__builtin_constant_p(i)) {
+ switch (i) {
+ case 0:
+ return detail::InsertLane<0>(v, t);
+ case 1:
+ return detail::InsertLane<1>(v, t);
+ case 2:
+ return detail::InsertLane<2>(v, t);
+ case 3:
+ return detail::InsertLane<3>(v, t);
+ case 4:
+ return detail::InsertLane<4>(v, t);
+ case 5:
+ return detail::InsertLane<5>(v, t);
+ case 6:
+ return detail::InsertLane<6>(v, t);
+ case 7:
+ return detail::InsertLane<7>(v, t);
+ case 8:
+ return detail::InsertLane<8>(v, t);
+ case 9:
+ return detail::InsertLane<9>(v, t);
+ case 10:
+ return detail::InsertLane<10>(v, t);
+ case 11:
+ return detail::InsertLane<11>(v, t);
+ case 12:
+ return detail::InsertLane<12>(v, t);
+ case 13:
+ return detail::InsertLane<13>(v, t);
+ case 14:
+ return detail::InsertLane<14>(v, t);
+ case 15:
+ return detail::InsertLane<15>(v, t);
+ }
+ }
+#endif
+ const DFromV<decltype(v)> d;
+ alignas(16) T lanes[16];
+ Store(v, d, lanes);
+ lanes[i] = t;
+ return Load(d, lanes);
+}
+
+// ------------------------------ LowerHalf
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
+ Vec128<T, N> v) {
+ return Vec128<T, N / 2>{v.raw};
+}
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
+ return LowerHalf(Simd<T, N / 2, 0>(), v);
+}
+
+// ------------------------------ ShiftLeftBytes
+
+// 0x01..0F, kBytes = 1 => 0x02..0F00
+template <int kBytes, typename T, size_t N>
+HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
+ static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
+ const __i8x16 zero = wasm_i8x16_splat(0);
+ switch (kBytes) {
+ case 0:
+ return v;
+
+ case 1:
+ return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14)};
+
+ case 2:
+ return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4,
+ 5, 6, 7, 8, 9, 10, 11, 12, 13)};
+
+ case 3:
+ return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2,
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)};
+
+ case 4:
+ return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)};
+
+ case 5:
+ return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)};
+
+ case 6:
+ return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
+ 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
+
+ case 7:
+ return Vec128<T, N>{wasm_i8x16_shuffle(
+ v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
+
+ case 8:
+ return Vec128<T, N>{wasm_i8x16_shuffle(
+ v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
+
+ case 9:
+ return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 0, 1, 2, 3, 4, 5,
+ 6)};
+
+ case 10:
+ return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 0, 1, 2, 3, 4,
+ 5)};
+
+ case 11:
+ return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 0, 1, 2, 3,
+ 4)};
+
+ case 12:
+ return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 0, 1,
+ 2, 3)};
+
+ case 13:
+ return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 0,
+ 1, 2)};
+
+ case 14:
+ return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 0, 1)};
+
+ case 15:
+ return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 0)};
+ }
+ return Vec128<T, N>{zero};
+}
+
+template <int kBytes, typename T, size_t N>
+HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
+ return ShiftLeftBytes<kBytes>(Simd<T, N, 0>(), v);
+}
+
+// ------------------------------ ShiftLeftLanes
+
+template <int kLanes, typename T, size_t N>
+HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
+ const Repartition<uint8_t, decltype(d)> d8;
+ return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
+}
+
+template <int kLanes, typename T, size_t N>
+HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
+ return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
+}
+
+// ------------------------------ ShiftRightBytes
+namespace detail {
+
+// Helper function allows zeroing invalid lanes in caller.
+template <int kBytes, typename T, size_t N>
+HWY_API __i8x16 ShrBytes(const Vec128<T, N> v) {
+ static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
+ const __i8x16 zero = wasm_i8x16_splat(0);
+
+ switch (kBytes) {
+ case 0:
+ return v.raw;
+
+ case 1:
+ return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, 16);
+
+ case 2:
+ return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 16);
+
+ case 3:
+ return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 16, 16);
+
+ case 4:
+ return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 15, 16, 16, 16, 16);
+
+ case 5:
+ return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 16, 16, 16, 16);
+
+ case 6:
+ return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 16, 16, 16, 16, 16);
+
+ case 7:
+ return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 16, 16, 16, 16, 16, 16);
+
+ case 8:
+ return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 16, 16, 16, 16, 16, 16, 16);
+
+ case 9:
+ return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16);
+
+ case 10:
+ return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16);
+
+ case 11:
+ return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16);
+
+ case 12:
+ return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16);
+
+ case 13:
+ return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16);
+
+ case 14:
+ return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16);
+
+ case 15:
+ return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16);
+ case 16:
+ return zero;
+ }
+}
+
+} // namespace detail
+
+// 0x01..0F, kBytes = 1 => 0x0001..0E
+template <int kBytes, typename T, size_t N>
+HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
+ // For partial vectors, clear upper lanes so we shift in zeros.
+ if (N != 16 / sizeof(T)) {
+ const Vec128<T> vfull{v.raw};
+ v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
+ }
+ return Vec128<T, N>{detail::ShrBytes<kBytes>(v)};
+}
+
+// ------------------------------ ShiftRightLanes
+template <int kLanes, typename T, size_t N>
+HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
+ const Repartition<uint8_t, decltype(d)> d8;
+ return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
+}
+
+// ------------------------------ UpperHalf (ShiftRightBytes)
+
+// Full input: copy hi into lo (smaller instruction encoding than shifts).
+template <typename T>
+HWY_API Vec64<T> UpperHalf(Full64<T> /* tag */, const Vec128<T> v) {
+ return Vec64<T>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
+}
+HWY_API Vec64<float> UpperHalf(Full64<float> /* tag */, const Vec128<float> v) {
+ return Vec64<float>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
+}
+
+// Partial
+template <typename T, size_t N, HWY_IF_LE64(T, N)>
+HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
+ Vec128<T, N> v) {
+ const DFromV<decltype(v)> d;
+ const RebindToUnsigned<decltype(d)> du;
+ const auto vu = BitCast(du, v);
+ const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
+ return Vec128<T, (N + 1) / 2>{upper.raw};
+}
+
+// ------------------------------ CombineShiftRightBytes
+
+template <int kBytes, typename T, class V = Vec128<T>>
+HWY_API V CombineShiftRightBytes(Full128<T> /* tag */, V hi, V lo) {
+ static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
+ switch (kBytes) {
+ case 0:
+ return lo;
+
+ case 1:
+ return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, 16)};
+
+ case 2:
+ return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, 16, 17)};
+
+ case 3:
+ return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, 16, 17, 18)};
+
+ case 4:
+ return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19)};
+
+ case 5:
+ return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 15, 16, 17, 18, 19, 20)};
+
+ case 6:
+ return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 15, 16, 17, 18, 19, 20, 21)};
+
+ case 7:
+ return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22)};
+
+ case 8:
+ return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23)};
+
+ case 9:
+ return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24)};
+
+ case 10:
+ return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25)};
+
+ case 11:
+ return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
+ 18, 19, 20, 21, 22, 23, 24, 25, 26)};
+
+ case 12:
+ return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27)};
+
+ case 13:
+ return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28)};
+
+ case 14:
+ return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25, 26, 27, 28, 29)};
+
+ case 15:
+ return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
+ 22, 23, 24, 25, 26, 27, 28, 29, 30)};
+ }
+ return hi;
+}
+
+template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
+ class V = Vec128<T, N>>
+HWY_API V CombineShiftRightBytes(Simd<T, N, 0> d, V hi, V lo) {
+ constexpr size_t kSize = N * sizeof(T);
+ static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
+ const Repartition<uint8_t, decltype(d)> d8;
+ const Full128<uint8_t> d_full8;
+ using V8 = VFromD<decltype(d_full8)>;
+ const V8 hi8{BitCast(d8, hi).raw};
+ // Move into most-significant bytes
+ const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
+ const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
+ return V{BitCast(Full128<T>(), r).raw};
+}
+
+// ------------------------------ Broadcast/splat any lane
+
+template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
+ static_assert(0 <= kLane && kLane < N, "Invalid lane");
+ return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
+ kLane, kLane, kLane, kLane, kLane)};
+}
+
+template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
+ static_assert(0 <= kLane && kLane < N, "Invalid lane");
+ return Vec128<T, N>{
+ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
+}
+
+template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
+ static_assert(0 <= kLane && kLane < N, "Invalid lane");
+ return Vec128<T, N>{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)};
+}
+
+// ------------------------------ TableLookupBytes
+
+// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
+// lane indices in [0, 16).
+template <typename T, size_t N, typename TI, size_t NI>
+HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
+ const Vec128<TI, NI> from) {
+// Not yet available in all engines, see
+// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md
+// V8 implementation of this had a bug, fixed on 2021-04-03:
+// https://chromium-review.googlesource.com/c/v8/v8/+/2822951
+#if 0
+ return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
+#else
+ alignas(16) uint8_t control[16];
+ alignas(16) uint8_t input[16];
+ alignas(16) uint8_t output[16];
+ wasm_v128_store(control, from.raw);
+ wasm_v128_store(input, bytes.raw);
+ for (size_t i = 0; i < 16; ++i) {
+ output[i] = control[i] < 16 ? input[control[i]] : 0;
+ }
+ return Vec128<TI, NI>{wasm_v128_load(output)};
+#endif
+}
+
+template <typename T, size_t N, typename TI, size_t NI>
+HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> bytes,
+ const Vec128<TI, NI> from) {
+ const Simd<TI, NI, 0> d;
+ // Mask size must match vector type, so cast everything to this type.
+ Repartition<int8_t, decltype(d)> di8;
+ Repartition<int8_t, Simd<T, N, 0>> d_bytes8;
+ const auto msb = BitCast(di8, from) < Zero(di8);
+ const auto lookup =
+ TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from));
+ return BitCast(d, IfThenZeroElse(msb, lookup));
+}
+
+// ------------------------------ Hard-coded shuffles
+
+// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
+// Shuffle0321 rotates one lane to the right (the previous least-significant
+// lane is now most-significant). These could also be implemented via
+// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
+
+// Swap 32-bit halves in 64-bit halves.
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
+ static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
+ static_assert(N == 2 || N == 4, "Does not make sense for N=1");
+ return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
+}
+
+// These are used by generic_ops-inl to implement LoadInterleaved3.
+namespace detail {
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> a, const Vec128<T, N> b) {
+ static_assert(N == 2 || N == 4, "Does not make sense for N=1");
+ return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 0, 3 + 16, 2 + 16,
+ 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
+ 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
+}
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> a, const Vec128<T, N> b) {
+ static_assert(N == 2 || N == 4, "Does not make sense for N=1");
+ return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 0, 3 + 8, 2 + 8,
+ 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
+}
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> a, const Vec128<T, N> b) {
+ static_assert(N == 2 || N == 4, "Does not make sense for N=1");
+ return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)};
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, N> Shuffle1230(const Vec128<T, N> a, const Vec128<T, N> b) {
+ static_assert(N == 2 || N == 4, "Does not make sense for N=1");
+ return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 3, 2 + 16, 1 + 16,
+ 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
+ 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
+}
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T, N> Shuffle1230(const Vec128<T, N> a, const Vec128<T, N> b) {
+ static_assert(N == 2 || N == 4, "Does not make sense for N=1");
+ return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 3, 2 + 8, 1 + 8,
+ 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
+}
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T, N> Shuffle1230(const Vec128<T, N> a, const Vec128<T, N> b) {
+ static_assert(N == 2 || N == 4, "Does not make sense for N=1");
+ return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)};
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, N> Shuffle3012(const Vec128<T, N> a, const Vec128<T, N> b) {
+ static_assert(N == 2 || N == 4, "Does not make sense for N=1");
+ return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 2, 1, 0 + 16, 3 + 16,
+ 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
+ 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
+}
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T, N> Shuffle3012(const Vec128<T, N> a, const Vec128<T, N> b) {
+ static_assert(N == 2 || N == 4, "Does not make sense for N=1");
+ return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 2, 1, 0 + 8, 3 + 8,
+ 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
+}
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T, N> Shuffle3012(const Vec128<T, N> a, const Vec128<T, N> b) {
+ static_assert(N == 2 || N == 4, "Does not make sense for N=1");
+ return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)};
+}
+
+} // namespace detail
+
+// Swap 64-bit halves
+template <typename T>
+HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
+ static_assert(sizeof(T) == 8, "Only for 64-bit lanes");
+ return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
+}
+template <typename T>
+HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
+ static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
+ return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
+}
+
+// Rotate right 32 bits
+template <typename T>
+HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
+ static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
+ return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
+}
+
+// Rotate left 32 bits
+template <typename T>
+HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
+ static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
+ return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
+}
+
+// Reverse
+template <typename T>
+HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
+ static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
+ return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
+}
+
+// ------------------------------ TableLookupLanes
+
+// Returned by SetTableIndices for use by TableLookupLanes.
+template <typename T, size_t N = 16 / sizeof(T)>
+struct Indices128 {
+ __v128_u raw;
+};
+
+template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
+HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
+ static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
+#if HWY_IS_DEBUG_BUILD
+ const Rebind<TI, decltype(d)> di;
+ HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
+ AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
+#endif
+
+ const Repartition<uint8_t, decltype(d)> d8;
+ using V8 = VFromD<decltype(d8)>;
+ const Repartition<uint16_t, decltype(d)> d16;
+
+ // Broadcast each lane index to all bytes of T and shift to bytes
+ static_assert(sizeof(T) == 4 || sizeof(T) == 8, "");
+ if (sizeof(T) == 4) {
+ alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
+ 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
+ const V8 lane_indices =
+ TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
+ const V8 byte_indices =
+ BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
+ alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 0, 1, 2, 3};
+ return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
+ } else {
+ alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
+ const V8 lane_indices =
+ TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
+ const V8 byte_indices =
+ BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices)));
+ alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
+ 0, 1, 2, 3, 4, 5, 6, 7};
+ return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
+ }
+}
+
+template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
+HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
+ const Rebind<TI, decltype(d)> di;
+ return IndicesFromVec(d, LoadU(di, idx));
+}
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
+ using TI = MakeSigned<T>;
+ const DFromV<decltype(v)> d;
+ const Rebind<TI, decltype(d)> di;
+ return BitCast(d, TableLookupBytes(BitCast(di, v), Vec128<TI, N>{idx.raw}));
+}
+
+// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)
+
+// Single lane: no change
+template <typename T>
+HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
+ return v;
+}
+
+// Two lanes: shuffle
+template <typename T, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T, 2> Reverse(Simd<T, 2, 0> /* tag */, const Vec128<T, 2> v) {
+ return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
+}
+
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
+ return Shuffle01(v);
+}
+
+// Four lanes: shuffle
+template <typename T, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
+ return Shuffle0123(v);
+}
+
+// 16-bit
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
+ const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
+ return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
+}
+
+// ------------------------------ Reverse2
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
+ const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
+ return BitCast(d, RotateRight<16>(BitCast(du32, v)));
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
+ return Shuffle2301(v);
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
+ return Shuffle01(v);
+}
+
+// ------------------------------ Reverse4
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
+ return BitCast(d, Vec128<uint16_t, N>{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2,
+ 1, 0, 7, 6, 5, 4)});
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
+ return Shuffle0123(v);
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N>) {
+ HWY_ASSERT(0); // don't have 8 u64 lanes
+}
+
+// ------------------------------ Reverse8
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
+ return Reverse(d, v);
+}
+
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
+HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0>, const Vec128<T, N>) {
+ HWY_ASSERT(0); // don't have 8 lanes unless 16-bit
+}
+
+// ------------------------------ InterleaveLower
+
+template <size_t N>
+HWY_API Vec128<uint8_t, N> InterleaveLower(Vec128<uint8_t, N> a,
+ Vec128<uint8_t, N> b) {
+ return Vec128<uint8_t, N>{wasm_i8x16_shuffle(
+ a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
+}
+template <size_t N>
+HWY_API Vec128<uint16_t, N> InterleaveLower(Vec128<uint16_t, N> a,
+ Vec128<uint16_t, N> b) {
+ return Vec128<uint16_t, N>{
+ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
+}
+template <size_t N>
+HWY_API Vec128<uint32_t, N> InterleaveLower(Vec128<uint32_t, N> a,
+ Vec128<uint32_t, N> b) {
+ return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
+}
+template <size_t N>
+HWY_API Vec128<uint64_t, N> InterleaveLower(Vec128<uint64_t, N> a,
+ Vec128<uint64_t, N> b) {
+ return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
+}
+
+template <size_t N>
+HWY_API Vec128<int8_t, N> InterleaveLower(Vec128<int8_t, N> a,
+ Vec128<int8_t, N> b) {
+ return Vec128<int8_t, N>{wasm_i8x16_shuffle(
+ a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> InterleaveLower(Vec128<int16_t, N> a,
+ Vec128<int16_t, N> b) {
+ return Vec128<int16_t, N>{
+ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
+}
+template <size_t N>
+HWY_API Vec128<int32_t, N> InterleaveLower(Vec128<int32_t, N> a,
+ Vec128<int32_t, N> b) {
+ return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
+}
+template <size_t N>
+HWY_API Vec128<int64_t, N> InterleaveLower(Vec128<int64_t, N> a,
+ Vec128<int64_t, N> b) {
+ return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
+}
+
+template <size_t N>
+HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a,
+ Vec128<float, N> b) {
+ return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
+}
+
+template <size_t N>
+HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a,
+ Vec128<double, N> b) {
+ return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
+}
+
+// Additional overload for the optional tag.
+template <class V>
+HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
+ return InterleaveLower(a, b);
+}
+
+// ------------------------------ InterleaveUpper (UpperHalf)
+
+// All functions inside detail lack the required D parameter.
+namespace detail {
+
+template <size_t N>
+HWY_API Vec128<uint8_t, N> InterleaveUpper(Vec128<uint8_t, N> a,
+ Vec128<uint8_t, N> b) {
+ return Vec128<uint8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
+ 26, 11, 27, 12, 28, 13, 29, 14,
+ 30, 15, 31)};
+}
+template <size_t N>
+HWY_API Vec128<uint16_t, N> InterleaveUpper(Vec128<uint16_t, N> a,
+ Vec128<uint16_t, N> b) {
+ return Vec128<uint16_t, N>{
+ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
+}
+template <size_t N>
+HWY_API Vec128<uint32_t, N> InterleaveUpper(Vec128<uint32_t, N> a,
+ Vec128<uint32_t, N> b) {
+ return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
+}
+template <size_t N>
+HWY_API Vec128<uint64_t, N> InterleaveUpper(Vec128<uint64_t, N> a,
+ Vec128<uint64_t, N> b) {
+ return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
+}
+
+template <size_t N>
+HWY_API Vec128<int8_t, N> InterleaveUpper(Vec128<int8_t, N> a,
+ Vec128<int8_t, N> b) {
+ return Vec128<int8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
+ 26, 11, 27, 12, 28, 13, 29, 14,
+ 30, 15, 31)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> InterleaveUpper(Vec128<int16_t, N> a,
+ Vec128<int16_t, N> b) {
+ return Vec128<int16_t, N>{
+ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
+}
+template <size_t N>
+HWY_API Vec128<int32_t, N> InterleaveUpper(Vec128<int32_t, N> a,
+ Vec128<int32_t, N> b) {
+ return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
+}
+template <size_t N>
+HWY_API Vec128<int64_t, N> InterleaveUpper(Vec128<int64_t, N> a,
+ Vec128<int64_t, N> b) {
+ return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
+}
+
+template <size_t N>
+HWY_API Vec128<float, N> InterleaveUpper(Vec128<float, N> a,
+ Vec128<float, N> b) {
+ return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
+}
+
+template <size_t N>
+HWY_API Vec128<double, N> InterleaveUpper(Vec128<double, N> a,
+ Vec128<double, N> b) {
+ return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
+}
+
+} // namespace detail
+
+// Full
+template <typename T, class V = Vec128<T>>
+HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
+ return detail::InterleaveUpper(a, b);
+}
+
+// Partial
+template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
+HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
+ const Half<decltype(d)> d2;
+ return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
+}
+
+// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
+
+// Same as Interleave*, except that the return lanes are double-width integers;
+// this is necessary because the single-lane scalar cannot return two values.
+template <class V, class DW = RepartitionToWide<DFromV<V>>>
+HWY_API VFromD<DW> ZipLower(V a, V b) {
+ return BitCast(DW(), InterleaveLower(a, b));
+}
+template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
+HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
+ return BitCast(dw, InterleaveLower(D(), a, b));
+}
+
+template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
+HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
+ return BitCast(dw, InterleaveUpper(D(), a, b));
+}
+
+// ================================================== COMBINE
+
+// ------------------------------ Combine (InterleaveLower)
+
+// N = N/2 + N/2 (upper half undefined)
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
+ Vec128<T, N / 2> lo_half) {
+ const Half<decltype(d)> d2;
+ const RebindToUnsigned<decltype(d2)> du2;
+ // Treat half-width input as one lane, and expand to two lanes.
+ using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
+ const VU lo{BitCast(du2, lo_half).raw};
+ const VU hi{BitCast(du2, hi_half).raw};
+ return BitCast(d, InterleaveLower(lo, hi));
+}
+
+// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
+
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
+ return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
+}
+
+// ------------------------------ ConcatLowerLower
+
+// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
+template <typename T>
+HWY_API Vec128<T> ConcatLowerLower(Full128<T> /* tag */, const Vec128<T> hi,
+ const Vec128<T> lo) {
+ return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
+}
+template <typename T, size_t N, HWY_IF_LE64(T, N)>
+HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> d, const Vec128<T, N> hi,
+ const Vec128<T, N> lo) {
+ const Half<decltype(d)> d2;
+ return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
+}
+
+// ------------------------------ ConcatUpperUpper
+
+template <typename T>
+HWY_API Vec128<T> ConcatUpperUpper(Full128<T> /* tag */, const Vec128<T> hi,
+ const Vec128<T> lo) {
+ return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
+}
+template <typename T, size_t N, HWY_IF_LE64(T, N)>
+HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
+ const Vec128<T, N> lo) {
+ const Half<decltype(d)> d2;
+ return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
+}
+
+// ------------------------------ ConcatLowerUpper
+
+template <typename T>
+HWY_API Vec128<T> ConcatLowerUpper(Full128<T> d, const Vec128<T> hi,
+ const Vec128<T> lo) {
+ return CombineShiftRightBytes<8>(d, hi, lo);
+}
+template <typename T, size_t N, HWY_IF_LE64(T, N)>
+HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
+ const Vec128<T, N> lo) {
+ const Half<decltype(d)> d2;
+ return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
+}
+
+// ------------------------------ ConcatUpperLower
+template <typename T, size_t N>
+HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> d, const Vec128<T, N> hi,
+ const Vec128<T, N> lo) {
+ return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
+}
+
+// ------------------------------ ConcatOdd
+
+// 8-bit full
+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15,
+ 17, 19, 21, 23, 25, 27, 29, 31)};
+}
+
+// 8-bit x8
+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, 8> ConcatOdd(Simd<T, 8, 0> /* tag */, Vec128<T, 8> hi,
+ Vec128<T, 8> lo) {
+ // Don't care about upper half.
+ return Vec128<T, 8>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 17, 19, 21,
+ 23, 1, 3, 5, 7, 17, 19, 21, 23)};
+}
+
+// 8-bit x4
+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, 4> ConcatOdd(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
+ Vec128<T, 4> lo) {
+ // Don't care about upper 3/4.
+ return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17,
+ 19, 1, 3, 17, 19, 1, 3, 17, 19)};
+}
+
+// 16-bit full
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
+ return Vec128<T>{
+ wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)};
+}
+
+// 16-bit x4
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T, 4> ConcatOdd(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
+ Vec128<T, 4> lo) {
+ // Don't care about upper half.
+ return Vec128<T, 4>{
+ wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)};
+}
+
+// 32-bit full
+template <typename T, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
+ return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
+}
+
+// Any T x2
+template <typename T>
+HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2, 0> d, Vec128<T, 2> hi,
+ Vec128<T, 2> lo) {
+ return InterleaveUpper(d, lo, hi);
+}
+
+// ------------------------------ ConcatEven (InterleaveLower)
+
+// 8-bit full
+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30)};
+}
+
+// 8-bit x8
+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, 8> ConcatEven(Simd<T, 8, 0> /* tag */, Vec128<T, 8> hi,
+ Vec128<T, 8> lo) {
+ // Don't care about upper half.
+ return Vec128<T, 8>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 16, 18, 20,
+ 22, 0, 2, 4, 6, 16, 18, 20, 22)};
+}
+
+// 8-bit x4
+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, 4> ConcatEven(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
+ Vec128<T, 4> lo) {
+ // Don't care about upper 3/4.
+ return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16,
+ 18, 0, 2, 16, 18, 0, 2, 16, 18)};
+}
+
+// 16-bit full
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
+ return Vec128<T>{
+ wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)};
+}
+
+// 16-bit x4
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec128<T, 4> ConcatEven(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
+ Vec128<T, 4> lo) {
+ // Don't care about upper half.
+ return Vec128<T, 4>{
+ wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)};
+}
+
+// 32-bit full
+template <typename T, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
+ return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
+}
+
+// Any T x2
+template <typename T>
+HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2, 0> d, Vec128<T, 2> hi,
+ Vec128<T, 2> lo) {
+ return InterleaveLower(d, lo, hi);
+}
+
+// ------------------------------ DupEven (InterleaveLower)
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
+ return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)};
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
+ return InterleaveLower(DFromV<decltype(v)>(), v, v);
+}
+
+// ------------------------------ DupOdd (InterleaveUpper)
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
+ return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)};
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
+ return InterleaveUpper(DFromV<decltype(v)>(), v, v);
+}
+
+// ------------------------------ OddEven
+
+namespace detail {
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a,
+ const Vec128<T, N> b) {
+ const DFromV<decltype(a)> d;
+ const Repartition<uint8_t, decltype(d)> d8;
+ alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
+ 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
+ return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
+}
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a,
+ const Vec128<T, N> b) {
+ return Vec128<T, N>{
+ wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
+}
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a,
+ const Vec128<T, N> b) {
+ return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
+}
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a,
+ const Vec128<T, N> b) {
+ return Vec128<T, N>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)};
+}
+
+} // namespace detail
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
+ return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
+}
+template <size_t N>
+HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
+ const Vec128<float, N> b) {
+ return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
+}
+
+// ------------------------------ OddEvenBlocks
+template <typename T, size_t N>
+HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
+ return even;
+}
+
+// ------------------------------ SwapAdjacentBlocks
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
+ return v;
+}
+
+// ------------------------------ ReverseBlocks
+
+// Single block: no change
+template <typename T>
+HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
+ return v;
+}
+
+// ================================================== CONVERT
+
+// ------------------------------ Promotions (part w/ narrow lanes -> full)
+
+// Unsigned: zero-extend.
+template <size_t N, HWY_IF_LE128(uint16_t, N)>
+HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
+ const Vec128<uint8_t, N> v) {
+ return Vec128<uint16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
+}
+template <size_t N, HWY_IF_LE128(uint32_t, N)>
+HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
+ const Vec128<uint8_t, N> v) {
+ return Vec128<uint32_t, N>{
+ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
+}
+template <size_t N, HWY_IF_LE128(int16_t, N)>
+HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
+ const Vec128<uint8_t, N> v) {
+ return Vec128<int16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
+}
+template <size_t N, HWY_IF_LE128(int32_t, N)>
+HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
+ const Vec128<uint8_t, N> v) {
+ return Vec128<int32_t, N>{
+ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
+}
+template <size_t N, HWY_IF_LE128(uint32_t, N)>
+HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
+ const Vec128<uint16_t, N> v) {
+ return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
+}
+template <size_t N, HWY_IF_LE128(uint64_t, N)>
+HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
+ const Vec128<uint32_t, N> v) {
+ return Vec128<uint64_t, N>{wasm_u64x2_extend_low_u32x4(v.raw)};
+}
+
+template <size_t N, HWY_IF_LE128(int32_t, N)>
+HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
+ const Vec128<uint16_t, N> v) {
+ return Vec128<int32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
+}
+
+// Signed: replicate sign bit.
+template <size_t N, HWY_IF_LE128(int16_t, N)>
+HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
+ const Vec128<int8_t, N> v) {
+ return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(v.raw)};
+}
+template <size_t N, HWY_IF_LE128(int32_t, N)>
+HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
+ const Vec128<int8_t, N> v) {
+ return Vec128<int32_t, N>{
+ wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
+}
+template <size_t N, HWY_IF_LE128(int32_t, N)>
+HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
+ const Vec128<int16_t, N> v) {
+ return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(v.raw)};
+}
+template <size_t N, HWY_IF_LE128(int64_t, N)>
+HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
+ const Vec128<int32_t, N> v) {
+ return Vec128<int64_t, N>{wasm_i64x2_extend_low_i32x4(v.raw)};
+}
+
+template <size_t N, HWY_IF_LE128(double, N)>
+HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
+ const Vec128<int32_t, N> v) {
+ return Vec128<double, N>{wasm_f64x2_convert_low_i32x4(v.raw)};
+}
+
+template <size_t N, HWY_IF_LE128(float, N)>
+HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
+ const Vec128<float16_t, N> v) {
+ const RebindToSigned<decltype(df32)> di32;
+ const RebindToUnsigned<decltype(df32)> du32;
+ // Expand to u32 so we can shift.
+ const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
+ const auto sign = ShiftRight<15>(bits16);
+ const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
+ const auto mantissa = bits16 & Set(du32, 0x3FF);
+ const auto subnormal =
+ BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
+ Set(df32, 1.0f / 16384 / 1024));
+
+ const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
+ const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
+ const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
+ const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
+ return BitCast(df32, ShiftLeft<31>(sign) | bits32);
+}
+
+template <size_t N, HWY_IF_LE128(float, N)>
+HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
+ const Vec128<bfloat16_t, N> v) {
+ const Rebind<uint16_t, decltype(df32)> du16;
+ const RebindToSigned<decltype(df32)> di32;
+ return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
+}
+
+// ------------------------------ Demotions (full -> part w/ narrow lanes)
+
+template <size_t N>
+HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N, 0> /* tag */,
+ const Vec128<int32_t, N> v) {
+ return Vec128<uint16_t, N>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)};
+}
+
+template <size_t N>
+HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N, 0> /* tag */,
+ const Vec128<int32_t, N> v) {
+ return Vec128<int16_t, N>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)};
+}
+
+template <size_t N>
+HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
+ const Vec128<int32_t, N> v) {
+ const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
+ return Vec128<uint8_t, N>{
+ wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
+}
+
+template <size_t N>
+HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
+ const Vec128<int16_t, N> v) {
+ return Vec128<uint8_t, N>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)};
+}
+
+template <size_t N>
+HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
+ const Vec128<int32_t, N> v) {
+ const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
+ return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
+}
+
+template <size_t N>
+HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
+ const Vec128<int16_t, N> v) {
+ return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)};
+}
+
+template <size_t N>
+HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* di */,
+ const Vec128<double, N> v) {
+ return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
+}
+
+template <size_t N>
+HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> df16,
+ const Vec128<float, N> v) {
+ const RebindToUnsigned<decltype(df16)> du16;
+ const Rebind<uint32_t, decltype(du16)> du;
+ const RebindToSigned<decltype(du)> di;
+ const auto bits32 = BitCast(du, v);
+ const auto sign = ShiftRight<31>(bits32);
+ const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
+ const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
+
+ const auto k15 = Set(di, 15);
+ const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
+ const auto is_tiny = exp < Set(di, -24);
+
+ const auto is_subnormal = exp < Set(di, -14);
+ const auto biased_exp16 =
+ BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
+ const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
+ const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
+ (mantissa32 >> (Set(du, 13) + sub_exp));
+ const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
+ ShiftRight<13>(mantissa32)); // <1024
+
+ const auto sign16 = ShiftLeft<15>(sign);
+ const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
+ const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
+ return Vec128<float16_t, N>{DemoteTo(du16, bits16).raw};
+}
+
+template <size_t N>
+HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> dbf16,
+ const Vec128<float, N> v) {
+ const Rebind<int32_t, decltype(dbf16)> di32;
+ const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
+ const Rebind<uint16_t, decltype(dbf16)> du16;
+ const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
+ return BitCast(dbf16, DemoteTo(du16, bits_in_32));
+}
+
+template <size_t N>
+HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
+ Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
+ const RebindToUnsigned<decltype(dbf16)> du16;
+ const Repartition<uint32_t, decltype(dbf16)> du32;
+ const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
+ const auto u16 = OddEven(BitCast(du16, a), BitCast(du16, b_in_even));
+ return BitCast(dbf16, u16);
+}
+
+// Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes
+// above 2*N.
+HWY_API Vec128<int16_t, 2> ReorderDemote2To(Simd<int16_t, 2, 0> dn,
+ Vec128<int32_t, 1> a,
+ Vec128<int32_t, 1> b) {
+ const Half<decltype(dn)> dnh;
+ // Pretend the result has twice as many lanes so we can InterleaveLower.
+ const Vec128<int16_t, 2> an{DemoteTo(dnh, a).raw};
+ const Vec128<int16_t, 2> bn{DemoteTo(dnh, b).raw};
+ return InterleaveLower(an, bn);
+}
+HWY_API Vec128<int16_t, 4> ReorderDemote2To(Simd<int16_t, 4, 0> dn,
+ Vec128<int32_t, 2> a,
+ Vec128<int32_t, 2> b) {
+ const Half<decltype(dn)> dnh;
+ // Pretend the result has twice as many lanes so we can InterleaveLower.
+ const Vec128<int16_t, 4> an{DemoteTo(dnh, a).raw};
+ const Vec128<int16_t, 4> bn{DemoteTo(dnh, b).raw};
+ return InterleaveLower(an, bn);
+}
+HWY_API Vec128<int16_t> ReorderDemote2To(Full128<int16_t> /*d16*/,
+ Vec128<int32_t> a, Vec128<int32_t> b) {
+ return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(a.raw, b.raw)};
+}
+
+// For already range-limited input [0, 255].
+template <size_t N>
+HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
+ const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
+ return Vec128<uint8_t, N>{
+ wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
+}
+
+// ------------------------------ Truncations
+
+template <typename From, typename To, HWY_IF_UNSIGNED(From),
+ HWY_IF_UNSIGNED(To),
+ hwy::EnableIf<(sizeof(To) < sizeof(From))>* = nullptr>
+HWY_API Vec128<To, 1> TruncateTo(Simd<To, 1, 0> /* tag */,
+ const Vec128<From, 1> v) {
+ const Repartition<To, DFromV<decltype(v)>> d;
+ const auto v1 = BitCast(d, v);
+ return Vec128<To, 1>{v1.raw};
+}
+
+HWY_API Vec16<uint8_t> TruncateTo(Full16<uint8_t> /* tag */,
+ const Vec128<uint64_t> v) {
+ const Full128<uint8_t> d;
+ const auto v1 = BitCast(d, v);
+ const auto v2 = ConcatEven(d, v1, v1);
+ const auto v4 = ConcatEven(d, v2, v2);
+ return LowerHalf(LowerHalf(LowerHalf(ConcatEven(d, v4, v4))));
+}
+
+HWY_API Vec32<uint16_t> TruncateTo(Full32<uint16_t> /* tag */,
+ const Vec128<uint64_t> v) {
+ const Full128<uint16_t> d;
+ const auto v1 = BitCast(d, v);
+ const auto v2 = ConcatEven(d, v1, v1);
+ return LowerHalf(LowerHalf(ConcatEven(d, v2, v2)));
+}
+
+HWY_API Vec64<uint32_t> TruncateTo(Full64<uint32_t> /* tag */,
+ const Vec128<uint64_t> v) {
+ const Full128<uint32_t> d;
+ const auto v1 = BitCast(d, v);
+ return LowerHalf(ConcatEven(d, v1, v1));
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+ const Vec128<uint32_t, N> v) {
+ const Full128<uint8_t> d;
+ const auto v1 = Vec128<uint8_t>{v.raw};
+ const auto v2 = ConcatEven(d, v1, v1);
+ const auto v3 = ConcatEven(d, v2, v2);
+ return Vec128<uint8_t, N>{v3.raw};
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
+ const Vec128<uint32_t, N> v) {
+ const Full128<uint16_t> d;
+ const auto v1 = Vec128<uint16_t>{v.raw};
+ const auto v2 = ConcatEven(d, v1, v1);
+ return Vec128<uint16_t, N>{v2.raw};
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+ const Vec128<uint16_t, N> v) {
+ const Full128<uint8_t> d;
+ const auto v1 = Vec128<uint8_t>{v.raw};
+ const auto v2 = ConcatEven(d, v1, v1);
+ return Vec128<uint8_t, N>{v2.raw};
+}
+
+// ------------------------------ Convert i32 <=> f32 (Round)
+
+template <size_t N>
+HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
+ const Vec128<int32_t, N> v) {
+ return Vec128<float, N>{wasm_f32x4_convert_i32x4(v.raw)};
+}
+template <size_t N>
+HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
+ const Vec128<uint32_t, N> v) {
+ return Vec128<float, N>{wasm_f32x4_convert_u32x4(v.raw)};
+}
+// Truncates (rounds toward zero).
+template <size_t N>
+HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N, 0> /* tag */,
+ const Vec128<float, N> v) {
+ return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
+}
+
+template <size_t N>
+HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
+ return ConvertTo(Simd<int32_t, N, 0>(), Round(v));
+}
+
+// ================================================== MISC
+
+// ------------------------------ SumsOf8 (ShiftRight, Add)
+template <size_t N>
+HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
+ const DFromV<decltype(v)> du8;
+ const RepartitionToWide<decltype(du8)> du16;
+ const RepartitionToWide<decltype(du16)> du32;
+ const RepartitionToWide<decltype(du32)> du64;
+ using VU16 = VFromD<decltype(du16)>;
+
+ const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
+ const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF));
+ const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
+
+ const VU16 szz_FE_zz_BA_zz_76_zz_32 =
+ BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
+ const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
+ Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
+ const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
+ BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
+ const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
+ Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
+ return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF));
+}
+
+// ------------------------------ LoadMaskBits (TestBit)
+
+namespace detail {
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
+HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
+ const RebindToUnsigned<decltype(d)> du;
+ // Easier than Set(), which would require an >8-bit type, which would not
+ // compile for T=uint8_t, N=1.
+ const Vec128<T, N> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};
+
+ // Replicate bytes 8x such that each byte contains the bit that governs it.
+ alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1};
+ const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
+
+ alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
+ 1, 2, 4, 8, 16, 32, 64, 128};
+ return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
+ const RebindToUnsigned<decltype(d)> du;
+ alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
+ return RebindMask(
+ d, TestBit(Set(du, static_cast<uint16_t>(bits)), Load(du, kBit)));
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
+ const RebindToUnsigned<decltype(d)> du;
+ alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
+ return RebindMask(
+ d, TestBit(Set(du, static_cast<uint32_t>(bits)), Load(du, kBit)));
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
+ const RebindToUnsigned<decltype(d)> du;
+ alignas(16) constexpr uint64_t kBit[8] = {1, 2};
+ return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
+}
+
+} // namespace detail
+
+// `p` points to at least 8 readable bytes, not all of which need be valid.
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
+ const uint8_t* HWY_RESTRICT bits) {
+ uint64_t mask_bits = 0;
+ CopyBytes<(N + 7) / 8>(bits, &mask_bits);
+ return detail::LoadMaskBits(d, mask_bits);
+}
+
+// ------------------------------ Mask
+
+namespace detail {
+
+// Full
+template <typename T>
+HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
+ const Mask128<T> mask) {
+ alignas(16) uint64_t lanes[2];
+ wasm_v128_store(lanes, mask.raw);
+
+ constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
+ const uint64_t lo = ((lanes[0] * kMagic) >> 56);
+ const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
+ return (hi + lo);
+}
+
+// 64-bit
+template <typename T>
+HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
+ const Mask128<T, 8> mask) {
+ constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
+ return (static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0)) *
+ kMagic) >>
+ 56;
+}
+
+// 32-bit or less: need masking
+template <typename T, size_t N, HWY_IF_LE32(T, N)>
+HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
+ const Mask128<T, N> mask) {
+ uint64_t bytes = static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0));
+ // Clear potentially undefined bytes.
+ bytes &= (1ULL << (N * 8)) - 1;
+ constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
+ return (bytes * kMagic) >> 56;
+}
+
+template <typename T, size_t N>
+HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
+ const Mask128<T, N> mask) {
+ // Remove useless lower half of each u16 while preserving the sign bit.
+ const __i16x8 zero = wasm_i16x8_splat(0);
+ const Mask128<uint8_t, N> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
+ return BitsFromMask(hwy::SizeTag<1>(), mask8);
+}
+
+template <typename T, size_t N>
+HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
+ const Mask128<T, N> mask) {
+ const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
+ const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
+ const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
+ alignas(16) uint32_t lanes[4];
+ wasm_v128_store(lanes, sliced_mask);
+ return lanes[0] | lanes[1] | lanes[2] | lanes[3];
+}
+
+template <typename T, size_t N>
+HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
+ const Mask128<T, N> mask) {
+ const __i64x2 mask_i = static_cast<__i64x2>(mask.raw);
+ const __i64x2 slice = wasm_i64x2_make(1, 2);
+ const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
+ alignas(16) uint64_t lanes[2];
+ wasm_v128_store(lanes, sliced_mask);
+ return lanes[0] | lanes[1];
+}
+
+// Returns the lowest N bits for the BitsFromMask result.
+template <typename T, size_t N>
+constexpr uint64_t OnlyActive(uint64_t bits) {
+ return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1);
+}
+
+// Returns 0xFF for bytes with index >= N, otherwise 0.
+template <size_t N>
+constexpr __i8x16 BytesAbove() {
+ return /**/
+ (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
+ : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
+ : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
+ : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
+ : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
+ : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
+ : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
+ : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
+ : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
+ : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1)
+ : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1)
+ : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1)
+ : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1)
+ : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
+ -1, -1, -1)
+ : (N == 11)
+ ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
+ : (N == 13)
+ ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
+ : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
+}
+
+template <typename T, size_t N>
+HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
+ return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
+}
+
+template <typename T>
+HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128<T> m) {
+ return PopCount(BitsFromMask(tag, m));
+}
+
+template <typename T>
+HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128<T> m) {
+ return PopCount(BitsFromMask(tag, m));
+}
+
+template <typename T>
+HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
+ const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
+ const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
+ alignas(16) uint64_t lanes[2];
+ wasm_v128_store(lanes, shifted_bits);
+ return PopCount(lanes[0] | lanes[1]);
+}
+
+template <typename T>
+HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
+ alignas(16) int64_t lanes[2];
+ wasm_v128_store(lanes, m.raw);
+ return static_cast<size_t>(-(lanes[0] + lanes[1]));
+}
+
+} // namespace detail
+
+// `p` points to at least 8 writable bytes.
+template <typename T, size_t N>
+HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
+ const Mask128<T, N> mask, uint8_t* bits) {
+ const uint64_t mask_bits = detail::BitsFromMask(mask);
+ const size_t kNumBytes = (N + 7) / 8;
+ CopyBytes<kNumBytes>(&mask_bits, bits);
+ return kNumBytes;
+}
+
+template <typename T, size_t N>
+HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */, const Mask128<T> m) {
+ return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), m);
+}
+
+// Partial vector
+template <typename T, size_t N, HWY_IF_LE64(T, N)>
+HWY_API size_t CountTrue(const Simd<T, N, 0> d, const Mask128<T, N> m) {
+ // Ensure all undefined bytes are 0.
+ const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
+ return CountTrue(d, Mask128<T>{AndNot(mask, m).raw});
+}
+
+// Full vector
+template <typename T>
+HWY_API bool AllFalse(const Full128<T> d, const Mask128<T> m) {
+#if 0
+ // Casting followed by wasm_i8x16_any_true results in wasm error:
+ // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128
+ const auto v8 = BitCast(Full128<int8_t>(), VecFromMask(d, m));
+ return !wasm_i8x16_any_true(v8.raw);
+#else
+ (void)d;
+ return (wasm_i64x2_extract_lane(m.raw, 0) |
+ wasm_i64x2_extract_lane(m.raw, 1)) == 0;
+#endif
+}
+
+// Full vector
+namespace detail {
+template <typename T>
+HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
+ return wasm_i8x16_all_true(m.raw);
+}
+template <typename T>
+HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
+ return wasm_i16x8_all_true(m.raw);
+}
+template <typename T>
+HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
+ return wasm_i32x4_all_true(m.raw);
+}
+template <typename T>
+HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
+ return wasm_i64x2_all_true(m.raw);
+}
+
+} // namespace detail
+
+template <typename T, size_t N>
+HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T> m) {
+ return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m);
+}
+
+// Partial vectors
+
+template <typename T, size_t N, HWY_IF_LE64(T, N)>
+HWY_API bool AllFalse(Simd<T, N, 0> /* tag */, const Mask128<T, N> m) {
+ // Ensure all undefined bytes are 0.
+ const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
+ return AllFalse(Full128<T>(), Mask128<T>{AndNot(mask, m).raw});
+}
+
+template <typename T, size_t N, HWY_IF_LE64(T, N)>
+HWY_API bool AllTrue(const Simd<T, N, 0> /* d */, const Mask128<T, N> m) {
+ // Ensure all undefined bytes are FF.
+ const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
+ return AllTrue(Full128<T>(), Mask128<T>{Or(mask, m).raw});
+}
+
+template <typename T, size_t N>
+HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> /* tag */,
+ const Mask128<T, N> mask) {
+ const uint64_t bits = detail::BitsFromMask(mask);
+ return Num0BitsBelowLS1Bit_Nonzero64(bits);
+}
+
+template <typename T, size_t N>
+HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
+ const Mask128<T, N> mask) {
+ const uint64_t bits = detail::BitsFromMask(mask);
+ return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(bits)) : -1;
+}
+
+// ------------------------------ Compress
+
+namespace detail {
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 256);
+ const Simd<T, N, 0> d;
+ const Rebind<uint8_t, decltype(d)> d8;
+ const Simd<uint16_t, N, 0> du;
+
+ // We need byte indices for TableLookupBytes (one vector's worth for each of
+ // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
+ // can instead store lane indices and convert to byte indices (2*lane + 0..1),
+ // with the doubling baked into the table. Unpacking nibbles is likely more
+ // costly than the higher cache footprint from storing bytes.
+ alignas(16) constexpr uint8_t table[256 * 8] = {
+ // PrintCompress16x8Tables
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
+ 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
+ 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, //
+ 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
+ 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, //
+ 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, //
+ 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, //
+ 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
+ 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, //
+ 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, //
+ 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, //
+ 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, //
+ 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, //
+ 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, //
+ 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, //
+ 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
+ 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, //
+ 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, //
+ 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, //
+ 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, //
+ 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, //
+ 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, //
+ 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, //
+ 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, //
+ 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, //
+ 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, //
+ 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, //
+ 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, //
+ 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, //
+ 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, //
+ 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, //
+ 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
+ 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, //
+ 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, //
+ 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, //
+ 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, //
+ 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, //
+ 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, //
+ 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, //
+ 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, //
+ 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, //
+ 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, //
+ 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, //
+ 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, //
+ 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, //
+ 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, //
+ 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, //
+ 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, //
+ 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, //
+ 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, //
+ 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, //
+ 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, //
+ 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, //
+ 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, //
+ 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, //
+ 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, //
+ 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, //
+ 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, //
+ 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, //
+ 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, //
+ 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, //
+ 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, //
+ 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, //
+ 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
+ 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, //
+ 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, //
+ 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, //
+ 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, //
+ 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, //
+ 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, //
+ 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, //
+ 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, //
+ 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, //
+ 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, //
+ 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, //
+ 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, //
+ 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, //
+ 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, //
+ 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, //
+ 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, //
+ 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, //
+ 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, //
+ 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, //
+ 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, //
+ 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, //
+ 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, //
+ 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, //
+ 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, //
+ 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, //
+ 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, //
+ 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, //
+ 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, //
+ 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, //
+ 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, //
+ 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, //
+ 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, //
+ 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, //
+ 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, //
+ 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, //
+ 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, //
+ 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, //
+ 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, //
+ 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, //
+ 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, //
+ 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, //
+ 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, //
+ 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, //
+ 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, //
+ 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, //
+ 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, //
+ 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, //
+ 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, //
+ 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, //
+ 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, //
+ 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, //
+ 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, //
+ 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, //
+ 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, //
+ 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, //
+ 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, //
+ 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, //
+ 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, //
+ 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, //
+ 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, //
+ 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, //
+ 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, //
+ 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, //
+ 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14};
+
+ const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
+ const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
+ return BitCast(d, pairs + Set(du, 0x0100));
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 256);
+ const Simd<T, N, 0> d;
+ const Rebind<uint8_t, decltype(d)> d8;
+ const Simd<uint16_t, N, 0> du;
+
+ // We need byte indices for TableLookupBytes (one vector's worth for each of
+ // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
+ // can instead store lane indices and convert to byte indices (2*lane + 0..1),
+ // with the doubling baked into the table. Unpacking nibbles is likely more
+ // costly than the higher cache footprint from storing bytes.
+ alignas(16) constexpr uint8_t table[256 * 8] = {
+ // PrintCompressNot16x8Tables
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, //
+ 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, //
+ 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, //
+ 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, //
+ 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, //
+ 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, //
+ 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, //
+ 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, //
+ 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, //
+ 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, //
+ 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, //
+ 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, //
+ 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, //
+ 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, //
+ 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, //
+ 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, //
+ 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, //
+ 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, //
+ 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, //
+ 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, //
+ 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, //
+ 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, //
+ 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, //
+ 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, //
+ 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, //
+ 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, //
+ 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, //
+ 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, //
+ 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, //
+ 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, //
+ 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, //
+ 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, //
+ 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, //
+ 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, //
+ 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, //
+ 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, //
+ 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, //
+ 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, //
+ 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, //
+ 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, //
+ 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, //
+ 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, //
+ 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, //
+ 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, //
+ 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, //
+ 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, //
+ 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, //
+ 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, //
+ 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, //
+ 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, //
+ 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, //
+ 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, //
+ 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, //
+ 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, //
+ 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, //
+ 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, //
+ 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, //
+ 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, //
+ 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, //
+ 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, //
+ 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, //
+ 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, //
+ 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, //
+ 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, //
+ 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, //
+ 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, //
+ 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, //
+ 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, //
+ 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, //
+ 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, //
+ 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, //
+ 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, //
+ 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, //
+ 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, //
+ 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, //
+ 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, //
+ 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, //
+ 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, //
+ 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, //
+ 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, //
+ 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, //
+ 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, //
+ 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, //
+ 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, //
+ 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, //
+ 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, //
+ 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, //
+ 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, //
+ 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, //
+ 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, //
+ 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, //
+ 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, //
+ 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, //
+ 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, //
+ 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, //
+ 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, //
+ 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, //
+ 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, //
+ 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, //
+ 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, //
+ 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, //
+ 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, //
+ 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, //
+ 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, //
+ 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, //
+ 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, //
+ 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, //
+ 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, //
+ 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, //
+ 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, //
+ 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, //
+ 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, //
+ 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, //
+ 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, //
+ 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, //
+ 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, //
+ 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, //
+ 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, //
+ 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, //
+ 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, //
+ 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14};
+
+ const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
+ const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
+ return BitCast(d, pairs + Set(du, 0x0100));
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 16);
+
+ // There are only 4 lanes, so we can afford to load the index vector directly.
+ alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
+ // PrintCompress32x4Tables
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
+ 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
+ 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
+ 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
+ 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
+ 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
+ 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
+ 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
+ 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
+ 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
+ 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+ const Simd<T, N, 0> d;
+ const Repartition<uint8_t, decltype(d)> d8;
+ return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 16);
+
+ // There are only 4 lanes, so we can afford to load the index vector directly.
+ alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
+ // PrintCompressNot32x4Tables
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
+ 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
+ 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
+ 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
+ 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
+ 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
+ 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15};
+ const Simd<T, N, 0> d;
+ const Repartition<uint8_t, decltype(d)> d8;
+ return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 4);
+
+ // There are only 2 lanes, so we can afford to load the index vector directly.
+ alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
+ // PrintCompress64x2Tables
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+ const Simd<T, N, 0> d;
+ const Repartition<uint8_t, decltype(d)> d8;
+ return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 4);
+
+ // There are only 2 lanes, so we can afford to load the index vector directly.
+ alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
+ // PrintCompressNot64x2Tables
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+ const Simd<T, N, 0> d;
+ const Repartition<uint8_t, decltype(d)> d8;
+ return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
+}
+
+// Helper functions called by both Compress and CompressStore - avoids a
+// redundant BitsFromMask in the latter.
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
+ const auto idx = detail::IdxFromBits<T, N>(mask_bits);
+ const DFromV<decltype(v)> d;
+ const RebindToSigned<decltype(d)> di;
+ return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
+}
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
+ const auto idx = detail::IdxFromNotBits<T, N>(mask_bits);
+ const DFromV<decltype(v)> d;
+ const RebindToSigned<decltype(d)> di;
+ return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
+}
+
+} // namespace detail
+
+template <typename T>
+struct CompressIsPartition {
+#if HWY_TARGET == HWY_WASM_EMU256
+ enum { value = 0 };
+#else
+ enum { value = (sizeof(T) != 1) };
+#endif
+};
+
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+ return v;
+}
+
+// Two lanes: conditional swap
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
+ // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
+ const Full128<T> d;
+ const Vec128<T> m = VecFromMask(d, mask);
+ const Vec128<T> maskL = DupEven(m);
+ const Vec128<T> maskH = DupOdd(m);
+ const Vec128<T> swap = AndNot(maskL, maskH);
+ return IfVecThenElse(swap, Shuffle01(v), v);
+}
+
+// General case, 2 or 4 byte lanes
+template <typename T, size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
+HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
+ return detail::Compress(v, detail::BitsFromMask(mask));
+}
+
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+ return v;
+}
+
+// Two lanes: conditional swap
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
+ // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
+ const Full128<T> d;
+ const Vec128<T> m = VecFromMask(d, mask);
+ const Vec128<T> maskL = DupEven(m);
+ const Vec128<T> maskH = DupOdd(m);
+ const Vec128<T> swap = AndNot(maskH, maskL);
+ return IfVecThenElse(swap, Shuffle01(v), v);
+}
+
+// General case, 2 or 4 byte lanes
+template <typename T, size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
+HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
+ // For partial vectors, we cannot pull the Not() into the table because
+ // BitsFromMask clears the upper bits.
+ if (N < 16 / sizeof(T)) {
+ return detail::Compress(v, detail::BitsFromMask(Not(mask)));
+ }
+ return detail::CompressNot(v, detail::BitsFromMask(mask));
+}
+
+// ------------------------------ CompressBlocksNot
+HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
+ Mask128<uint64_t> /* m */) {
+ return v;
+}
+
+// ------------------------------ CompressBits
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
+ const uint8_t* HWY_RESTRICT bits) {
+ uint64_t mask_bits = 0;
+ constexpr size_t kNumBytes = (N + 7) / 8;
+ CopyBytes<kNumBytes>(bits, &mask_bits);
+ if (N < 8) {
+ mask_bits &= (1ull << N) - 1;
+ }
+
+ return detail::Compress(v, mask_bits);
+}
+
+// ------------------------------ CompressStore
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
+HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
+ Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
+ const uint64_t mask_bits = detail::BitsFromMask(mask);
+ const auto c = detail::Compress(v, mask_bits);
+ StoreU(c, d, unaligned);
+ return PopCount(mask_bits);
+}
+
+// ------------------------------ CompressBlendedStore
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
+HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
+ Simd<T, N, 0> d,
+ T* HWY_RESTRICT unaligned) {
+ const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
+ using TU = TFromD<decltype(du)>;
+ const uint64_t mask_bits = detail::BitsFromMask(m);
+ const size_t count = PopCount(mask_bits);
+ const Vec128<TU, N> compressed = detail::Compress(BitCast(du, v), mask_bits);
+ const Mask128<T, N> store_mask = RebindMask(d, FirstN(du, count));
+ BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
+ return count;
+}
+
+// ------------------------------ CompressBitsStore
+
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
+HWY_API size_t CompressBitsStore(Vec128<T, N> v,
+ const uint8_t* HWY_RESTRICT bits,
+ Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
+ uint64_t mask_bits = 0;
+ constexpr size_t kNumBytes = (N + 7) / 8;
+ CopyBytes<kNumBytes>(bits, &mask_bits);
+ if (N < 8) {
+ mask_bits &= (1ull << N) - 1;
+ }
+
+ const auto c = detail::Compress(v, mask_bits);
+ StoreU(c, d, unaligned);
+ return PopCount(mask_bits);
+}
+
+// ------------------------------ StoreInterleaved2/3/4
+
+// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
+// generic_ops-inl.h.
+
+// ------------------------------ MulEven/Odd (Load)
+
+HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
+ const Vec128<uint64_t> b) {
+ alignas(16) uint64_t mul[2];
+ mul[0] =
+ Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
+ static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
+ return Load(Full128<uint64_t>(), mul);
+}
+
+HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
+ const Vec128<uint64_t> b) {
+ alignas(16) uint64_t mul[2];
+ mul[0] =
+ Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
+ static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
+ return Load(Full128<uint64_t>(), mul);
+}
+
+// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
+
+template <size_t N>
+HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
+ Vec128<bfloat16_t, 2 * N> a,
+ Vec128<bfloat16_t, 2 * N> b,
+ const Vec128<float, N> sum0,
+ Vec128<float, N>& sum1) {
+ const Rebind<uint32_t, decltype(df32)> du32;
+ using VU32 = VFromD<decltype(du32)>;
+ const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
+ // Using shift/and instead of Zip leads to the odd/even order that
+ // RearrangeToOddPlusEven prefers.
+ const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
+ const VU32 ao = And(BitCast(du32, a), odd);
+ const VU32 be = ShiftLeft<16>(BitCast(du32, b));
+ const VU32 bo = And(BitCast(du32, b), odd);
+ sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
+ return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
+}
+
+// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
+// safe.
+template <size_t N>
+HWY_API Vec128<int32_t, N> ReorderWidenMulAccumulate(
+ Simd<int32_t, N, 0> /*d32*/, Vec128<int16_t, 2 * N> a,
+ Vec128<int16_t, 2 * N> b, const Vec128<int32_t, N> sum0,
+ Vec128<int32_t, N>& /*sum1*/) {
+ return sum0 + Vec128<int32_t, N>{wasm_i32x4_dot_i16x8(a.raw, b.raw)};
+}
+
+// ------------------------------ RearrangeToOddPlusEven
+template <size_t N>
+HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(
+ const Vec128<int32_t, N> sum0, const Vec128<int32_t, N> /*sum1*/) {
+ return sum0; // invariant already holds
+}
+
+template <size_t N>
+HWY_API Vec128<float, N> RearrangeToOddPlusEven(const Vec128<float, N> sum0,
+ const Vec128<float, N> sum1) {
+ return Add(sum0, sum1);
+}
+
+// ------------------------------ Reductions
+
+namespace detail {
+
+// N=1 for any T: no-op
+template <typename T>
+HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+ const Vec128<T, 1> v) {
+ return v;
+}
+template <typename T>
+HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+ const Vec128<T, 1> v) {
+ return v;
+}
+template <typename T>
+HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+ const Vec128<T, 1> v) {
+ return v;
+}
+
+// u32/i32/f32:
+
+// N=2
+template <typename T>
+HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<T, 2> v10) {
+ return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
+}
+template <typename T>
+HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<T, 2> v10) {
+ return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
+}
+template <typename T>
+HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<T, 2> v10) {
+ return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
+}
+
+// N=4 (full)
+template <typename T>
+HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<T> v3210) {
+ const Vec128<T> v1032 = Shuffle1032(v3210);
+ const Vec128<T> v31_20_31_20 = v3210 + v1032;
+ const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
+ return v20_31_20_31 + v31_20_31_20;
+}
+template <typename T>
+HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<T> v3210) {
+ const Vec128<T> v1032 = Shuffle1032(v3210);
+ const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
+ const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
+ return Min(v20_31_20_31, v31_20_31_20);
+}
+template <typename T>
+HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<T> v3210) {
+ const Vec128<T> v1032 = Shuffle1032(v3210);
+ const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
+ const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
+ return Max(v20_31_20_31, v31_20_31_20);
+}
+
+// u64/i64/f64:
+
+// N=2 (full)
+template <typename T>
+HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
+ const Vec128<T> v10) {
+ const Vec128<T> v01 = Shuffle01(v10);
+ return v10 + v01;
+}
+template <typename T>
+HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
+ const Vec128<T> v10) {
+ const Vec128<T> v01 = Shuffle01(v10);
+ return Min(v10, v01);
+}
+template <typename T>
+HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
+ const Vec128<T> v10) {
+ const Vec128<T> v01 = Shuffle01(v10);
+ return Max(v10, v01);
+}
+
+template <size_t N, HWY_IF_GE32(uint16_t, N)>
+HWY_API Vec128<uint16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<uint16_t, N> v) {
+ const Simd<uint16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
+}
+template <size_t N, HWY_IF_GE32(int16_t, N)>
+HWY_API Vec128<int16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<int16_t, N> v) {
+ const Simd<int16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
+}
+
+template <size_t N, HWY_IF_GE32(uint16_t, N)>
+HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<uint16_t, N> v) {
+ const Simd<uint16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+template <size_t N, HWY_IF_GE32(int16_t, N)>
+HWY_API Vec128<int16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<int16_t, N> v) {
+ const Simd<int16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+
+template <size_t N, HWY_IF_GE32(uint16_t, N)>
+HWY_API Vec128<uint16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<uint16_t, N> v) {
+ const Simd<uint16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+template <size_t N, HWY_IF_GE32(int16_t, N)>
+HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<int16_t, N> v) {
+ const Simd<int16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+
+} // namespace detail
+
+// Supported for u/i/f 32/64. Returns the same value in each lane.
+template <typename T, size_t N>
+HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
+ return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
+}
+template <typename T, size_t N>
+HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
+ return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
+}
+template <typename T, size_t N>
+HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
+ return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
+}
+
+// ------------------------------ Lt128
+
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
+ Vec128<T, N> b) {
+ static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
+ // Truth table of Eq and Lt for Hi and Lo u64.
+ // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
+ // =H =L cH cL | out = cH | (=H & cL)
+ // 0 0 0 0 | 0
+ // 0 0 0 1 | 0
+ // 0 0 1 0 | 1
+ // 0 0 1 1 | 1
+ // 0 1 0 0 | 0
+ // 0 1 0 1 | 0
+ // 0 1 1 0 | 1
+ // 1 0 0 0 | 0
+ // 1 0 0 1 | 1
+ // 1 1 0 0 | 0
+ const Mask128<T, N> eqHL = Eq(a, b);
+ const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
+ // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
+ // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
+ // comparison result leftwards requires only 4. IfThenElse compiles to the
+ // same code as OrAnd().
+ const Vec128<T, N> ltLx = DupEven(ltHL);
+ const Vec128<T, N> outHx = IfThenElse(eqHL, ltLx, ltHL);
+ return MaskFromVec(DupOdd(outHx));
+}
+
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
+ Vec128<T, N> b) {
+ const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
+ return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
+}
+
+// ------------------------------ Eq128
+
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Eq128(Simd<T, N, 0> d, Vec128<T, N> a,
+ Vec128<T, N> b) {
+ static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
+ const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
+ return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
+}
+
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Eq128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
+ Vec128<T, N> b) {
+ const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
+ return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
+}
+
+// ------------------------------ Ne128
+
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Ne128(Simd<T, N, 0> d, Vec128<T, N> a,
+ Vec128<T, N> b) {
+ static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
+ const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
+ return MaskFromVec(Or(Reverse2(d, neHL), neHL));
+}
+
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Ne128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
+ Vec128<T, N> b) {
+ const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
+ return MaskFromVec(InterleaveUpper(d, neHL, neHL));
+}
+
+// ------------------------------ Min128, Max128 (Lt128)
+
+// Without a native OddEven, it seems infeasible to go faster than Lt128.
+template <class D>
+HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
+ return IfThenElse(Lt128(d, a, b), a, b);
+}
+
+template <class D>
+HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
+ return IfThenElse(Lt128(d, b, a), a, b);
+}
+
+template <class D>
+HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+ return IfThenElse(Lt128Upper(d, a, b), a, b);
+}
+
+template <class D>
+HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+ return IfThenElse(Lt128Upper(d, b, a), a, b);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();