27 files changed, 9189 insertions, 0 deletions
diff --git a/third_party/highway/hwy/tests/arithmetic_test.cc b/third_party/highway/hwy/tests/arithmetic_test.cc
new file mode 100644
index 0000000000..fa533228a0
--- /dev/null
+++ b/third_party/highway/hwy/tests/arithmetic_test.cc
@@ -0,0 +1,499 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/arithmetic_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestPlusMinus {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v2 = Iota(d, T(2));
+    const auto v3 = Iota(d, T(3));
+    const auto v4 = Iota(d, T(4));
+
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      lanes[i] = static_cast<T>((2 + i) + (3 + i));
+    }
+    HWY_ASSERT_VEC_EQ(d, lanes.get(), Add(v2, v3));
+    HWY_ASSERT_VEC_EQ(d, Set(d, 2), Sub(v4, v2));
+
+    for (size_t i = 0; i < N; ++i) {
+      lanes[i] = static_cast<T>((2 + i) + (4 + i));
+    }
+    auto sum = v2;
+    sum = Add(sum, v4);  // sum == 6,8..
+    HWY_ASSERT_VEC_EQ(d, Load(d, lanes.get()), sum);
+
+    sum = Sub(sum, v4);
+    HWY_ASSERT_VEC_EQ(d, v2, sum);
+  }
+};
+
+struct TestPlusMinusOverflow {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v1 = Iota(d, T(1));
+    const auto vMax = Iota(d, LimitsMax<T>());
+    const auto vMin = Iota(d, LimitsMin<T>());
+
+    // Check that no UB triggered.
+    // "assert" here is formal - to avoid compiler dropping calculations
+    HWY_ASSERT_VEC_EQ(d, Add(v1, vMax), Add(vMax, v1));
+    HWY_ASSERT_VEC_EQ(d, Add(vMax, vMax), Add(vMax, vMax));
+    HWY_ASSERT_VEC_EQ(d, Sub(vMin, v1), Sub(vMin, v1));
+    HWY_ASSERT_VEC_EQ(d, Sub(vMin, vMax), Sub(vMin, vMax));
+  }
+};
+
+HWY_NOINLINE void TestAllPlusMinus() {
+  ForAllTypes(ForPartialVectors<TestPlusMinus>());
+  ForIntegerTypes(ForPartialVectors<TestPlusMinusOverflow>());
+}
+
+struct TestUnsignedSaturatingArithmetic {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vi = Iota(d, 1);
+    const auto vm = Set(d, LimitsMax<T>());
+
+    HWY_ASSERT_VEC_EQ(d, Add(v0, v0), SaturatedAdd(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, Add(v0, vi), SaturatedAdd(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, Add(v0, vm), SaturatedAdd(v0, vm));
+    HWY_ASSERT_VEC_EQ(d, vm, SaturatedAdd(vi, vm));
+    HWY_ASSERT_VEC_EQ(d, vm, SaturatedAdd(vm, vm));
+
+    HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(vi, vi));
+    HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(vi, vm));
+    HWY_ASSERT_VEC_EQ(d, Sub(vm, vi), SaturatedSub(vm, vi));
+  }
+};
+
+struct TestSignedSaturatingArithmetic {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vpm = Set(d, LimitsMax<T>());
+    // Ensure all lanes are positive, even if Iota wraps around
+    const auto vi = Or(And(Iota(d, 0), vpm), Set(d, 1));
+    const auto vn = Sub(v0, vi);
+    const auto vnm = Set(d, LimitsMin<T>());
+    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), Gt(vi, v0));
+    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), Lt(vn, v0));
+
+    HWY_ASSERT_VEC_EQ(d, v0, SaturatedAdd(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, vi, SaturatedAdd(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, vpm, SaturatedAdd(v0, vpm));
+    HWY_ASSERT_VEC_EQ(d, vpm, SaturatedAdd(vi, vpm));
+    HWY_ASSERT_VEC_EQ(d, vpm, SaturatedAdd(vpm, vpm));
+
+    HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, Sub(v0, vi), SaturatedSub(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, vn, SaturatedSub(vn, v0));
+    HWY_ASSERT_VEC_EQ(d, vnm, SaturatedSub(vnm, vi));
+    HWY_ASSERT_VEC_EQ(d, vnm, SaturatedSub(vnm, vpm));
+  }
+};
+
+struct TestSaturatingArithmeticOverflow {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v1 = Iota(d, T(1));
+    const auto vMax = Iota(d, LimitsMax<T>());
+    const auto vMin = Iota(d, LimitsMin<T>());
+
+    // Check that no UB triggered.
+    // "assert" here is formal - to avoid compiler dropping calculations
+    HWY_ASSERT_VEC_EQ(d, SaturatedAdd(v1, vMax), SaturatedAdd(vMax, v1));
+    HWY_ASSERT_VEC_EQ(d, SaturatedAdd(vMax, vMax), SaturatedAdd(vMax, vMax));
+    HWY_ASSERT_VEC_EQ(d, SaturatedAdd(vMin, vMax), SaturatedAdd(vMin, vMax));
+    HWY_ASSERT_VEC_EQ(d, SaturatedAdd(vMin, vMin), SaturatedAdd(vMin, vMin));
+    HWY_ASSERT_VEC_EQ(d, SaturatedSub(vMin, v1), SaturatedSub(vMin, v1));
+    HWY_ASSERT_VEC_EQ(d, SaturatedSub(vMin, vMax), SaturatedSub(vMin, vMax));
+    HWY_ASSERT_VEC_EQ(d, SaturatedSub(vMax, vMin), SaturatedSub(vMax, vMin));
+    HWY_ASSERT_VEC_EQ(d, SaturatedSub(vMin, vMin), SaturatedSub(vMin, vMin));
+  }
+};
+
+HWY_NOINLINE void TestAllSaturatingArithmetic() {
+  const ForPartialVectors<TestUnsignedSaturatingArithmetic> test_unsigned;
+  test_unsigned(uint8_t());
+  test_unsigned(uint16_t());
+
+  const ForPartialVectors<TestSignedSaturatingArithmetic> test_signed;
+  test_signed(int8_t());
+  test_signed(int16_t());
+
+  const ForPartialVectors<TestSaturatingArithmeticOverflow> test_overflow;
+  test_overflow(int8_t());
+  test_overflow(uint8_t());
+  test_overflow(int16_t());
+  test_overflow(uint16_t());
+}
+
+struct TestAverage {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto v1 = Set(d, T(1));
+    const auto v2 = Set(d, T(2));
+
+    HWY_ASSERT_VEC_EQ(d, v0, AverageRound(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, v1, AverageRound(v0, v1));
+    HWY_ASSERT_VEC_EQ(d, v1, AverageRound(v1, v1));
+    HWY_ASSERT_VEC_EQ(d, v2, AverageRound(v1, v2));
+    HWY_ASSERT_VEC_EQ(d, v2, AverageRound(v2, v2));
+  }
+};
+
+HWY_NOINLINE void TestAllAverage() {
+  const ForPartialVectors<TestAverage> test;
+  test(uint8_t());
+  test(uint16_t());
+}
+
+struct TestAbs {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vp1 = Set(d, T(1));
+    const auto vn1 = Set(d, T(-1));
+    const auto vpm = Set(d, LimitsMax<T>());
+    const auto vnm = Set(d, LimitsMin<T>());
+
+    HWY_ASSERT_VEC_EQ(d, v0, Abs(v0));
+    HWY_ASSERT_VEC_EQ(d, vp1, Abs(vp1));
+    HWY_ASSERT_VEC_EQ(d, vp1, Abs(vn1));
+    HWY_ASSERT_VEC_EQ(d, vpm, Abs(vpm));
+    HWY_ASSERT_VEC_EQ(d, vnm, Abs(vnm));
+  }
+};
+
+struct TestFloatAbs {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vp1 = Set(d, T(1));
+    const auto vn1 = Set(d, T(-1));
+    const auto vp2 = Set(d, T(0.01));
+    const auto vn2 = Set(d, T(-0.01));
+
+    HWY_ASSERT_VEC_EQ(d, v0, Abs(v0));
+    HWY_ASSERT_VEC_EQ(d, vp1, Abs(vp1));
+    HWY_ASSERT_VEC_EQ(d, vp1, Abs(vn1));
+    HWY_ASSERT_VEC_EQ(d, vp2, Abs(vp2));
+    HWY_ASSERT_VEC_EQ(d, vp2, Abs(vn2));
+  }
+};
+
+HWY_NOINLINE void TestAllAbs() {
+  ForSignedTypes(ForPartialVectors<TestAbs>());
+  ForFloatTypes(ForPartialVectors<TestFloatAbs>());
+}
+
+struct TestNeg {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vn = Set(d, T(-3));
+    const auto vp = Set(d, T(3));
+    HWY_ASSERT_VEC_EQ(d, v0, Neg(v0));
+    HWY_ASSERT_VEC_EQ(d, vp, Neg(vn));
+    HWY_ASSERT_VEC_EQ(d, vn, Neg(vp));
+  }
+};
+
+struct TestNegOverflow {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto vn = Set(d, LimitsMin<T>());
+    const auto vp = Set(d, LimitsMax<T>());
+    HWY_ASSERT_VEC_EQ(d, Neg(vn), Neg(vn));
+    HWY_ASSERT_VEC_EQ(d, Neg(vp), Neg(vp));
+  }
+};
+
+HWY_NOINLINE void TestAllNeg() {
+  ForSignedTypes(ForPartialVectors<TestNeg>());
+  ForFloatTypes(ForPartialVectors<TestNeg>());
+  ForSignedTypes(ForPartialVectors<TestNegOverflow>());
+}
+
+struct TestUnsignedMinMax {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    // Leave headroom such that v1 < v2 even after wraparound.
+    const auto mod = And(Iota(d, 0), Set(d, LimitsMax<T>() >> 1));
+    const auto v1 = Add(mod, Set(d, 1));
+    const auto v2 = Add(mod, Set(d, 2));
+    HWY_ASSERT_VEC_EQ(d, v1, Min(v1, v2));
+    HWY_ASSERT_VEC_EQ(d, v2, Max(v1, v2));
+    HWY_ASSERT_VEC_EQ(d, v0, Min(v1, v0));
+    HWY_ASSERT_VEC_EQ(d, v1, Max(v1, v0));
+
+    const auto vmin = Set(d, LimitsMin<T>());
+    const auto vmax = Set(d, LimitsMax<T>());
+
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, vmax));
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmax, vmin));
+
+    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmin, vmax));
+    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmax, vmin));
+  }
+};
+
+struct TestSignedMinMax {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Leave headroom such that v1 < v2 even after wraparound.
+    const auto mod = And(Iota(d, 0), Set(d, LimitsMax<T>() >> 1));
+    const auto v1 = Add(mod, Set(d, 1));
+    const auto v2 = Add(mod, Set(d, 2));
+    const auto v_neg = Sub(Zero(d), v1);
+    HWY_ASSERT_VEC_EQ(d, v1, Min(v1, v2));
+    HWY_ASSERT_VEC_EQ(d, v2, Max(v1, v2));
+    HWY_ASSERT_VEC_EQ(d, v_neg, Min(v1, v_neg));
+    HWY_ASSERT_VEC_EQ(d, v1, Max(v1, v_neg));
+
+    const auto v0 = Zero(d);
+    const auto vmin = Set(d, LimitsMin<T>());
+    const auto vmax = Set(d, LimitsMax<T>());
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(v0, vmin));
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, Max(v0, vmin));
+    HWY_ASSERT_VEC_EQ(d, v0, Max(vmin, v0));
+
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, vmax));
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmax, vmin));
+
+    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmin, vmax));
+    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmax, vmin));
+  }
+};
+
+struct TestFloatMinMax {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v1 = Iota(d, 1);
+    const auto v2 = Iota(d, 2);
+    const auto v_neg = Iota(d, -T(Lanes(d)));
+    HWY_ASSERT_VEC_EQ(d, v1, Min(v1, v2));
+    HWY_ASSERT_VEC_EQ(d, v2, Max(v1, v2));
+    HWY_ASSERT_VEC_EQ(d, v_neg, Min(v1, v_neg));
+    HWY_ASSERT_VEC_EQ(d, v1, Max(v1, v_neg));
+
+    const auto v0 = Zero(d);
+    const auto vmin = Set(d, T(-1E30));
+    const auto vmax = Set(d, T(1E30));
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(v0, vmin));
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, Max(v0, vmin));
+    HWY_ASSERT_VEC_EQ(d, v0, Max(vmin, v0));
+
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, vmax));
+    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmax, vmin));
+
+    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmin, vmax));
+    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmax, vmin));
+  }
+};
+
+HWY_NOINLINE void TestAllMinMax() {
+  ForUnsignedTypes(ForPartialVectors<TestUnsignedMinMax>());
+  ForSignedTypes(ForPartialVectors<TestSignedMinMax>());
+  ForFloatTypes(ForPartialVectors<TestFloatMinMax>());
+}
+
+template <class D>
+static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
+  alignas(16) uint64_t in[2];
+  in[0] = lo;
+  in[1] = hi;
+  return LoadDup128(d, in);
+}
+
+struct TestMinMax128 {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using V = Vec<D>;
+    const size_t N = Lanes(d);
+    auto a_lanes = AllocateAligned<T>(N);
+    auto b_lanes = AllocateAligned<T>(N);
+    auto min_lanes = AllocateAligned<T>(N);
+    auto max_lanes = AllocateAligned<T>(N);
+    RandomState rng;
+
+    const V v00 = Zero(d);
+    const V v01 = Make128(d, 0, 1);
+    const V v10 = Make128(d, 1, 0);
+    const V v11 = Add(v01, v10);
+
+    // Same arg
+    HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v00));
+    HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v10));
+    HWY_ASSERT_VEC_EQ(d, v11, Min128(d, v11, v11));
+    HWY_ASSERT_VEC_EQ(d, v00, Max128(d, v00, v00));
+    HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v10));
+    HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v11));
+
+    // First arg less
+    HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v01));
+    HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v10));
+    HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v11));
+    HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v00, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v01, v10));
+    HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v10, v11));
+
+    // Second arg less
+    HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v01, v00));
+    HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v10, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v11, v10));
+    HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v00));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v01));
+    HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v10));
+
+    // Also check 128-bit blocks are independent
+    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        a_lanes[i] = Random64(&rng);
+        b_lanes[i] = Random64(&rng);
+      }
+      const V a = Load(d, a_lanes.get());
+      const V b = Load(d, b_lanes.get());
+      for (size_t i = 0; i < N; i += 2) {
+        const bool lt = a_lanes[i + 1] == b_lanes[i + 1]
+                            ? (a_lanes[i] < b_lanes[i])
+                            : (a_lanes[i + 1] < b_lanes[i + 1]);
+        min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0];
+        min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1];
+        max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0];
+        max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1];
+      }
+      HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128(d, a, b));
+      HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128(d, a, b));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllMinMax128() {
+  ForGEVectors<128, TestMinMax128>()(uint64_t());
+}
+
+struct TestMinMax128Upper {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using V = Vec<D>;
+    const size_t N = Lanes(d);
+    auto a_lanes = AllocateAligned<T>(N);
+    auto b_lanes = AllocateAligned<T>(N);
+    auto min_lanes = AllocateAligned<T>(N);
+    auto max_lanes = AllocateAligned<T>(N);
+    RandomState rng;
+
+    const V v00 = Zero(d);
+    const V v01 = Make128(d, 0, 1);
+    const V v10 = Make128(d, 1, 0);
+    const V v11 = Add(v01, v10);
+
+    // Same arg
+    HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v00, v00));
+    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v10, v10));
+    HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v11, v11));
+    HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v00, v00));
+    HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v01, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v10));
+    HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v11, v11));
+
+    // Equivalent but not equal (chooses second arg)
+    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v00, v01));
+    HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v10, v11));
+    HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v01, v00));
+    HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v11, v10));
+    HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v01, v00));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v11, v10));
+    HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v00, v01));
+    HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v10, v11));
+
+    // First arg less
+    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v10));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v01, v10));
+
+    // Second arg less
+    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v10, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v01));
+
+    // Also check 128-bit blocks are independent
+    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        a_lanes[i] = Random64(&rng);
+        b_lanes[i] = Random64(&rng);
+      }
+      const V a = Load(d, a_lanes.get());
+      const V b = Load(d, b_lanes.get());
+      for (size_t i = 0; i < N; i += 2) {
+        const bool lt = a_lanes[i + 1] < b_lanes[i + 1];
+        min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0];
+        min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1];
+        max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0];
+        max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1];
+      }
+      HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128Upper(d, a, b));
+      HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128Upper(d, a, b));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllMinMax128Upper() {
+  ForGEVectors<128, TestMinMax128Upper>()(uint64_t());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyArithmeticTest);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128Upper);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/blockwise_shift_test.cc b/third_party/highway/hwy/tests/blockwise_shift_test.cc
new file mode 100644
index 0000000000..4e5250841b
--- /dev/null
+++ b/third_party/highway/hwy/tests/blockwise_shift_test.cc
@@ -0,0 +1,270 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memcpy
+
+#include <algorithm>  // std::fill
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/blockwise_shift_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestShiftBytes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Scalar does not define Shift*Bytes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+    const Repartition<uint8_t, D> du8;
+    const size_t N8 = Lanes(du8);
+
+    // Zero remains zero
+    const auto v0 = Zero(d);
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(v0));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(d, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(d, v0));
+
+    // Zero after shifting out the high/low byte
+    auto bytes = AllocateAligned<uint8_t>(N8);
+    std::fill(bytes.get(), bytes.get() + N8, 0);
+    bytes[N8 - 1] = 0x7F;
+    const auto vhi = BitCast(d, Load(du8, bytes.get()));
+    bytes[N8 - 1] = 0;
+    bytes[0] = 0x7F;
+    const auto vlo = BitCast(d, Load(du8, bytes.get()));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(vhi));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(d, vhi));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(d, vlo));
+
+    // Check expected result with Iota
+    const size_t N = Lanes(d);
+    auto in = AllocateAligned<T>(N);
+    const uint8_t* in_bytes = reinterpret_cast<const uint8_t*>(in.get());
+    const auto v = BitCast(d, Iota(du8, 1));
+    Store(v, d, in.get());
+
+    auto expected = AllocateAligned<T>(N);
+    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
+
+    const size_t block_size = HWY_MIN(N8, 16);
+    for (size_t block = 0; block < N8; block += block_size) {
+      expected_bytes[block] = 0;
+      memcpy(expected_bytes + block + 1, in_bytes + block, block_size - 1);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(v));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(d, v));
+
+    for (size_t block = 0; block < N8; block += block_size) {
+      memcpy(expected_bytes + block, in_bytes + block + 1, block_size - 1);
+      expected_bytes[block + block_size - 1] = 0;
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightBytes<1>(d, v));
+#else
+    (void)d;
+#endif  // #if HWY_TARGET != HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllShiftBytes() {
+  ForIntegerTypes(ForPartialVectors<TestShiftBytes>());
+}
+
+struct TestShiftLeftLanes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Scalar does not define Shift*Lanes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+    const auto v = Iota(d, T(1));
+    const size_t N = Lanes(d);
+    if (N == 1) return;
+    auto expected = AllocateAligned<T>(N);
+
+    HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(v));
+    HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(d, v));
+
+    constexpr size_t kLanesPerBlock = 16 / sizeof(T);
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = (i % kLanesPerBlock) == 0 ? T(0) : T(i);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(v));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(d, v));
+#else
+    (void)d;
+#endif  // #if HWY_TARGET != HWY_SCALAR
+  }
+};
+
+struct TestShiftRightLanes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Scalar does not define Shift*Lanes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+    const auto v = Iota(d, T(1));
+    const size_t N = Lanes(d);
+    if (N == 1) return;
+    auto expected = AllocateAligned<T>(N);
+
+    HWY_ASSERT_VEC_EQ(d, v, ShiftRightLanes<0>(d, v));
+
+    constexpr size_t kLanesPerBlock = 16 / sizeof(T);
+
+    for (size_t i = 0; i < N; ++i) {
+      const size_t mod = i % kLanesPerBlock;
+      expected[i] = mod == (kLanesPerBlock - 1) || i >= N - 1 ? T(0) : T(2 + i);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightLanes<1>(d, v));
+#else
+    (void)d;
+#endif  // #if HWY_TARGET != HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllShiftLeftLanes() {
+  ForAllTypes(ForPartialVectors<TestShiftLeftLanes>());
+}
+
+HWY_NOINLINE void TestAllShiftRightLanes() {
+  ForAllTypes(ForPartialVectors<TestShiftRightLanes>());
+}
+
+// Scalar does not define CombineShiftRightBytes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+
+template <int kBytes>
+struct TestCombineShiftRightBytes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T, D d) {
+    constexpr size_t kBlockSize = 16;
+    static_assert(kBytes < kBlockSize, "Shift count is per block");
+    const Repartition<uint8_t, D> d8;
+    const size_t N8 = Lanes(d8);
+    if (N8 < 16) return;
+    auto hi_bytes = AllocateAligned<uint8_t>(N8);
+    auto lo_bytes = AllocateAligned<uint8_t>(N8);
+    auto expected_bytes = AllocateAligned<uint8_t>(N8);
+    uint8_t combined[2 * kBlockSize];
+
+    // Random inputs in each lane
+    RandomState rng;
+    for (size_t rep = 0; rep < AdjustedReps(100); ++rep) {
+      for (size_t i = 0; i < N8; ++i) {
+        hi_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
+        lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
+      }
+      for (size_t i = 0; i < N8; i += kBlockSize) {
+        // Arguments are not the same size.
+        CopyBytes<kBlockSize>(&lo_bytes[i], combined);
+        CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
+        CopyBytes<kBlockSize>(combined + kBytes, &expected_bytes[i]);
+      }
+
+      const auto hi = BitCast(d, Load(d8, hi_bytes.get()));
+      const auto lo = BitCast(d, Load(d8, lo_bytes.get()));
+      const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
+      HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightBytes<kBytes>(d, hi, lo));
+    }
+  }
+};
+
+template <int kLanes>
+struct TestCombineShiftRightLanes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T, D d) {
+    const Repartition<uint8_t, D> d8;
+    const size_t N8 = Lanes(d8);
+    if (N8 < 16) return;
+
+    auto hi_bytes = AllocateAligned<uint8_t>(N8);
+    auto lo_bytes = AllocateAligned<uint8_t>(N8);
+    auto expected_bytes = AllocateAligned<uint8_t>(N8);
+    constexpr size_t kBlockSize = 16;
+    uint8_t combined[2 * kBlockSize];
+
+    // Random inputs in each lane
+    RandomState rng;
+    for (size_t rep = 0; rep < AdjustedReps(100); ++rep) {
+      for (size_t i = 0; i < N8; ++i) {
+        hi_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
+        lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
+      }
+      for (size_t i = 0; i < N8; i += kBlockSize) {
+        // Arguments are not the same size.
+        CopyBytes<kBlockSize>(&lo_bytes[i], combined);
+        CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
+        CopyBytes<kBlockSize>(combined + kLanes * sizeof(T),
+                              &expected_bytes[i]);
+      }
+
+      const auto hi = BitCast(d, Load(d8, hi_bytes.get()));
+      const auto lo = BitCast(d, Load(d8, lo_bytes.get()));
+      const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
+      HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightLanes<kLanes>(d, hi, lo));
+    }
+  }
+};
+
+#endif  // #if HWY_TARGET != HWY_SCALAR
+
+struct TestCombineShiftRight {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+// Scalar does not define CombineShiftRightBytes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+    constexpr int kMaxBytes =
+        HWY_MIN(16, static_cast<int>(MaxLanes(d) * sizeof(T)));
+    constexpr int kMaxLanes = kMaxBytes / static_cast<int>(sizeof(T));
+    TestCombineShiftRightBytes<kMaxBytes - 1>()(t, d);
+    TestCombineShiftRightBytes<HWY_MAX(kMaxBytes / 2, 1)>()(t, d);
+    TestCombineShiftRightBytes<1>()(t, d);
+
+    TestCombineShiftRightLanes<kMaxLanes - 1>()(t, d);
+    TestCombineShiftRightLanes<HWY_MAX(kMaxLanes / 2, -1)>()(t, d);
+    TestCombineShiftRightLanes<1>()(t, d);
+#else
+    (void)t;
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllCombineShiftRight() {
+  // Need at least 2 lanes.
+  ForAllTypes(ForShrinkableVectors<TestCombineShiftRight>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyBlockwiseShiftTest);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllShiftBytes);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllShiftLeftLanes);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllShiftRightLanes);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllCombineShiftRight);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/blockwise_test.cc b/third_party/highway/hwy/tests/blockwise_test.cc
new file mode 100644
index 0000000000..e5ac9ab362
--- /dev/null
+++ b/third_party/highway/hwy/tests/blockwise_test.cc
@@ -0,0 +1,454 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>  // std::fill
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/blockwise_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <typename D, int kLane>
+struct TestBroadcastR {
+  HWY_NOINLINE void operator()() const {
+    using T = typename D::T;
+    const D d;
+    const size_t N = Lanes(d);
+    if (kLane >= N) return;
+    auto in_lanes = AllocateAligned<T>(N);
+    std::fill(in_lanes.get(), in_lanes.get() + N, T(0));
+    const size_t blockN = HWY_MIN(N * sizeof(T), 16) / sizeof(T);
+    // Need to set within each 128-bit block
+    for (size_t block = 0; block < N; block += blockN) {
+      in_lanes[block + kLane] = static_cast<T>(block + 1);
+    }
+    const auto in = Load(d, in_lanes.get());
+    auto expected = AllocateAligned<T>(N);
+    for (size_t block = 0; block < N; block += blockN) {
+      for (size_t i = 0; i < blockN; ++i) {
+        expected[block + i] = T(block + 1);
+      }
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Broadcast<kLane>(in));
+
+    TestBroadcastR<D, kLane - 1>()();
+  }
+};
+
+template <class D>
+struct TestBroadcastR<D, -1> {
+  void operator()() const {}
+};
+
+struct TestBroadcast {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    TestBroadcastR<D, HWY_MIN(MaxLanes(d), 16 / sizeof(T)) - 1>()();
+  }
+};
+
+HWY_NOINLINE void TestAllBroadcast() {
+  const ForPartialVectors<TestBroadcast> test;
+  // No u/i8.
+  test(uint16_t());
+  test(int16_t());
+  ForUIF3264(test);
+}
+
+template <bool kFull>
+struct ChooseTableSize {
+  template <typename T, typename DIdx>
+  using type = DIdx;
+};
+template <>
+struct ChooseTableSize<true> {
+  template <typename T, typename DIdx>
+  using type = ScalableTag<T>;
+};
+
+template <bool kFull>
+struct TestTableLookupBytes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    RandomState rng;
+
+    const typename ChooseTableSize<kFull>::template type<T, D> d_tbl;
+    const Repartition<uint8_t, decltype(d_tbl)> d_tbl8;
+    const size_t NT8 = Lanes(d_tbl8);
+
+    const Repartition<uint8_t, D> d8;
+    const size_t N8 = Lanes(d8);
+
+    // Random input bytes
+    auto in_bytes = AllocateAligned<uint8_t>(NT8);
+    for (size_t i = 0; i < NT8; ++i) {
+      in_bytes[i] = Random32(&rng) & 0xFF;
+    }
+    const auto in = BitCast(d_tbl, Load(d_tbl8, in_bytes.get()));
+
+    // Enough test data; for larger vectors, upper lanes will be zero.
+    const uint8_t index_bytes_source[64] = {
+        // Same index as source, multiple outputs from same input,
+        // unused input (9), ascending/descending and nonconsecutive neighbors.
+        0,  2,  1, 2, 15, 12, 13, 14, 6,  7,  8,  5,  4,  3,  10, 11,
+        11, 10, 3, 4, 5,  8,  7,  6,  14, 13, 12, 15, 2,  1,  2,  0,
+        4,  3,  2, 2, 5,  6,  7,  7,  15, 15, 15, 15, 15, 15, 0,  1};
+    auto index_bytes = AllocateAligned<uint8_t>(N8);
+    const size_t max_index = HWY_MIN(NT8, 16) - 1;
+    for (size_t i = 0; i < N8; ++i) {
+      index_bytes[i] = (i < 64) ? index_bytes_source[i] : 0;
+      // Avoid asan error for partial vectors.
+      index_bytes[i] = static_cast<uint8_t>(HWY_MIN(index_bytes[i], max_index));
+    }
+    const auto indices = Load(d, reinterpret_cast<const T*>(index_bytes.get()));
+
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
+
+    for (size_t block = 0; block < N8; block += 16) {
+      for (size_t i = 0; i < 16 && (block + i) < N8; ++i) {
+        const uint8_t index = index_bytes[block + i];
+        HWY_ASSERT(index <= max_index);
+        // Note that block + index may exceed NT8 on RVV, which is fine because
+        // the operation uses the larger of the table and index vector size.
+        HWY_ASSERT(block + index < HWY_MAX(N8, NT8));
+        // For large vectors, the lane index may wrap around due to block,
+        // also wrap around after 8-bit overflow.
+        expected_bytes[block + i] =
+            in_bytes[(block + index) % HWY_MIN(NT8, 256)];
+      }
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytes(in, indices));
+
+    // Individually test zeroing each byte position.
+    for (size_t i = 0; i < N8; ++i) {
+      const uint8_t prev_expected = expected_bytes[i];
+      const uint8_t prev_index = index_bytes[i];
+      expected_bytes[i] = 0;
+
+      const int idx = 0x80 + (static_cast<int>(Random32(&rng) & 7) << 4);
+      HWY_ASSERT(0x80 <= idx && idx < 256);
+      index_bytes[i] = static_cast<uint8_t>(idx);
+
+      const auto indices =
+          Load(d, reinterpret_cast<const T*>(index_bytes.get()));
+      HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytesOr0(in, indices));
+      expected_bytes[i] = prev_expected;
+      index_bytes[i] = prev_index;
+    }
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllTableLookupBytesSame() {
+  // Partial index, same-sized table.
+  ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<false>>());
+}
+
+HWY_NOINLINE void TestAllTableLookupBytesMixed() {
+  // Partial index, full-size table.
+  ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<true>>());
+}
+
+struct TestInterleaveLower {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using TU = MakeUnsigned<T>;
+    const size_t N = Lanes(d);
+    auto even_lanes = AllocateAligned<T>(N);
+    auto odd_lanes = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      even_lanes[i] = static_cast<T>(2 * i + 0);
+      odd_lanes[i] = static_cast<T>(2 * i + 1);
+    }
+    const auto even = Load(d, even_lanes.get());
+    const auto odd = Load(d, odd_lanes.get());
+
+    const size_t blockN = HWY_MIN(16 / sizeof(T), N);
+    for (size_t i = 0; i < Lanes(d); ++i) {
+      const size_t block = i / blockN;
+      const size_t index = (i % blockN) + block * 2 * blockN;
+      expected[i] = static_cast<T>(index & LimitsMax<TU>());
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(even, odd));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(d, even, odd));
+  }
+};
+
+struct TestInterleaveUpper {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    if (N == 1) return;
+    auto even_lanes = AllocateAligned<T>(N);
+    auto odd_lanes = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      even_lanes[i] = static_cast<T>(2 * i + 0);
+      odd_lanes[i] = static_cast<T>(2 * i + 1);
+    }
+    const auto even = Load(d, even_lanes.get());
+    const auto odd = Load(d, odd_lanes.get());
+
+    const size_t blockN = HWY_MIN(16 / sizeof(T), N);
+    for (size_t i = 0; i < Lanes(d); ++i) {
+      const size_t block = i / blockN;
+      expected[i] = T((i % blockN) + block * 2 * blockN + blockN);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveUpper(d, even, odd));
+  }
+};
+
+HWY_NOINLINE void TestAllInterleave() {
+  // Not DemoteVectors because this cannot be supported by HWY_SCALAR.
+  ForAllTypes(ForShrinkableVectors<TestInterleaveLower>());
+  ForAllTypes(ForShrinkableVectors<TestInterleaveUpper>());
+}
+
+struct TestZipLower {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using WideT = MakeWide<T>;
+    static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
+    static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
+    const size_t N = Lanes(d);
+    auto even_lanes = AllocateAligned<T>(N);
+    auto odd_lanes = AllocateAligned<T>(N);
+    // At least 2 lanes for HWY_SCALAR
+    auto zip_lanes = AllocateAligned<T>(HWY_MAX(N, 2));
+    const T kMaxT = LimitsMax<T>();
+    for (size_t i = 0; i < N; ++i) {
+      even_lanes[i] = static_cast<T>((2 * i + 0) & kMaxT);
+      odd_lanes[i] = static_cast<T>((2 * i + 1) & kMaxT);
+    }
+    const auto even = Load(d, even_lanes.get());
+    const auto odd = Load(d, odd_lanes.get());
+
+    const Repartition<WideT, D> dw;
+#if HWY_TARGET == HWY_SCALAR
+    // Safely handle big-endian
+    const auto expected = Set(dw, static_cast<WideT>(1ULL << (sizeof(T) * 8)));
+#else
+    const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
+    for (size_t i = 0; i < N; i += 2) {
+      const size_t base = (i / blockN) * blockN;
+      const size_t mod = i % blockN;
+      zip_lanes[i + 0] = even_lanes[mod / 2 + base];
+      zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
+    }
+    const auto expected =
+        Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get()));
+#endif  // HWY_TARGET == HWY_SCALAR
+    HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(even, odd));
+    HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(dw, even, odd));
+  }
+};
+
+HWY_NOINLINE void TestAllZipLower() {
+  const ForDemoteVectors<TestZipLower> lower_unsigned;
+  lower_unsigned(uint8_t());
+  lower_unsigned(uint16_t());
+#if HWY_HAVE_INTEGER64
+  lower_unsigned(uint32_t());  // generates u64
+#endif
+
+  const ForDemoteVectors<TestZipLower> lower_signed;
+  lower_signed(int8_t());
+  lower_signed(int16_t());
+#if HWY_HAVE_INTEGER64
+  lower_signed(int32_t());  // generates i64
+#endif
+
+  // No float - concatenating f32 does not result in a f64
+}
+
+// Remove this test (so it does not show as having run) if the only target is
+// HWY_SCALAR, which does not support this op.
+#if HWY_TARGETS != HWY_SCALAR
+
+struct TestZipUpper {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET == HWY_SCALAR
+    (void)d;
+#else
+    using WideT = MakeWide<T>;
+    static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
+    static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
+    const size_t N = Lanes(d);
+    if (N < 16 / sizeof(T)) return;
+    auto even_lanes = AllocateAligned<T>(N);
+    auto odd_lanes = AllocateAligned<T>(N);
+    auto zip_lanes = AllocateAligned<T>(N);
+    const T kMaxT = LimitsMax<T>();
+    for (size_t i = 0; i < N; ++i) {
+      even_lanes[i] = static_cast<T>((2 * i + 0) & kMaxT);
+      odd_lanes[i] = static_cast<T>((2 * i + 1) & kMaxT);
+    }
+    const auto even = Load(d, even_lanes.get());
+    const auto odd = Load(d, odd_lanes.get());
+
+    const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
+
+    for (size_t i = 0; i < N; i += 2) {
+      const size_t base = (i / blockN) * blockN + blockN / 2;
+      const size_t mod = i % blockN;
+      zip_lanes[i + 0] = even_lanes[mod / 2 + base];
+      zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
+    }
+    const Repartition<WideT, D> dw;
+    const auto expected =
+        Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get()));
+    HWY_ASSERT_VEC_EQ(dw, expected, ZipUpper(dw, even, odd));
+#endif  // HWY_TARGET == HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllZipUpper() {
+  const ForShrinkableVectors<TestZipUpper> upper_unsigned;
+  upper_unsigned(uint8_t());
+  upper_unsigned(uint16_t());
+#if HWY_HAVE_INTEGER64
+  upper_unsigned(uint32_t());  // generates u64
+#endif
+
+  const ForShrinkableVectors<TestZipUpper> upper_signed;
+  upper_signed(int8_t());
+  upper_signed(int16_t());
+#if HWY_HAVE_INTEGER64
+  upper_signed(int32_t());  // generates i64
+#endif
+
+  // No float - concatenating f32 does not result in a f64
+}
+
+#endif  // HWY_TARGETS != HWY_SCALAR
+
+class TestSpecialShuffle32 {
+ public:
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v = Iota(d, 0);
+    VerifyLanes32(d, Shuffle2301(v), 2, 3, 0, 1, __FILE__, __LINE__);
+    VerifyLanes32(d, Shuffle1032(v), 1, 0, 3, 2, __FILE__, __LINE__);
+    VerifyLanes32(d, Shuffle0321(v), 0, 3, 2, 1, __FILE__, __LINE__);
+    VerifyLanes32(d, Shuffle2103(v), 2, 1, 0, 3, __FILE__, __LINE__);
+    VerifyLanes32(d, Shuffle0123(v), 0, 1, 2, 3, __FILE__, __LINE__);
+  }
+
+ private:
+  // HWY_INLINE works around a Clang SVE compiler bug where all but the first
+  // 128 bits (the NEON register) of actual are zero.
+  template <class D, class V>
+  HWY_INLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3,
+                                const size_t i2, const size_t i1,
+                                const size_t i0, const char* filename,
+                                const int line) {
+    using T = TFromD<D>;
+    constexpr size_t kBlockN = 16 / sizeof(T);
+    const size_t N = Lanes(d);
+    if (N < 4) return;
+    auto expected = AllocateAligned<T>(N);
+    for (size_t block = 0; block < N; block += kBlockN) {
+      expected[block + 3] = static_cast<T>(block + i3);
+      expected[block + 2] = static_cast<T>(block + i2);
+      expected[block + 1] = static_cast<T>(block + i1);
+      expected[block + 0] = static_cast<T>(block + i0);
+    }
+    AssertVecEqual(d, expected.get(), actual, filename, line);
+  }
+};
+
+class TestSpecialShuffle64 {
+ public:
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v = Iota(d, 0);
+    VerifyLanes64(d, Shuffle01(v), 0, 1, __FILE__, __LINE__);
+  }
+
+ private:
+  // HWY_INLINE works around a Clang SVE compiler bug where all but the first
+  // 128 bits (the NEON register) of actual are zero.
+  template <class D, class V>
+  HWY_INLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1,
+                                const size_t i0, const char* filename,
+                                const int line) {
+    using T = TFromD<D>;
+    constexpr size_t kBlockN = 16 / sizeof(T);
+    const size_t N = Lanes(d);
+    if (N < 2) return;
+    auto expected = AllocateAligned<T>(N);
+    for (size_t block = 0; block < N; block += kBlockN) {
+      expected[block + 1] = static_cast<T>(block + i1);
+      expected[block + 0] = static_cast<T>(block + i0);
+    }
+    AssertVecEqual(d, expected.get(), actual, filename, line);
+  }
+};
+
+HWY_NOINLINE void TestAllSpecialShuffles() {
+  const ForGEVectors<128, TestSpecialShuffle32> test32;
+  test32(uint32_t());
+  test32(int32_t());
+  test32(float());
+
+#if HWY_HAVE_INTEGER64
+  const ForGEVectors<128, TestSpecialShuffle64> test64;
+  test64(uint64_t());
+  test64(int64_t());
+#endif
+
+#if HWY_HAVE_FLOAT64
+  const ForGEVectors<128, TestSpecialShuffle64> test_d;
+  test_d(double());
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyBlockwiseTest);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllBroadcast);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesSame);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesMixed);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllInterleave);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipLower);
+#if HWY_TARGETS != HWY_SCALAR
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipUpper);
+#endif
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllSpecialShuffles);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/combine_test.cc b/third_party/highway/hwy/tests/combine_test.cc
new file mode 100644
index 0000000000..e2f4cbeb00
--- /dev/null
+++ b/third_party/highway/hwy/tests/combine_test.cc
@@ -0,0 +1,275 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memcpy
+
+#include <algorithm>  // std::fill
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/combine_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestLowerHalf {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const Half<D> d2;
+
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+    auto lanes2 = AllocateAligned<T>(N);
+    std::fill(lanes.get(), lanes.get() + N, T(0));
+    std::fill(lanes2.get(), lanes2.get() + N, T(0));
+    const auto v = Iota(d, 1);
+    Store(LowerHalf(d2, v), d2, lanes.get());
+    Store(LowerHalf(v), d2, lanes2.get());  // optionally without D
+    size_t i = 0;
+    for (; i < Lanes(d2); ++i) {
+      HWY_ASSERT_EQ(T(1 + i), lanes[i]);
+      HWY_ASSERT_EQ(T(1 + i), lanes2[i]);
+    }
+    // Other half remains unchanged
+    for (; i < N; ++i) {
+      HWY_ASSERT_EQ(T(0), lanes[i]);
+      HWY_ASSERT_EQ(T(0), lanes2[i]);
+    }
+  }
+};
+
+struct TestLowerQuarter {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const Half<D> d2;
+    const Half<decltype(d2)> d4;
+
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+    auto lanes2 = AllocateAligned<T>(N);
+    std::fill(lanes.get(), lanes.get() + N, T(0));
+    std::fill(lanes2.get(), lanes2.get() + N, T(0));
+    const auto v = Iota(d, 1);
+    const auto lo = LowerHalf(d4, LowerHalf(d2, v));
+    const auto lo2 = LowerHalf(LowerHalf(v));  // optionally without D
+    Store(lo, d4, lanes.get());
+    Store(lo2, d4, lanes2.get());
+    size_t i = 0;
+    for (; i < Lanes(d4); ++i) {
+      HWY_ASSERT_EQ(T(i + 1), lanes[i]);
+      HWY_ASSERT_EQ(T(i + 1), lanes2[i]);
+    }
+    // Upper 3/4 remain unchanged
+    for (; i < N; ++i) {
+      HWY_ASSERT_EQ(T(0), lanes[i]);
+      HWY_ASSERT_EQ(T(0), lanes2[i]);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllLowerHalf() {
+  ForAllTypes(ForHalfVectors<TestLowerHalf>());
+
+  // The minimum vector size is 128 bits, so there's no guarantee we can have
+  // quarters of 64-bit lanes, hence test 'all' other types.
+  ForHalfVectors<TestLowerQuarter, 2> test_quarter;
+  ForUI8(test_quarter);
+  ForUI16(test_quarter);  // exclude float16_t - cannot compare
+  ForUIF32(test_quarter);
+}
+
+struct TestUpperHalf {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Scalar does not define UpperHalf.
+#if HWY_TARGET != HWY_SCALAR
+    const Half<D> d2;
+    const size_t N2 = Lanes(d2);
+    HWY_ASSERT(N2 * 2 == Lanes(d));
+    auto expected = AllocateAligned<T>(N2);
+    size_t i = 0;
+    for (; i < N2; ++i) {
+      expected[i] = static_cast<T>(N2 + 1 + i);
+    }
+    HWY_ASSERT_VEC_EQ(d2, expected.get(), UpperHalf(d2, Iota(d, 1)));
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllUpperHalf() {
+  ForAllTypes(ForHalfVectors<TestUpperHalf>());
+}
+
+struct TestZeroExtendVector {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const Twice<D> d2;
+
+    const auto v = Iota(d, 1);
+    const size_t N = Lanes(d);
+    const size_t N2 = Lanes(d2);
+    // If equal, then N was already MaxLanes(d) and it's not clear what
+    // Combine or ZeroExtendVector should return.
+    if (N2 == N) return;
+    HWY_ASSERT(N2 == 2 * N);
+    auto lanes = AllocateAligned<T>(N2);
+    Store(v, d, &lanes[0]);
+    Store(v, d, &lanes[N]);
+
+    const auto ext = ZeroExtendVector(d2, v);
+    Store(ext, d2, lanes.get());
+
+    // Lower half is unchanged
+    HWY_ASSERT_VEC_EQ(d, v, Load(d, &lanes[0]));
+    // Upper half is zero
+    HWY_ASSERT_VEC_EQ(d, Zero(d), Load(d, &lanes[N]));
+  }
+};
+
+HWY_NOINLINE void TestAllZeroExtendVector() {
+  ForAllTypes(ForExtendableVectors<TestZeroExtendVector>());
+}
+
+struct TestCombine {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const Twice<D> d2;
+    const size_t N2 = Lanes(d2);
+    auto lanes = AllocateAligned<T>(N2);
+
+    const auto lo = Iota(d, 1);
+    const auto hi = Iota(d, static_cast<T>(N2 / 2 + 1));
+    const auto combined = Combine(d2, hi, lo);
+    Store(combined, d2, lanes.get());
+
+    const auto expected = Iota(d2, 1);
+    HWY_ASSERT_VEC_EQ(d2, expected, combined);
+  }
+};
+
+HWY_NOINLINE void TestAllCombine() {
+  ForAllTypes(ForExtendableVectors<TestCombine>());
+}
+
+struct TestConcat {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    if (N == 1) return;
+    const size_t half_bytes = N * sizeof(T) / 2;
+
+    auto hi = AllocateAligned<T>(N);
+    auto lo = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<T>(N);
+    RandomState rng;
+    for (size_t rep = 0; rep < 10; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        hi[i] = static_cast<T>(Random64(&rng) & 0xFF);
+        lo[i] = static_cast<T>(Random64(&rng) & 0xFF);
+      }
+
+      {
+        memcpy(&expected[N / 2], &hi[N / 2], half_bytes);
+        memcpy(&expected[0], &lo[0], half_bytes);
+        const auto vhi = Load(d, hi.get());
+        const auto vlo = Load(d, lo.get());
+        HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperLower(d, vhi, vlo));
+      }
+
+      {
+        memcpy(&expected[N / 2], &hi[N / 2], half_bytes);
+        memcpy(&expected[0], &lo[N / 2], half_bytes);
+        const auto vhi = Load(d, hi.get());
+        const auto vlo = Load(d, lo.get());
+        HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperUpper(d, vhi, vlo));
+      }
+
+      {
+        memcpy(&expected[N / 2], &hi[0], half_bytes);
+        memcpy(&expected[0], &lo[N / 2], half_bytes);
+        const auto vhi = Load(d, hi.get());
+        const auto vlo = Load(d, lo.get());
+        HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatLowerUpper(d, vhi, vlo));
+      }
+
+      {
+        memcpy(&expected[N / 2], &hi[0], half_bytes);
+        memcpy(&expected[0], &lo[0], half_bytes);
+        const auto vhi = Load(d, hi.get());
+        const auto vlo = Load(d, lo.get());
+        HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatLowerLower(d, vhi, vlo));
+      }
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllConcat() {
+  ForAllTypes(ForShrinkableVectors<TestConcat>());
+}
+
+struct TestConcatOddEven {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    const size_t N = Lanes(d);
+    const auto hi = Iota(d, static_cast<T>(N));
+    const auto lo = Iota(d, 0);
+    const auto even = Add(Iota(d, 0), Iota(d, 0));
+    const auto odd = Add(even, Set(d, 1));
+    HWY_ASSERT_VEC_EQ(d, odd, ConcatOdd(d, hi, lo));
+    HWY_ASSERT_VEC_EQ(d, even, ConcatEven(d, hi, lo));
+
+    // This test catches inadvertent saturation.
+    const auto min = Set(d, LowestValue<T>());
+    const auto max = Set(d, HighestValue<T>());
+    HWY_ASSERT_VEC_EQ(d, max, ConcatOdd(d, max, max));
+    HWY_ASSERT_VEC_EQ(d, max, ConcatEven(d, max, max));
+    HWY_ASSERT_VEC_EQ(d, min, ConcatOdd(d, min, min));
+    HWY_ASSERT_VEC_EQ(d, min, ConcatEven(d, min, min));
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllConcatOddEven() {
+  ForAllTypes(ForShrinkableVectors<TestConcatOddEven>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyCombineTest);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllLowerHalf);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllUpperHalf);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllZeroExtendVector);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombine);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllConcat);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllConcatOddEven);
+}  // namespace hwy
+
+#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/tests/compare_test.cc b/third_party/highway/hwy/tests/compare_test.cc
new file mode 100644
index 0000000000..a96e29fc62
--- /dev/null
+++ b/third_party/highway/hwy/tests/compare_test.cc
@@ -0,0 +1,509 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memset
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/compare_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// All types.
+struct TestEquality {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v2 = Iota(d, 2);
+    const auto v2b = Iota(d, 2);
+    const auto v3 = Iota(d, 3);
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq(v2, v3));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq(v3, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2b));
+
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne(v2, v3));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne(v3, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ne(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ne(v2, v2b));
+  }
+};
+
+HWY_NOINLINE void TestAllEquality() {
+  ForAllTypes(ForPartialVectors<TestEquality>());
+}
+
+// a > b should be true, verify that for Gt/Lt and with swapped args.
+template <class D>
+void EnsureGreater(D d, TFromD<D> a, TFromD<D> b, const char* file, int line) {
+  const auto mask_false = MaskFalse(d);
+  const auto mask_true = MaskTrue(d);
+
+  const auto va = Set(d, a);
+  const auto vb = Set(d, b);
+  AssertMaskEqual(d, mask_true, Gt(va, vb), file, line);
+  AssertMaskEqual(d, mask_false, Lt(va, vb), file, line);
+
+  // Swapped order
+  AssertMaskEqual(d, mask_false, Gt(vb, va), file, line);
+  AssertMaskEqual(d, mask_true, Lt(vb, va), file, line);
+
+  // Also ensure irreflexive
+  AssertMaskEqual(d, mask_false, Gt(va, va), file, line);
+  AssertMaskEqual(d, mask_false, Gt(vb, vb), file, line);
+  AssertMaskEqual(d, mask_false, Lt(va, va), file, line);
+  AssertMaskEqual(d, mask_false, Lt(vb, vb), file, line);
+}
+
+#define HWY_ENSURE_GREATER(d, a, b) EnsureGreater(d, a, b, __FILE__, __LINE__)
+
+struct TestStrictUnsigned {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const T max = LimitsMax<T>();
+    const auto v0 = Zero(d);
+    const auto v2 = And(Iota(d, T(2)), Set(d, 255));  // 0..255
+
+    const auto mask_false = MaskFalse(d);
+
+    // Individual values of interest
+    HWY_ENSURE_GREATER(d, 2, 1);
+    HWY_ENSURE_GREATER(d, 1, 0);
+    HWY_ENSURE_GREATER(d, 128, 127);
+    HWY_ENSURE_GREATER(d, max, max / 2);
+    HWY_ENSURE_GREATER(d, max, 1);
+    HWY_ENSURE_GREATER(d, max, 0);
+
+    // Also use Iota to ensure lanes are independent
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
+  }
+};
+
+HWY_NOINLINE void TestAllStrictUnsigned() {
+  ForUnsignedTypes(ForPartialVectors<TestStrictUnsigned>());
+}
+
+struct TestStrictInt {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const T min = LimitsMin<T>();
+    const T max = LimitsMax<T>();
+    const auto v0 = Zero(d);
+    const auto v2 = And(Iota(d, T(2)), Set(d, 127));  // 0..127
+    const auto vn = Sub(Neg(v2), Set(d, 1));          // -1..-128
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    // Individual values of interest
+    HWY_ENSURE_GREATER(d, 2, 1);
+    HWY_ENSURE_GREATER(d, 1, 0);
+    HWY_ENSURE_GREATER(d, 0, -1);
+    HWY_ENSURE_GREATER(d, -1, -2);
+    HWY_ENSURE_GREATER(d, max, max / 2);
+    HWY_ENSURE_GREATER(d, max, 1);
+    HWY_ENSURE_GREATER(d, max, 0);
+    HWY_ENSURE_GREATER(d, max, -1);
+    HWY_ENSURE_GREATER(d, max, min);
+    HWY_ENSURE_GREATER(d, 0, min);
+    HWY_ENSURE_GREATER(d, min / 2, min);
+
+    // Also use Iota to ensure lanes are independent
+    HWY_ASSERT_MASK_EQ(d, mask_true, Gt(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt(vn, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, v2));
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(vn, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, vn));
+  }
+};
+
+// S-SSE3 bug (#795): same upper, differing MSB in lower
+struct TestStrictInt64 {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto m0 = MaskFalse(d);
+    const auto m1 = MaskTrue(d);
+    HWY_ASSERT_MASK_EQ(d, m0, Lt(Set(d, 0x380000000LL), Set(d, 0x300000001LL)));
+    HWY_ASSERT_MASK_EQ(d, m1, Lt(Set(d, 0xF00000000LL), Set(d, 0xF80000000LL)));
+    HWY_ASSERT_MASK_EQ(d, m1, Lt(Set(d, 0xF00000000LL), Set(d, 0xF80000001LL)));
+  }
+};
+
+HWY_NOINLINE void TestAllStrictInt() {
+  ForSignedTypes(ForPartialVectors<TestStrictInt>());
+  ForPartialVectors<TestStrictInt64>()(int64_t());
+}
+
+struct TestStrictFloat {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const T huge_neg = T(-1E35);
+    const T huge_pos = T(1E36);
+    const auto v0 = Zero(d);
+    const auto v2 = Iota(d, T(2));
+    const auto vn = Neg(v2);
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    // Individual values of interest
+    HWY_ENSURE_GREATER(d, 2, 1);
+    HWY_ENSURE_GREATER(d, 1, 0);
+    HWY_ENSURE_GREATER(d, 0, -1);
+    HWY_ENSURE_GREATER(d, -1, -2);
+    HWY_ENSURE_GREATER(d, huge_pos, 1);
+    HWY_ENSURE_GREATER(d, huge_pos, 0);
+    HWY_ENSURE_GREATER(d, huge_pos, -1);
+    HWY_ENSURE_GREATER(d, huge_pos, huge_neg);
+    HWY_ENSURE_GREATER(d, 0, huge_neg);
+
+    // Also use Iota to ensure lanes are independent
+    HWY_ASSERT_MASK_EQ(d, mask_true, Gt(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt(vn, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, v2));
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(vn, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, vn));
+  }
+};
+
+HWY_NOINLINE void TestAllStrictFloat() {
+  ForFloatTypes(ForPartialVectors<TestStrictFloat>());
+}
+
+struct TestWeakFloat {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v2 = Iota(d, T(2));
+    const auto vn = Iota(d, -T(Lanes(d)));
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ge(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Le(vn, vn));
+
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ge(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Le(vn, v2));
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Le(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ge(vn, v2));
+  }
+};
+
+HWY_NOINLINE void TestAllWeakFloat() {
+  ForFloatTypes(ForPartialVectors<TestWeakFloat>());
+}
+
+template <class D>
+static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
+  alignas(16) uint64_t in[2];
+  in[0] = lo;
+  in[1] = hi;
+  return LoadDup128(d, in);
+}
+
+struct TestLt128 {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using V = Vec<D>;
+    const V v00 = Zero(d);
+    const V v01 = Make128(d, 0, 1);
+    const V v10 = Make128(d, 1, 0);
+    const V v11 = Add(v01, v10);
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v00, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v10));
+
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v11));
+
+    // Reversed order
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v11, v01));
+
+    // Also check 128-bit blocks are independent
+    const V iota = Iota(d, 1);
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v01)));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v10)));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v01), iota));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v10), iota));
+
+    // Max value
+    const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v11));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v10, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v11, vm));
+  }
+};
+
+HWY_NOINLINE void TestAllLt128() { ForGEVectors<128, TestLt128>()(uint64_t()); }
+
+struct TestLt128Upper {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using V = Vec<D>;
+    const V v00 = Zero(d);
+    const V v01 = Make128(d, 0, 1);
+    const V v10 = Make128(d, 1, 0);
+    const V v11 = Add(v01, v10);
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v00, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v01, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v10, v10));
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v00, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, v11));
+
+    // Reversed order
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v01, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v10, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v11, v01));
+
+    // Also check 128-bit blocks are independent
+    const V iota = Iota(d, 1);
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, iota, Add(iota, v01)));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, iota, Add(iota, v10)));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, Add(iota, v01), iota));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, Add(iota, v10), iota));
+
+    // Max value
+    const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v11));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v00, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v10, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v11, vm));
+  }
+};
+
+HWY_NOINLINE void TestAllLt128Upper() {
+  ForGEVectors<128, TestLt128Upper>()(uint64_t());
+}
+
+struct TestEq128 {  // Also Ne128
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using V = Vec<D>;
+    const V v00 = Zero(d);
+    const V v01 = Make128(d, 0, 1);
+    const V v10 = Make128(d, 1, 0);
+    const V v11 = Add(v01, v10);
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v00, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v01, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v10, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v00, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v01, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v10, v10));
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v00, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v11));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v00, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v11));
+
+    // Reversed order
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v10, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v11, v01));
+
+    // Also check 128-bit blocks are independent
+    const V iota = Iota(d, 1);
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, iota, Add(iota, v01)));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, iota, Add(iota, v10)));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v01), iota));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v10), iota));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, iota, Add(iota, v01)));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, iota, Add(iota, v10)));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, Add(iota, v01), iota));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, Add(iota, v10), iota));
+
+    // Max value
+    const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
+    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, vm, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, vm, vm));
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v11));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v00, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, vm));
+
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v11));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v00, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v10, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v11, vm));
+  }
+};
+
+HWY_NOINLINE void TestAllEq128() { ForGEVectors<128, TestEq128>()(uint64_t()); }
+
+struct TestEq128Upper {  // Also Ne128Upper
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using V = Vec<D>;
+    const V v00 = Zero(d);
+    const V v01 = Make128(d, 0, 1);
+    const V v10 = Make128(d, 1, 0);
+    const V v11 = Add(v01, v10);
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v10, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v00, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v01, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v10, v10));
+
+    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v00, v01));
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v11));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, v11));
+
+    // Reversed order
+    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v01, v00));
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v10, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v11, v01));
+
+    // Also check 128-bit blocks are independent
+    const V iota = Iota(d, 1);
+    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, iota, Add(iota, v01)));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, iota, Add(iota, v01)));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, iota, Add(iota, v10)));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, iota, Add(iota, v10)));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, Add(iota, v01), iota));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, Add(iota, v01), iota));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, Add(iota, v10), iota));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, Add(iota, v10), iota));
+
+    // Max value
+    const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
+    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, vm, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, vm, vm));
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v11));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v00, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, vm));
+
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v11));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v00, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v10, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v11, vm));
+  }
+};
+
+HWY_NOINLINE void TestAllEq128Upper() {
+  ForGEVectors<128, TestEq128Upper>()(uint64_t());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyCompareTest);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEquality);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictUnsigned);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128Upper);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEq128);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEq128Upper);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/compress_test.cc b/third_party/highway/hwy/tests/compress_test.cc
new file mode 100644
index 0000000000..ae008b4dc4
--- /dev/null
+++ b/third_party/highway/hwy/tests/compress_test.cc
@@ -0,0 +1,833 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memset
+
+#include <array>  // IWYU pragma: keep
+
+#include "hwy/base.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/compress_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Regenerate tables used in the implementation, instead of testing.
+#define HWY_PRINT_TABLES 0
+
+#if !HWY_PRINT_TABLES || HWY_IDE
+
+template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>>
+void CheckStored(D d, DI di, const char* op, size_t expected_pos,
+                 size_t actual_pos, size_t num_to_check,
+                 const AlignedFreeUniquePtr<T[]>& in,
+                 const AlignedFreeUniquePtr<TI[]>& mask_lanes,
+                 const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
+                 int line) {
+  if (expected_pos != actual_pos) {
+    hwy::Abort(__FILE__, line,
+               "%s: size mismatch for %s: expected %d, actual %d\n", op,
+               TypeName(T(), Lanes(d)).c_str(), static_cast<int>(expected_pos),
+               static_cast<int>(actual_pos));
+  }
+  // Modified from AssertVecEqual - we may not be checking all lanes.
+  for (size_t i = 0; i < num_to_check; ++i) {
+    if (!IsEqual(expected[i], actual_u[i])) {
+      const size_t N = Lanes(d);
+      fprintf(stderr, "%s: mismatch at i=%d of %d, line %d:\n\n", op,
+              static_cast<int>(i), static_cast<int>(num_to_check), line);
+      Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
+      Print(d, "in", Load(d, in.get()), 0, N);
+      Print(d, "expect", Load(d, expected.get()), 0, num_to_check);
+      Print(d, "actual", Load(d, actual_u), 0, num_to_check);
+      HWY_ASSERT(false);
+    }
+  }
+}
+
+struct TestCompress {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    using TI = MakeSigned<T>;  // For mask > 0 comparison
+    using TU = MakeUnsigned<T>;
+    const Rebind<TI, D> di;
+    const size_t N = Lanes(d);
+
+    for (int frac : {0, 2, 3}) {
+      // For CompressStore
+      const size_t misalign = static_cast<size_t>(frac) * N / 4;
+
+      auto in_lanes = AllocateAligned<T>(N);
+      auto mask_lanes = AllocateAligned<TI>(N);
+      auto garbage = AllocateAligned<TU>(N);
+      auto expected = AllocateAligned<T>(N);
+      auto actual_a = AllocateAligned<T>(misalign + N);
+      T* actual_u = actual_a.get() + misalign;
+
+      const size_t bits_size = RoundUpTo((N + 7) / 8, 8);
+      auto bits = AllocateAligned<uint8_t>(bits_size);
+      memset(bits.get(), 0, bits_size);  // for MSAN
+
+      // Each lane should have a chance of having mask=true.
+      for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+        size_t expected_pos = 0;
+        for (size_t i = 0; i < N; ++i) {
+          const uint64_t r = Random32(&rng);
+          in_lanes[i] = T();  // cannot initialize float16_t directly.
+          CopyBytes<sizeof(T)>(&r, &in_lanes[i]);  // not same size
+          mask_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
+          if (mask_lanes[i] > 0) {
+            expected[expected_pos++] = in_lanes[i];
+          }
+          garbage[i] = static_cast<TU>(Random64(&rng));
+        }
+        size_t num_to_check;
+        if (CompressIsPartition<T>::value) {
+          // For non-native Compress, also check that mask=false lanes were
+          // moved to the back of the vector (highest indices).
+          size_t extra = expected_pos;
+          for (size_t i = 0; i < N; ++i) {
+            if (mask_lanes[i] == 0) {
+              expected[extra++] = in_lanes[i];
+            }
+          }
+          HWY_ASSERT(extra == N);
+          num_to_check = N;
+        } else {
+          // For native Compress, only the mask=true lanes are defined.
+          num_to_check = expected_pos;
+        }
+
+        const auto in = Load(d, in_lanes.get());
+        const auto mask =
+            RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di)));
+        StoreMaskBits(d, mask, bits.get());
+
+        // Compress
+        memset(actual_u, 0, N * sizeof(T));
+        StoreU(Compress(in, mask), d, actual_u);
+        CheckStored(d, di, "Compress", expected_pos, expected_pos, num_to_check,
+                    in_lanes, mask_lanes, expected, actual_u, __LINE__);
+
+        // CompressNot
+        memset(actual_u, 0, N * sizeof(T));
+        StoreU(CompressNot(in, Not(mask)), d, actual_u);
+        CheckStored(d, di, "CompressNot", expected_pos, expected_pos,
+                    num_to_check, in_lanes, mask_lanes, expected, actual_u,
+                    __LINE__);
+
+        // CompressStore
+        memset(actual_u, 0, N * sizeof(T));
+        const size_t size1 = CompressStore(in, mask, d, actual_u);
+        // expected_pos instead of num_to_check because this op is not
+        // affected by CompressIsPartition.
+        CheckStored(d, di, "CompressStore", expected_pos, size1, expected_pos,
+                    in_lanes, mask_lanes, expected, actual_u, __LINE__);
+
+        // CompressBlendedStore
+        memcpy(actual_u, garbage.get(), N * sizeof(T));
+        const size_t size2 = CompressBlendedStore(in, mask, d, actual_u);
+        // expected_pos instead of num_to_check because this op only writes
+        // the mask=true lanes.
+        CheckStored(d, di, "CompressBlendedStore", expected_pos, size2,
+                    expected_pos, in_lanes, mask_lanes, expected, actual_u,
+                    __LINE__);
+        // Subsequent lanes are untouched.
+        for (size_t i = size2; i < N; ++i) {
+#if HWY_COMPILER_MSVC && HWY_TARGET == HWY_AVX2
+          // TODO(eustas): re-enable when compiler is fixed
+#else
+          HWY_ASSERT_EQ(garbage[i], reinterpret_cast<TU*>(actual_u)[i]);
+#endif
+        }
+
+        // CompressBits
+        memset(actual_u, 0, N * sizeof(T));
+        StoreU(CompressBits(in, bits.get()), d, actual_u);
+        CheckStored(d, di, "CompressBits", expected_pos, expected_pos,
+                    num_to_check, in_lanes, mask_lanes, expected, actual_u,
+                    __LINE__);
+
+        // CompressBitsStore
+        memset(actual_u, 0, N * sizeof(T));
+        const size_t size3 = CompressBitsStore(in, bits.get(), d, actual_u);
+        // expected_pos instead of num_to_check because this op is not
+        // affected by CompressIsPartition.
+        CheckStored(d, di, "CompressBitsStore", expected_pos, size3,
+                    expected_pos, in_lanes, mask_lanes, expected, actual_u,
+                    __LINE__);
+      }  // rep
+    }    // frac
+  }      // operator()
+};
+
+HWY_NOINLINE void TestAllCompress() {
+  ForAllTypes(ForPartialVectors<TestCompress>());
+}
+
+struct TestCompressBlocks {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET == HWY_SCALAR
+    (void)d;
+#else
+    static_assert(sizeof(T) == 8 && !IsSigned<T>(), "Should be u64");
+    RandomState rng;
+
+    using TI = MakeSigned<T>;  // For mask > 0 comparison
+    const Rebind<TI, D> di;
+    const size_t N = Lanes(d);
+
+    auto in_lanes = AllocateAligned<T>(N);
+    auto mask_lanes = AllocateAligned<TI>(N);
+    auto expected = AllocateAligned<T>(N);
+    auto actual = AllocateAligned<T>(N);
+
+    // Each lane should have a chance of having mask=true.
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      size_t expected_pos = 0;
+      for (size_t i = 0; i < N; i += 2) {
+        const uint64_t bits = Random32(&rng);
+        in_lanes[i + 1] = in_lanes[i] = T();  // cannot set float16_t directly.
+        CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);      // not same size
+        CopyBytes<sizeof(T)>(&bits, &in_lanes[i + 1]);  // not same size
+        mask_lanes[i + 1] = mask_lanes[i] = TI{(Random32(&rng) & 8) ? 1 : 0};
+        if (mask_lanes[i] > 0) {
+          expected[expected_pos++] = in_lanes[i];
+          expected[expected_pos++] = in_lanes[i + 1];
+        }
+      }
+      size_t num_to_check;
+      if (CompressIsPartition<T>::value) {
+        // For non-native Compress, also check that mask=false lanes were
+        // moved to the back of the vector (highest indices).
+        size_t extra = expected_pos;
+        for (size_t i = 0; i < N; ++i) {
+          if (mask_lanes[i] == 0) {
+            expected[extra++] = in_lanes[i];
+          }
+        }
+        HWY_ASSERT(extra == N);
+        num_to_check = N;
+      } else {
+        // For native Compress, only the mask=true lanes are defined.
+        num_to_check = expected_pos;
+      }
+
+      const auto in = Load(d, in_lanes.get());
+      const auto mask = RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di)));
+
+      // CompressBlocksNot
+      memset(actual.get(), 0, N * sizeof(T));
+      StoreU(CompressBlocksNot(in, Not(mask)), d, actual.get());
+      CheckStored(d, di, "CompressBlocksNot", expected_pos, expected_pos,
+                  num_to_check, in_lanes, mask_lanes, expected, actual.get(),
+                  __LINE__);
+    }  // rep
+#endif  // HWY_TARGET == HWY_SCALAR
+  }     // operator()
+};
+
+HWY_NOINLINE void TestAllCompressBlocks() {
+  ForGE128Vectors<TestCompressBlocks>()(uint64_t());
+}
+
+#endif  // !HWY_PRINT_TABLES
+
+#if HWY_PRINT_TABLES || HWY_IDE
+namespace detail {  // for code folding
+
+void PrintCompress8x8Tables() {
+  printf("======================================= 8x8\n");
+  constexpr size_t N = 8;
+  for (uint64_t code = 0; code < (1ull << N); ++code) {
+    std::array<uint8_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    for (size_t i = 0; i < N; ++i) {
+      printf("%d,", indices[i]);
+    }
+    printf(code & 1 ? "//\n" : "/**/");
+  }
+  printf("\n");
+}
+
+void PrintCompress16x8Tables() {
+  printf("======================================= 16x8\n");
+  constexpr size_t N = 8;  // 128-bit SIMD
+  for (uint64_t code = 0; code < (1ull << N); ++code) {
+    std::array<uint8_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    // Doubled (for converting lane to byte indices)
+    for (size_t i = 0; i < N; ++i) {
+      printf("%d,", 2 * indices[i]);
+    }
+    printf(code & 1 ? "//\n" : "/**/");
+  }
+  printf("\n");
+}
+
+void PrintCompressNot16x8Tables() {
+  printf("======================================= Not 16x8\n");
+  constexpr size_t N = 8;  // 128-bit SIMD
+  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+    const uint64_t code = ~not_code;
+    std::array<uint8_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    // Doubled (for converting lane to byte indices)
+    for (size_t i = 0; i < N; ++i) {
+      printf("%d,", 2 * indices[i]);
+    }
+    printf(not_code & 1 ? "//\n" : "/**/");
+  }
+  printf("\n");
+}
+
+// Compressed to nibbles, unpacked via variable right shift. Also includes
+// FirstN bits in the nibble MSB.
+void PrintCompress32x8Tables() {
+  printf("======================================= 32/64x8\n");
+  constexpr size_t N = 8;  // AVX2 or 64-bit AVX3
+  for (uint64_t code = 0; code < (1ull << N); ++code) {
+    const size_t count = PopCount(code);
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    // Convert to nibbles
+    uint64_t packed = 0;
+    for (size_t i = 0; i < N; ++i) {
+      HWY_ASSERT(indices[i] < N);
+      if (i < count) {
+        indices[i] |= N;
+        HWY_ASSERT(indices[i] < 0x10);
+      }
+      packed += indices[i] << (i * 4);
+    }
+
+    HWY_ASSERT(packed < (1ull << (N * 4)));
+    printf("0x%08x,", static_cast<uint32_t>(packed));
+  }
+  printf("\n");
+}
+
+void PrintCompressNot32x8Tables() {
+  printf("======================================= Not 32/64x8\n");
+  constexpr size_t N = 8;  // AVX2 or 64-bit AVX3
+  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+    const uint64_t code = ~not_code;
+    const size_t count = PopCount(code);
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    // Convert to nibbles
+    uint64_t packed = 0;
+    for (size_t i = 0; i < N; ++i) {
+      HWY_ASSERT(indices[i] < N);
+      if (i < count) {
+        indices[i] |= N;
+        HWY_ASSERT(indices[i] < 0x10);
+      }
+      packed += indices[i] << (i * 4);
+    }
+
+    HWY_ASSERT(packed < (1ull << (N * 4)));
+    printf("0x%08x,", static_cast<uint32_t>(packed));
+  }
+  printf("\n");
+}
+
+// Compressed to nibbles (for AVX3 64x4)
+void PrintCompress64x4NibbleTables() {
+  printf("======================================= 64x4Nibble\n");
+  constexpr size_t N = 4;  // AVX2
+  for (uint64_t code = 0; code < (1ull << N); ++code) {
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    // Convert to nibbles
+    uint64_t packed = 0;
+    for (size_t i = 0; i < N; ++i) {
+      HWY_ASSERT(indices[i] < N);
+      packed += indices[i] << (i * 4);
+    }
+
+    HWY_ASSERT(packed < (1ull << (N * 4)));
+    printf("0x%08x,", static_cast<uint32_t>(packed));
+  }
+  printf("\n");
+}
+
+void PrintCompressNot64x4NibbleTables() {
+  printf("======================================= Not 64x4Nibble\n");
+  constexpr size_t N = 4;  // AVX2
+  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+    const uint64_t code = ~not_code;
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    // Convert to nibbles
+    uint64_t packed = 0;
+    for (size_t i = 0; i < N; ++i) {
+      HWY_ASSERT(indices[i] < N);
+      packed += indices[i] << (i * 4);
+    }
+
+    HWY_ASSERT(packed < (1ull << (N * 4)));
+    printf("0x%08x,", static_cast<uint32_t>(packed));
+  }
+  printf("\n");
+}
+
+void PrintCompressNot64x2NibbleTables() {
+  printf("======================================= Not 64x2Nibble\n");
+  constexpr size_t N = 2;  // 128-bit
+  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+    const uint64_t code = ~not_code;
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    // Convert to nibbles
+    uint64_t packed = 0;
+    for (size_t i = 0; i < N; ++i) {
+      HWY_ASSERT(indices[i] < N);
+      packed += indices[i] << (i * 4);
+    }
+
+    HWY_ASSERT(packed < (1ull << (N * 4)));
+    printf("0x%08x,", static_cast<uint32_t>(packed));
+  }
+  printf("\n");
+}
+
+void PrintCompress64x4Tables() {
+  printf("======================================= 64x4 uncompressed\n");
+  constexpr size_t N = 4;  // SVE_256
+  for (uint64_t code = 0; code < (1ull << N); ++code) {
+    std::array<size_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    // Store uncompressed indices because SVE TBL returns 0 if an index is out
+    // of bounds. On AVX3 we simply variable-shift because permute indices are
+    // interpreted modulo N. Compression is not worth the extra shift+AND
+    // because the table is anyway only 512 bytes.
+    for (size_t i = 0; i < N; ++i) {
+      printf("%d,", static_cast<int>(indices[i]));
+    }
+  }
+  printf("\n");
+}
+
+void PrintCompressNot64x4Tables() {
+  printf("======================================= Not 64x4 uncompressed\n");
+  constexpr size_t N = 4;  // SVE_256
+  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+    const uint64_t code = ~not_code;
+    std::array<size_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    // Store uncompressed indices because SVE TBL returns 0 if an index is out
+    // of bounds. On AVX3 we simply variable-shift because permute indices are
+    // interpreted modulo N. Compression is not worth the extra shift+AND
+    // because the table is anyway only 512 bytes.
+    for (size_t i = 0; i < N; ++i) {
+      printf("%d,", static_cast<int>(indices[i]));
+    }
+  }
+  printf("\n");
+}
+
+// Same as above, but prints pairs of u32 indices (for AVX2). Also includes
+// FirstN bits in the nibble MSB.
+void PrintCompress64x4PairTables() {
+  printf("======================================= 64x4 u32 index\n");
+  constexpr size_t N = 4;  // AVX2
+  for (uint64_t code = 0; code < (1ull << N); ++code) {
+    const size_t count = PopCount(code);
+    std::array<size_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    // Store uncompressed indices because SVE TBL returns 0 if an index is out
+    // of bounds. On AVX3 we simply variable-shift because permute indices are
+    // interpreted modulo N. Compression is not worth the extra shift+AND
+    // because the table is anyway only 512 bytes.
+    for (size_t i = 0; i < N; ++i) {
+      const int first_n_bit = i < count ? 8 : 0;
+      const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
+      HWY_ASSERT(low < 0x10);
+      printf("%d, %d, ", low, low + 1);
+    }
+  }
+  printf("\n");
+}
+
+void PrintCompressNot64x4PairTables() {
+  printf("======================================= Not 64x4 u32 index\n");
+  constexpr size_t N = 4;  // AVX2
+  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+    const uint64_t code = ~not_code;
+    const size_t count = PopCount(code);
+    std::array<size_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    // Store uncompressed indices because SVE TBL returns 0 if an index is out
+    // of bounds. On AVX3 we simply variable-shift because permute indices are
+    // interpreted modulo N. Compression is not worth the extra shift+AND
+    // because the table is anyway only 512 bytes.
+    for (size_t i = 0; i < N; ++i) {
+      const int first_n_bit = i < count ? 8 : 0;
+      const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
+      HWY_ASSERT(low < 0x10);
+      printf("%d, %d, ", low, low + 1);
+    }
+  }
+  printf("\n");
+}
+
+// 4-tuple of byte indices
+void PrintCompress32x4Tables() {
+  printf("======================================= 32x4\n");
+  using T = uint32_t;
+  constexpr size_t N = 4;  // SSE4
+  for (uint64_t code = 0; code < (1ull << N); ++code) {
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    for (size_t i = 0; i < N; ++i) {
+      for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+        printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+      }
+    }
+  }
+  printf("\n");
+}
+
+void PrintCompressNot32x4Tables() {
+  printf("======================================= Not 32x4\n");
+  using T = uint32_t;
+  constexpr size_t N = 4;  // SSE4
+  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+    const uint64_t code = ~not_code;
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    for (size_t i = 0; i < N; ++i) {
+      for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+        printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+      }
+    }
+  }
+  printf("\n");
+}
+
+// 8-tuple of byte indices
+void PrintCompress64x2Tables() {
+  printf("======================================= 64x2\n");
+  using T = uint64_t;
+  constexpr size_t N = 2;  // SSE4
+  for (uint64_t code = 0; code < (1ull << N); ++code) {
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    for (size_t i = 0; i < N; ++i) {
+      for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+        printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+      }
+    }
+  }
+  printf("\n");
+}
+
+void PrintCompressNot64x2Tables() {
+  printf("======================================= Not 64x2\n");
+  using T = uint64_t;
+  constexpr size_t N = 2;  // SSE4
+  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+    const uint64_t code = ~not_code;
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    for (size_t i = 0; i < N; ++i) {
+      for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+        printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+      }
+    }
+  }
+  printf("\n");
+}
+
+}  // namespace detail
+
+HWY_NOINLINE void PrintTables() {
+  // Only print once.
+#if HWY_TARGET == HWY_STATIC_TARGET
+  detail::PrintCompress32x8Tables();
+  detail::PrintCompressNot32x8Tables();
+  detail::PrintCompress64x4NibbleTables();
+  detail::PrintCompressNot64x4NibbleTables();
+  detail::PrintCompressNot64x2NibbleTables();
+  detail::PrintCompress64x4Tables();
+  detail::PrintCompressNot64x4Tables();
+  detail::PrintCompress32x4Tables();
+  detail::PrintCompressNot32x4Tables();
+  detail::PrintCompress64x2Tables();
+  detail::PrintCompressNot64x2Tables();
+  detail::PrintCompress64x4PairTables();
+  detail::PrintCompressNot64x4PairTables();
+  detail::PrintCompress16x8Tables();
+  detail::PrintCompress8x8Tables();
+  detail::PrintCompressNot16x8Tables();
+#endif
+}
+
+#endif  // HWY_PRINT_TABLES
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyCompressTest);
+#if HWY_PRINT_TABLES
+// Only print instead of running tests; this will be visible in the log.
+HWY_EXPORT_AND_TEST_P(HwyCompressTest, PrintTables);
+#else
+HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompress);
+HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompressBlocks);
+#endif
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/convert_test.cc b/third_party/highway/hwy/tests/convert_test.cc
new file mode 100644
index 0000000000..a7aea5fe9e
--- /dev/null
+++ b/third_party/highway/hwy/tests/convert_test.cc
@@ -0,0 +1,643 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <cmath>  // std::isfinite
+
+#include "hwy/base.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/convert_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Cast and ensure bytes are the same. Called directly from TestAllBitCast or
+// via TestBitCastFrom.
+template <typename ToT>
+struct TestBitCast {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const Repartition<ToT, D> dto;
+    const size_t N = Lanes(d);
+    const size_t Nto = Lanes(dto);
+    if (N == 0 || Nto == 0) return;
+    HWY_ASSERT_EQ(N * sizeof(T), Nto * sizeof(ToT));
+    const auto vf = Iota(d, 1);
+    const auto vt = BitCast(dto, vf);
+    // Must return the same bits
+    auto from_lanes = AllocateAligned<T>(Lanes(d));
+    auto to_lanes = AllocateAligned<ToT>(Lanes(dto));
+    Store(vf, d, from_lanes.get());
+    Store(vt, dto, to_lanes.get());
+    HWY_ASSERT(
+        BytesEqual(from_lanes.get(), to_lanes.get(), Lanes(d) * sizeof(T)));
+  }
+};
+
+// From D to all types.
+struct TestBitCastFrom {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+    TestBitCast<uint8_t>()(t, d);
+    TestBitCast<uint16_t>()(t, d);
+    TestBitCast<uint32_t>()(t, d);
+#if HWY_HAVE_INTEGER64
+    TestBitCast<uint64_t>()(t, d);
+#endif
+    TestBitCast<int8_t>()(t, d);
+    TestBitCast<int16_t>()(t, d);
+    TestBitCast<int32_t>()(t, d);
+#if HWY_HAVE_INTEGER64
+    TestBitCast<int64_t>()(t, d);
+#endif
+    TestBitCast<float>()(t, d);
+#if HWY_HAVE_FLOAT64
+    TestBitCast<double>()(t, d);
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllBitCast() {
+  // For HWY_SCALAR and partial vectors, we can only cast to same-sized types:
+  // the former can't partition its single lane, and the latter can be smaller
+  // than a destination type.
+  const ForPartialVectors<TestBitCast<uint8_t>> to_u8;
+  to_u8(uint8_t());
+  to_u8(int8_t());
+
+  const ForPartialVectors<TestBitCast<int8_t>> to_i8;
+  to_i8(uint8_t());
+  to_i8(int8_t());
+
+  const ForPartialVectors<TestBitCast<uint16_t>> to_u16;
+  to_u16(uint16_t());
+  to_u16(int16_t());
+
+  const ForPartialVectors<TestBitCast<int16_t>> to_i16;
+  to_i16(uint16_t());
+  to_i16(int16_t());
+
+  const ForPartialVectors<TestBitCast<uint32_t>> to_u32;
+  to_u32(uint32_t());
+  to_u32(int32_t());
+  to_u32(float());
+
+  const ForPartialVectors<TestBitCast<int32_t>> to_i32;
+  to_i32(uint32_t());
+  to_i32(int32_t());
+  to_i32(float());
+
+#if HWY_HAVE_INTEGER64
+  const ForPartialVectors<TestBitCast<uint64_t>> to_u64;
+  to_u64(uint64_t());
+  to_u64(int64_t());
+#if HWY_HAVE_FLOAT64
+  to_u64(double());
+#endif
+
+  const ForPartialVectors<TestBitCast<int64_t>> to_i64;
+  to_i64(uint64_t());
+  to_i64(int64_t());
+#if HWY_HAVE_FLOAT64
+  to_i64(double());
+#endif
+#endif  // HWY_HAVE_INTEGER64
+
+  const ForPartialVectors<TestBitCast<float>> to_float;
+  to_float(uint32_t());
+  to_float(int32_t());
+  to_float(float());
+
+#if HWY_HAVE_FLOAT64
+  const ForPartialVectors<TestBitCast<double>> to_double;
+  to_double(double());
+#if HWY_HAVE_INTEGER64
+  to_double(uint64_t());
+  to_double(int64_t());
+#endif  // HWY_HAVE_INTEGER64
+#endif  // HWY_HAVE_FLOAT64
+
+#if HWY_TARGET != HWY_SCALAR
+  // For non-scalar vectors, we can cast all types to all.
+  ForAllTypes(ForGEVectors<64, TestBitCastFrom>());
+#endif
+}
+
+template <typename ToT>
+struct TestPromoteTo {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
+    static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower");
+    const Rebind<ToT, D> to_d;
+
+    const size_t N = Lanes(from_d);
+    auto from = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<ToT>(N);
+
+    RandomState rng;
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        const uint64_t bits = rng();
+        CopyBytes<sizeof(T)>(&bits, &from[i]);  // not same size
+        expected[i] = from[i];
+      }
+
+      HWY_ASSERT_VEC_EQ(to_d, expected.get(),
+                        PromoteTo(to_d, Load(from_d, from.get())));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllPromoteTo() {
+  const ForPromoteVectors<TestPromoteTo<uint16_t>, 1> to_u16div2;
+  to_u16div2(uint8_t());
+
+  const ForPromoteVectors<TestPromoteTo<uint32_t>, 2> to_u32div4;
+  to_u32div4(uint8_t());
+
+  const ForPromoteVectors<TestPromoteTo<uint32_t>, 1> to_u32div2;
+  to_u32div2(uint16_t());
+
+  const ForPromoteVectors<TestPromoteTo<int16_t>, 1> to_i16div2;
+  to_i16div2(uint8_t());
+  to_i16div2(int8_t());
+
+  const ForPromoteVectors<TestPromoteTo<int32_t>, 1> to_i32div2;
+  to_i32div2(uint16_t());
+  to_i32div2(int16_t());
+
+  const ForPromoteVectors<TestPromoteTo<int32_t>, 2> to_i32div4;
+  to_i32div4(uint8_t());
+  to_i32div4(int8_t());
+
+  // Must test f16/bf16 separately because we can only load/store/convert them.
+
+#if HWY_HAVE_INTEGER64
+  const ForPromoteVectors<TestPromoteTo<uint64_t>, 1> to_u64div2;
+  to_u64div2(uint32_t());
+
+  const ForPromoteVectors<TestPromoteTo<int64_t>, 1> to_i64div2;
+  to_i64div2(int32_t());
+#endif
+
+#if HWY_HAVE_FLOAT64
+  const ForPromoteVectors<TestPromoteTo<double>, 1> to_f64div2;
+  to_f64div2(int32_t());
+  to_f64div2(float());
+#endif
+}
+
+template <typename T, HWY_IF_FLOAT(T)>
+bool IsFinite(T t) {
+  return std::isfinite(t);
+}
+// Wrapper avoids calling std::isfinite for integer types (ambiguous).
+template <typename T, HWY_IF_NOT_FLOAT(T)>
+bool IsFinite(T /*unused*/) {
+  return true;
+}
+
+template <class D>
+AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
+  const float test_cases[] = {
+      // +/- 1
+      1.0f, -1.0f,
+      // +/- 0
+      0.0f, -0.0f,
+      // near 0
+      0.25f, -0.25f,
+      // +/- integer
+      4.0f, -32.0f,
+      // positive near limit
+      65472.0f, 65504.0f,
+      // negative near limit
+      -65472.0f, -65504.0f,
+      // positive +/- delta
+      2.00390625f, 3.99609375f,
+      // negative +/- delta
+      -2.00390625f, -3.99609375f,
+      // No infinity/NaN - implementation-defined due to ARM.
+  };
+  constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
+  const size_t N = Lanes(d);
+  HWY_ASSERT(N != 0);
+  padded = RoundUpTo(kNumTestCases, N);  // allow loading whole vectors
+  auto in = AllocateAligned<float>(padded);
+  auto expected = AllocateAligned<float>(padded);
+  size_t i = 0;
+  for (; i < kNumTestCases; ++i) {
+    in[i] = test_cases[i];
+  }
+  for (; i < padded; ++i) {
+    in[i] = 0.0f;
+  }
+  return in;
+}
+
+struct TestF16 {
+  template <typename TF32, class DF32>
+  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
+#if HWY_HAVE_FLOAT16
+    size_t padded;
+    const size_t N = Lanes(d32);  // same count for f16
+    HWY_ASSERT(N != 0);
+    auto in = F16TestCases(d32, padded);
+    using TF16 = float16_t;
+    const Rebind<TF16, DF32> d16;
+    auto temp16 = AllocateAligned<TF16>(N);
+
+    for (size_t i = 0; i < padded; i += N) {
+      const auto loaded = Load(d32, &in[i]);
+      Store(DemoteTo(d16, loaded), d16, temp16.get());
+      HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, Load(d16, temp16.get())));
+    }
+#else
+    (void)d32;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllF16() { ForDemoteVectors<TestF16>()(float()); }
+
+template <class D>
+AlignedFreeUniquePtr<float[]> BF16TestCases(D d, size_t& padded) {
+  const float test_cases[] = {
+      // +/- 1
+      1.0f, -1.0f,
+      // +/- 0
+      0.0f, -0.0f,
+      // near 0
+      0.25f, -0.25f,
+      // +/- integer
+      4.0f, -32.0f,
+      // positive near limit
+      3.389531389251535E38f, 1.99384199368e+38f,
+      // negative near limit
+      -3.389531389251535E38f, -1.99384199368e+38f,
+      // positive +/- delta
+      2.015625f, 3.984375f,
+      // negative +/- delta
+      -2.015625f, -3.984375f,
+  };
+  constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
+  const size_t N = Lanes(d);
+  HWY_ASSERT(N != 0);
+  padded = RoundUpTo(kNumTestCases, N);  // allow loading whole vectors
+  auto in = AllocateAligned<float>(padded);
+  auto expected = AllocateAligned<float>(padded);
+  size_t i = 0;
+  for (; i < kNumTestCases; ++i) {
+    in[i] = test_cases[i];
+  }
+  for (; i < padded; ++i) {
+    in[i] = 0.0f;
+  }
+  return in;
+}
+
+struct TestBF16 {
+  template <typename TF32, class DF32>
+  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
+#if !defined(HWY_EMULATE_SVE)
+    size_t padded;
+    auto in = BF16TestCases(d32, padded);
+    using TBF16 = bfloat16_t;
+#if HWY_TARGET == HWY_SCALAR
+    const Rebind<TBF16, DF32> dbf16;  // avoid 4/2 = 2 lanes
+#else
+    const Repartition<TBF16, DF32> dbf16;
+#endif
+    const Half<decltype(dbf16)> dbf16_half;
+    const size_t N = Lanes(d32);
+    HWY_ASSERT(Lanes(dbf16_half) <= N);
+    auto temp16 = AllocateAligned<TBF16>(N);
+
+    for (size_t i = 0; i < padded; i += N) {
+      const auto loaded = Load(d32, &in[i]);
+      const auto v16 = DemoteTo(dbf16_half, loaded);
+      Store(v16, dbf16_half, temp16.get());
+      const auto v16_loaded = Load(dbf16_half, temp16.get());
+      HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, v16_loaded));
+    }
+#else
+    (void)d32;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllBF16() { ForShrinkableVectors<TestBF16>()(float()); }
+
+struct TestConvertU8 {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, const D du32) {
+    const Rebind<uint8_t, D> du8;
+    const auto wrap = Set(du32, 0xFF);
+    HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0), U8FromU32(And(Iota(du32, 0), wrap)));
+    HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0x7F),
+                      U8FromU32(And(Iota(du32, 0x7F), wrap)));
+  }
+};
+
+HWY_NOINLINE void TestAllConvertU8() {
+  ForDemoteVectors<TestConvertU8, 2>()(uint32_t());
+}
+
+template <typename From, typename To, class D>
+constexpr bool IsSupportedTruncation() {
+  return (sizeof(To) < sizeof(From)) &&
+         (Pow2(Rebind<To, D>()) + 3 >= static_cast<int>(CeilLog2(sizeof(To))));
+}
+
+struct TestTruncateTo {
+  template <typename From, typename To, class D,
+            hwy::EnableIf<!IsSupportedTruncation<From, To, D>()>* = nullptr>
+  HWY_NOINLINE void testTo(From, To, const D) {
+    // do nothing
+  }
+
+  template <typename From, typename To, class D,
+            hwy::EnableIf<IsSupportedTruncation<From, To, D>()>* = nullptr>
+  HWY_NOINLINE void testTo(From, To, const D d) {
+    constexpr uint32_t base = 0xFA578D00;
+    const Rebind<To, D> dTo;
+    const auto src = Iota(d, static_cast<From>(base));
+    const auto expected = Iota(dTo, static_cast<To>(base));
+    const VFromD<decltype(dTo)> actual = TruncateTo(dTo, src);
+    HWY_ASSERT_VEC_EQ(dTo, expected, actual);
+  }
+
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T from, const D d) {
+    testTo<T, uint8_t, D>(from, uint8_t(), d);
+    testTo<T, uint16_t, D>(from, uint16_t(), d);
+    testTo<T, uint32_t, D>(from, uint32_t(), d);
+  }
+};
+
+HWY_NOINLINE void TestAllTruncate() {
+  ForUnsignedTypes(ForPartialVectors<TestTruncateTo>());
+}
+
+// Separate function to attempt to work around a compiler bug on ARM: when this
+// is merged with TestIntFromFloat, outputs match a previous Iota(-(N+1)) input.
+struct TestIntFromFloatHuge {
+  template <typename TF, class DF>
+  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
+    // The ARMv7 manual says that float->int saturates, i.e. chooses the
+    // nearest representable value. This works correctly on armhf with GCC, but
+    // not with clang. For reasons unknown, MSVC also runs into an out-of-memory
+    // error here.
+#if HWY_COMPILER_CLANG || HWY_COMPILER_MSVC
+    (void)df;
+#else
+    using TI = MakeSigned<TF>;
+    const Rebind<TI, DF> di;
+
+    // Workaround for incorrect 32-bit GCC codegen for SSSE3 - Print-ing
+    // the expected lvalue also seems to prevent the issue.
+    const size_t N = Lanes(df);
+    auto expected = AllocateAligned<TI>(N);
+
+    // Huge positive
+    Store(Set(di, LimitsMax<TI>()), di, expected.get());
+    HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(1E20))));
+
+    // Huge negative
+    Store(Set(di, LimitsMin<TI>()), di, expected.get());
+    HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(-1E20))));
+#endif
+  }
+};
+
+class TestIntFromFloat {
+  template <typename TF, class DF>
+  static HWY_NOINLINE void TestPowers(TF /*unused*/, const DF df) {
+    using TI = MakeSigned<TF>;
+    const Rebind<TI, DF> di;
+    constexpr size_t kBits = sizeof(TF) * 8;
+
+    // Powers of two, plus offsets to set some mantissa bits.
+    const int64_t ofs_table[3] = {0LL, 3LL << (kBits / 2), 1LL << (kBits - 15)};
+    for (int sign = 0; sign < 2; ++sign) {
+      for (size_t shift = 0; shift < kBits - 1; ++shift) {
+        for (int64_t ofs : ofs_table) {
+          const int64_t mag = (int64_t{1} << shift) + ofs;
+          const int64_t val = sign ? mag : -mag;
+          HWY_ASSERT_VEC_EQ(di, Set(di, static_cast<TI>(val)),
+                            ConvertTo(di, Set(df, static_cast<TF>(val))));
+        }
+      }
+    }
+  }
+
+  template <typename TF, class DF>
+  static HWY_NOINLINE void TestRandom(TF /*unused*/, const DF df) {
+    using TI = MakeSigned<TF>;
+    const Rebind<TI, DF> di;
+    const size_t N = Lanes(df);
+
+    // TF does not have enough precision to represent TI.
+    const double min = static_cast<double>(LimitsMin<TI>());
+    const double max = static_cast<double>(LimitsMax<TI>());
+
+    // Also check random values.
+    auto from = AllocateAligned<TF>(N);
+    auto expected = AllocateAligned<TI>(N);
+    RandomState rng;
+    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        do {
+          const uint64_t bits = rng();
+          CopyBytes<sizeof(TF)>(&bits, &from[i]);  // not same size
+        } while (!std::isfinite(from[i]));
+        if (from[i] >= max) {
+          expected[i] = LimitsMax<TI>();
+        } else if (from[i] <= min) {
+          expected[i] = LimitsMin<TI>();
+        } else {
+          expected[i] = static_cast<TI>(from[i]);
+        }
+      }
+
+      HWY_ASSERT_VEC_EQ(di, expected.get(),
+                        ConvertTo(di, Load(df, from.get())));
+    }
+  }
+
+ public:
+  template <typename TF, class DF>
+  HWY_NOINLINE void operator()(TF tf, const DF df) {
+    using TI = MakeSigned<TF>;
+    const Rebind<TI, DF> di;
+    const size_t N = Lanes(df);
+
+    // Integer positive
+    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), ConvertTo(di, Iota(df, TF(4.0))));
+
+    // Integer negative
+    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), ConvertTo(di, Iota(df, -TF(N))));
+
+    // Above positive
+    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), ConvertTo(di, Iota(df, TF(2.001))));
+
+    // Below positive
+    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), ConvertTo(di, Iota(df, TF(3.9999))));
+
+    const TF eps = static_cast<TF>(0.0001);
+    // Above negative
+    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
+                      ConvertTo(di, Iota(df, -TF(N + 1) + eps)));
+
+    // Below negative
+    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
+                      ConvertTo(di, Iota(df, -TF(N + 1) - eps)));
+
+    TestPowers(tf, df);
+    TestRandom(tf, df);
+  }
+};
+
+HWY_NOINLINE void TestAllIntFromFloat() {
+  ForFloatTypes(ForPartialVectors<TestIntFromFloatHuge>());
+  ForFloatTypes(ForPartialVectors<TestIntFromFloat>());
+}
+
+struct TestFloatFromInt {
+  template <typename TF, class DF>
+  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
+    using TI = MakeSigned<TF>;
+    const RebindToSigned<DF> di;
+    const size_t N = Lanes(df);
+
+    // Integer positive
+    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), ConvertTo(df, Iota(di, TI(4))));
+
+    // Integer negative
+    HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), ConvertTo(df, Iota(di, -TI(N))));
+
+    // Max positive
+    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TI>())),
+                      ConvertTo(df, Set(di, LimitsMax<TI>())));
+
+    // Min negative
+    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
+                      ConvertTo(df, Set(di, LimitsMin<TI>())));
+  }
+};
+
+HWY_NOINLINE void TestAllFloatFromInt() {
+  ForFloatTypes(ForPartialVectors<TestFloatFromInt>());
+}
+
+struct TestFloatFromUint {
+  template <typename TF, class DF>
+  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
+    using TU = MakeUnsigned<TF>;
+    const RebindToUnsigned<DF> du;
+
+    // Integer positive
+    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), ConvertTo(df, Iota(du, TU(4))));
+    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(65535.0)),
+                      ConvertTo(df, Iota(du, 65535)));  // 2^16-1
+    if (sizeof(TF) > 4) {
+      HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4294967295.0)),
+                        ConvertTo(df, Iota(du, 4294967295ULL)));  // 2^32-1
+    }
+
+    // Max positive
+    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TU>())),
+                      ConvertTo(df, Set(du, LimitsMax<TU>())));
+
+    // Zero
+    HWY_ASSERT_VEC_EQ(df, Zero(df), ConvertTo(df, Zero(du)));
+  }
+};
+
+HWY_NOINLINE void TestAllFloatFromUint() {
+  ForFloatTypes(ForPartialVectors<TestFloatFromUint>());
+}
+
+struct TestI32F64 {
+  template <typename TF, class DF>
+  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
+    using TI = int32_t;
+    const Rebind<TI, DF> di;
+    const size_t N = Lanes(df);
+
+    // Integer positive
+    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));
+
+    // Integer negative
+    HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), PromoteTo(df, Iota(di, -TI(N))));
+
+    // Above positive
+    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(2.0)), PromoteTo(df, Iota(di, TI(2))));
+
+    // Below positive
+    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));
+
+    // Above negative
+    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-4.0)), PromoteTo(df, Iota(di, TI(-4))));
+
+    // Below negative
+    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-2.0)), PromoteTo(df, Iota(di, TI(-2))));
+
+    // Max positive int
+    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TI>())),
+                      PromoteTo(df, Set(di, LimitsMax<TI>())));
+
+    // Min negative int
+    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
+                      PromoteTo(df, Set(di, LimitsMin<TI>())));
+  }
+};
+
+HWY_NOINLINE void TestAllI32F64() {
+#if HWY_HAVE_FLOAT64
+  ForDemoteVectors<TestI32F64>()(double());
+#endif
+}
+
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyConvertTest);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBF16);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllTruncate);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromUint);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/crypto_test.cc b/third_party/highway/hwy/tests/crypto_test.cc
new file mode 100644
index 0000000000..b7dfb198a3
--- /dev/null
+++ b/third_party/highway/hwy/tests/crypto_test.cc
@@ -0,0 +1,553 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memcpy
+
+#include "hwy/aligned_allocator.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/crypto_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+#define HWY_PRINT_CLMUL_GOLDEN 0
+
+#if HWY_TARGET != HWY_SCALAR
+
+class TestAES {
+  template <typename T, class D>
+  HWY_NOINLINE void TestSBox(T /*unused*/, D d) {
+    // The generic implementation of the S-box is difficult to verify by
+    // inspection, so we add a white-box test that verifies it using enumeration
+    // (outputs for 0..255 vs. https://en.wikipedia.org/wiki/Rijndael_S-box).
+    const uint8_t sbox[256] = {
+        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b,
+        0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
+        0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26,
+        0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+        0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
+        0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
+        0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed,
+        0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+        0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f,
+        0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
+        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec,
+        0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+        0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14,
+        0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
+        0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
+        0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+        0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f,
+        0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
+        0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11,
+        0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f,
+        0xb0, 0x54, 0xbb, 0x16};
+
+    // Ensure it's safe to load an entire vector by padding.
+    const size_t N = Lanes(d);
+    const size_t padded = RoundUpTo(256, N);
+    auto expected = AllocateAligned<T>(padded);
+    // Must wrap around to match the input (Iota).
+    for (size_t pos = 0; pos < padded;) {
+      const size_t remaining = HWY_MIN(padded - pos, size_t(256));
+      memcpy(expected.get() + pos, sbox, remaining);
+      pos += remaining;
+    }
+
+    for (size_t i = 0; i < 256; i += N) {
+      const auto in = Iota(d, static_cast<T>(i));
+      HWY_ASSERT_VEC_EQ(d, expected.get() + i, detail::SubBytes(in));
+    }
+  }
+
+ public:
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+    // Test vector (after first KeyAddition) from
+    // https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/AES_Core128.pdf
+    alignas(16) constexpr uint8_t test_lanes[16] = {
+        0x40, 0xBF, 0xAB, 0xF4, 0x06, 0xEE, 0x4D, 0x30,
+        0x42, 0xCA, 0x6B, 0x99, 0x7A, 0x5C, 0x58, 0x16};
+    const auto test = LoadDup128(d, test_lanes);
+
+    // = ShiftRow result
+    alignas(16) constexpr uint8_t expected_sr_lanes[16] = {
+        0x09, 0x28, 0x7F, 0x47, 0x6F, 0x74, 0x6A, 0xBF,
+        0x2C, 0x4A, 0x62, 0x04, 0xDA, 0x08, 0xE3, 0xEE};
+    const auto expected_sr = LoadDup128(d, expected_sr_lanes);
+
+    // = MixColumn result
+    alignas(16) constexpr uint8_t expected_mc_lanes[16] = {
+        0x52, 0x9F, 0x16, 0xC2, 0x97, 0x86, 0x15, 0xCA,
+        0xE0, 0x1A, 0xAE, 0x54, 0xBA, 0x1A, 0x26, 0x59};
+    const auto expected_mc = LoadDup128(d, expected_mc_lanes);
+
+    // = KeyAddition result
+    alignas(16) constexpr uint8_t expected_lanes[16] = {
+        0xF2, 0x65, 0xE8, 0xD5, 0x1F, 0xD2, 0x39, 0x7B,
+        0xC3, 0xB9, 0x97, 0x6D, 0x90, 0x76, 0x50, 0x5C};
+    const auto expected = LoadDup128(d, expected_lanes);
+
+    alignas(16) uint8_t key_lanes[16];
+    for (size_t i = 0; i < 16; ++i) {
+      key_lanes[i] = expected_mc_lanes[i] ^ expected_lanes[i];
+    }
+    const auto round_key = LoadDup128(d, key_lanes);
+
+    HWY_ASSERT_VEC_EQ(d, expected_mc, AESRound(test, Zero(d)));
+    HWY_ASSERT_VEC_EQ(d, expected, AESRound(test, round_key));
+    HWY_ASSERT_VEC_EQ(d, expected_sr, AESLastRound(test, Zero(d)));
+    HWY_ASSERT_VEC_EQ(d, Xor(expected_sr, round_key),
+                      AESLastRound(test, round_key));
+
+    TestSBox(t, d);
+  }
+};
+HWY_NOINLINE void TestAllAES() { ForGEVectors<128, TestAES>()(uint8_t()); }
+
+#else
+HWY_NOINLINE void TestAllAES() {}
+#endif  // HWY_TARGET != HWY_SCALAR
+
+struct TestCLMul {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // needs 64 bit lanes and 128-bit result
+#if HWY_TARGET != HWY_SCALAR && HWY_HAVE_INTEGER64
+    const size_t N = Lanes(d);
+    if (N == 1) return;
+
+    auto in1 = AllocateAligned<T>(N);
+    auto in2 = AllocateAligned<T>(N);
+
+    constexpr size_t kCLMulNum = 512;
+    // Depends on rng!
+    static constexpr uint64_t kCLMulLower[kCLMulNum] = {
+        0x24511d4ce34d6350ULL, 0x4ca582edde1236bbULL, 0x537e58f72dac25a8ULL,
+        0x4e942d5e130b9225ULL, 0x75a906c519257a68ULL, 0x1df9f85126d96c5eULL,
+        0x464e7c13f4ad286aULL, 0x138535ee35dabc40ULL, 0xb2f7477b892664ecULL,
+        0x01557b077167c25dULL, 0xf32682490ee49624ULL, 0x0025bac603b9e140ULL,
+        0xcaa86aca3e3daf40ULL, 0x1fbcfe4af73eb6c4ULL, 0x8ee8064dd0aae5dcULL,
+        0x1248cb547858c213ULL, 0x37a55ee5b10fb34cULL, 0x6eb5c97b958f86e2ULL,
+        0x4b1ab3eb655ea7cdULL, 0x1d66645a85627520ULL, 0xf8728e96daa36748ULL,
+        0x38621043e6ff5e3bULL, 0xd1d28b5da5ffefb4ULL, 0x0a5cd65931546df7ULL,
+        0x2a0639be3d844150ULL, 0x0e2d0f18c8d6f045ULL, 0xfacc770b963326c1ULL,
+        0x19611b31ca2ef141ULL, 0xabea29510dd87518ULL, 0x18a7dc4b205f2768ULL,
+        0x9d3975ea5612dc86ULL, 0x06319c139e374773ULL, 0x6641710400b4c390ULL,
+        0x356c29b6001c3670ULL, 0xe9e04d851e040a00ULL, 0x21febe561222d79aULL,
+        0xc071eaae6e148090ULL, 0x0eed351a0af94f5bULL, 0x04324eedb3c03688ULL,
+        0x39e89b136e0d6ccdULL, 0x07d0fd2777a31600ULL, 0x44b8573827209822ULL,
+        0x6d690229ea177d78ULL, 0x1b9749d960ba9f18ULL, 0x190945271c0fbb94ULL,
+        0x189aea0e07d2c88eULL, 0xf18eab6b65a6beb2ULL, 0x57744b21c13d0d84ULL,
+        0xf63050a613e95c2eULL, 0x12cd20d25f97102fULL, 0x5a5df0678dbcba60ULL,
+        0x0b08fb80948bfafcULL, 0x44cf1cbe7c6fc3c8ULL, 0x166a470ef25da288ULL,
+        0x2c498a609204e48cULL, 0x261b0a22585697ecULL, 0x737750574af7dde4ULL,
+        0x4079959c60b01e0cULL, 0x06ed8aac13f782d6ULL, 0x019d454ba9b5ef20ULL,
+        0xea1edbf96d49e858ULL, 0x17c2f3ebde9ac469ULL, 0x5cf72706e3d6f5e4ULL,
+        0x16e856aa3c841516ULL, 0x256f7e3cef83368eULL, 0x47e17c8eb2774e77ULL,
+        0x9b48ac150a804821ULL, 0x584523f61ccfdf22ULL, 0xedcb6a2a75d9e7f2ULL,
+        0x1fe3d1838e537aa7ULL, 0x778872e9f64549caULL, 0x2f1cea6f0d3faf92ULL,
+        0x0e8c4b6a9343f326ULL, 0x01902d1ba3048954ULL, 0xc5c1fd5269e91dc0ULL,
+        0x0ef8a4707817eb9cULL, 0x1f696f09a5354ca4ULL, 0x369cd9de808b818cULL,
+        0xf6917d1dd43fd784ULL, 0x7f4b76bf40dc166fULL, 0x4ce67698724ace12ULL,
+        0x02c3bf60e6e9cd92ULL, 0xb8229e45b21458e8ULL, 0x415efd41e91adf49ULL,
+        0x5edfcd516bb921cdULL, 0x5ff2c29429fd187eULL, 0x0af666b17103b3e0ULL,
+        0x1f5e4ff8f54c9a5bULL, 0x429253d8a5544ba6ULL, 0x19de2fdf9f4d9dcaULL,
+        0x29bf3d37ddc19a40ULL, 0x04d4513a879552baULL, 0x5cc7476cf71ee155ULL,
+        0x40011f8c238784a5ULL, 0x1a3ae50b0fd2ee2bULL, 0x7db22f432ba462baULL,
+        0x417290b0bee2284aULL, 0x055a6bd5bb853db2ULL, 0xaa667daeed8c2a34ULL,
+        0x0d6b316bda7f3577ULL, 0x72d35598468e3d5dULL, 0x375b594804bfd33aULL,
+        0x16ed3a319b540ae8ULL, 0x093bace4b4695afdULL, 0xc7118754ec2737ceULL,
+        0x0fff361f0505c81aULL, 0x996e9e7291321af0ULL, 0x496b1d9b0b89ba8cULL,
+        0x65a98b2e9181da9cULL, 0x70759c8dd45575dfULL, 0x3446fe727f5e2cbbULL,
+        0x1121ae609d195e74ULL, 0x5ff5d68ce8a21018ULL, 0x0e27eca3825b60d6ULL,
+        0x82f628bceca3d1daULL, 0x2756a0914e344047ULL, 0xa460406c1c708d50ULL,
+        0x63ce32a0c083e491ULL, 0xc883e5a685c480e0ULL, 0x602c951891e600f9ULL,
+        0x02ecb2e3911ca5f8ULL, 0x0d8675f4bb70781aULL, 0x43545cc3c78ea496ULL,
+        0x04164b01d6b011c2ULL, 0x3acbb323dcab2c9bULL, 0x31c5ba4e22793082ULL,
+        0x5a6484af5f7c2d10ULL, 0x1a929b16194e8078ULL, 0x7a6a75d03b313924ULL,
+        0x0553c73a35b1d525ULL, 0xf18628c51142be34ULL, 0x1b51cf80d7efd8f5ULL,
+        0x52e0ca4df63ee258ULL, 0x0e977099160650c9ULL, 0x6be1524e92024f70ULL,
+        0x0ee2152625438b9dULL, 0xfa32af436f6d8eb4ULL, 0x5ecf49c2154287e5ULL,
+        0x6b72f4ae3590569dULL, 0x086c5ee6e87bfb68ULL, 0x737a4f0dc04b6187ULL,
+        0x08c3439280edea41ULL, 0x9547944f01636c5cULL, 0x6acfbfc2571cd71fULL,
+        0x85d7842972449637ULL, 0x252ea5e5a7fad86aULL, 0x4e41468f99ba1632ULL,
+        0x095e0c3ae63b25a2ULL, 0xb005ce88fd1c9425ULL, 0x748e668abbe09f03ULL,
+        0xb2cfdf466b187d18ULL, 0x60b11e633d8fe845ULL, 0x07144c4d246db604ULL,
+        0x139bcaac55e96125ULL, 0x118679b5a6176327ULL, 0x1cebe90fa4d9f83fULL,
+        0x22244f52f0d312acULL, 0x669d4e17c9bfb713ULL, 0x96390e0b834bb0d0ULL,
+        0x01f7f0e82ba08071ULL, 0x2dffeee31ca6d284ULL, 0x1f4738745ef039feULL,
+        0x4ce0dd2b603b6420ULL, 0x0035fc905910a4d5ULL, 0x07df2b533df6fb04ULL,
+        0x1cee2735c9b910ddULL, 0x2bc4af565f7809eaULL, 0x2f876c1f5cb1076cULL,
+        0x33e079524099d056ULL, 0x169e0405d2f9efbaULL, 0x018643ab548a358cULL,
+        0x1bb6fc4331cffe92ULL, 0x05111d3a04e92faaULL, 0x23c27ecf0d638b73ULL,
+        0x1b79071dc1685d68ULL, 0x0662d20aba8e1e0cULL, 0xe7f6440277144c6fULL,
+        0x4ca38b64c22196c0ULL, 0x43c05f6d1936fbeeULL, 0x0654199d4d1faf0fULL,
+        0xf2014054e71c2d04ULL, 0x0a103e47e96b4c84ULL, 0x7986e691dd35b040ULL,
+        0x4e1ebb53c306a341ULL, 0x2775bb3d75d65ba6ULL, 0x0562ab0adeff0f15ULL,
+        0x3c2746ad5eba3eacULL, 0x1facdb5765680c60ULL, 0xb802a60027d81d00ULL,
+        0x1191d0f6366ae3a9ULL, 0x81a97b5ae0ea5d14ULL, 0x06bee05b6178a770ULL,
+        0xc7baeb2fe1d6aeb3ULL, 0x594cb5b867d04fdfULL, 0xf515a80138a4e350ULL,
+        0x646417ad8073cf38ULL, 0x4a229a43373fb8d4ULL, 0x10fa6eafff1ca453ULL,
+        0x9f060700895cc731ULL, 0x00521133d11d11f4ULL, 0xb940a2bb912a7a5cULL,
+        0x3fab180670ad2a3cULL, 0x45a5f0e5b6fdb95dULL, 0x27c1baad6f946b15ULL,
+        0x336c6bdbe527cf58ULL, 0x3b83aa602a5baea3ULL, 0xdf749153f9bcc376ULL,
+        0x1a05513a6c0b4a90ULL, 0xb81e0b570a075c47ULL, 0x471fabb40bdc27ceULL,
+        0x9dec9472f6853f60ULL, 0x361f71b88114193bULL, 0x3b550a8c4feeff00ULL,
+        0x0f6cde5a68bc9bc0ULL, 0x3f50121a925703e0ULL, 0x6967ff66d6d343a9ULL,
+        0xff6b5bd2ce7bc3ccULL, 0x05474cea08bf6cd8ULL, 0xf76eabbfaf108eb0ULL,
+        0x067529be4fc6d981ULL, 0x4d766b137cf8a988ULL, 0x2f09c7395c5cfbbdULL,
+        0x388793712da06228ULL, 0x02c9ff342c8f339aULL, 0x152c734139a860a3ULL,
+        0x35776eb2b270c04dULL, 0x0f8d8b41f11c4608ULL, 0x0c2071665be6b288ULL,
+        0xc034e212b3f71d88ULL, 0x071d961ef3276f99ULL, 0xf98598ee75b60773ULL,
+        0x062062c58c6724e4ULL, 0xd156438e2125572cULL, 0x38552d59a7f0f7c8ULL,
+        0x1a402178206e413cULL, 0x1f1f996c68293b26ULL, 0x8bce3cafe1730f7eULL,
+        0x2d0480a0828f6bf5ULL, 0x6c99cffa171f92f6ULL, 0x0087f842bb0ac681ULL,
+        0x11d7ed06e1e7fd3eULL, 0x07cb1186f2385dc6ULL, 0x5d7763ebff1e170fULL,
+        0x2dacc870231ac292ULL, 0x8486317a9ffb390cULL, 0x1c3a6dd20c959ac6ULL,
+        0x90dc96e3992e06b8ULL, 0x70d60bfa33e72b67ULL, 0x70c9bddd0985ee63ULL,
+        0x012c9767b3673093ULL, 0xfcd3bc5580f6a88aULL, 0x0ac80017ef6308c3ULL,
+        0xdb67d709ef4bba09ULL, 0x4c63e324f0e247ccULL, 0xa15481d3fe219d60ULL,
+        0x094c4279cdccb501ULL, 0x965a28c72575cb82ULL, 0x022869db25e391ebULL,
+        0x37f528c146023910ULL, 0x0c1290636917deceULL, 0x9aee25e96251ca9cULL,
+        0x728ac5ba853b69c2ULL, 0x9f272c93c4be20c8ULL, 0x06c1aa6319d28124ULL,
+        0x4324496b1ca8a4f7ULL, 0x0096ecfe7dfc0189ULL, 0x9e06131b19ae0020ULL,
+        0x15278b15902f4597ULL, 0x2a9fece8c13842d8ULL, 0x1d4e6781f0e1355eULL,
+        0x6855b712d3dbf7c0ULL, 0x06a07fad99be6f46ULL, 0x3ed9d7957e4d1d7cULL,
+        0x0c326f7cbc248bb2ULL, 0xe6363ad2c537cf51ULL, 0x0e12eb1c40723f13ULL,
+        0xf5c6ac850afba803ULL, 0x0322a79d615fa9f0ULL, 0x6116696ed97bd5f8ULL,
+        0x0d438080fbbdc9f1ULL, 0x2e4dc42c38f1e243ULL, 0x64948e9104f3a5bfULL,
+        0x9fd622371bdb5f00ULL, 0x0f12bf082b2a1b6eULL, 0x4b1f8d867d78031cULL,
+        0x134392ea9f5ef832ULL, 0xf3d70472321bc23eULL, 0x05fcbe5e9eea268eULL,
+        0x136dede7175a22cfULL, 0x1308f8baac2cbcccULL, 0xd691026f0915eb64ULL,
+        0x0e49a668345c3a38ULL, 0x24ddbbe8bc96f331ULL, 0x4d2ec9479b640578ULL,
+        0x450f0697327b359cULL, 0x32b45360f4488ee0ULL, 0x4f6d9ecec46a105aULL,
+        0x5500c63401ae8e80ULL, 0x47dea495cf6f98baULL, 0x13dc9a2dfca80babULL,
+        0xe6f8a93f7b24ca92ULL, 0x073f57a6d900a87fULL, 0x9ddb935fd3aa695aULL,
+        0x101e98d24b39e8aaULL, 0x6b8d0eb95a507ddcULL, 0x45a908b3903d209bULL,
+        0x6c96a3e119e617d4ULL, 0x2442787543d3be48ULL, 0xd3bc055c7544b364ULL,
+        0x7693bb042ca8653eULL, 0xb95e3a4ea5d0101eULL, 0x116f0d459bb94a73ULL,
+        0x841244b72cdc5e90ULL, 0x1271acced6cb34d3ULL, 0x07d289106524d638ULL,
+        0x537c9cf49c01b5bbULL, 0x8a8e16706bb7a5daULL, 0x12e50a9c499dc3a9ULL,
+        0x1cade520db2ba830ULL, 0x1add52f000d7db70ULL, 0x12cf15db2ce78e30ULL,
+        0x0657eaf606bfc866ULL, 0x4026816d3b05b1d0ULL, 0x1ba0ebdf90128e4aULL,
+        0xdfd649375996dd6eULL, 0x0f416e906c23d9aeULL, 0x384273cad0582a24ULL,
+        0x2ff27b0378a46189ULL, 0xc4ecd18a2d7a7616ULL, 0x35cef0b5cd51d640ULL,
+        0x7d582363643f48b7ULL, 0x0984ad746ad0ab7cULL, 0x2990a999835f9688ULL,
+        0x2d4df66a97b19e05ULL, 0x592c79720af99aa2ULL, 0x052863c230602cd3ULL,
+        0x5f5e2b15edcf2840ULL, 0x01dff1b694b978b0ULL, 0x14345a48b622025eULL,
+        0x028fab3b6407f715ULL, 0x3455d188e6feca50ULL, 0x1d0d40288fb1b5fdULL,
+        0x4685c5c2b6a1e5aeULL, 0x3a2077b1e5fe5adeULL, 0x1bc55d611445a0d8ULL,
+        0x05480ae95f3f83feULL, 0xbbb59cfcf7e17fb6ULL, 0x13f7f10970bbb990ULL,
+        0x6d00ac169425a352ULL, 0x7da0db397ef2d5d3ULL, 0x5b512a247f8d2479ULL,
+        0x637eaa6a977c3c32ULL, 0x3720f0ae37cba89cULL, 0x443df6e6aa7f525bULL,
+        0x28664c287dcef321ULL, 0x03c267c00cf35e49ULL, 0x690185572d4021deULL,
+        0x2707ff2596e321c2ULL, 0xd865f5af7722c380ULL, 0x1ea285658e33aafbULL,
+        0xc257c5e88755bef4ULL, 0x066f67275cfcc31eULL, 0xb09931945cc0fed0ULL,
+        0x58c1dc38d6e3a03fULL, 0xf99489678fc94ee8ULL, 0x75045bb99be5758aULL,
+        0x6c163bc34b40feefULL, 0x0420063ce7bdd3b4ULL, 0xf86ef10582bf2e28ULL,
+        0x162c3449ca14858cULL, 0x94106aa61dfe3280ULL, 0x4073ae7a4e7e4941ULL,
+        0x32b13fd179c250b4ULL, 0x0178fbb216a7e744ULL, 0xf840ae2f1cf92669ULL,
+        0x18fc709acc80243dULL, 0x20ac2ebd69f4d558ULL, 0x6e580ad9c73ad46aULL,
+        0x76d2b535b541c19dULL, 0x6c7a3fb9dd0ce0afULL, 0xc3481689b9754f28ULL,
+        0x156e813b6557abdbULL, 0x6ee372e31276eb10ULL, 0x19cf37c038c8d381ULL,
+        0x00d4d906c9ae3072ULL, 0x09f03cbb6dfbfd40ULL, 0x461ba31c4125f3cfULL,
+        0x25b29fc63ad9f05bULL, 0x6808c95c2dddede9ULL, 0x0564224337066d9bULL,
+        0xc87eb5f4a4d966f2ULL, 0x66fc66e1701f5847ULL, 0xc553a3559f74da28ULL,
+        0x1dfd841be574df43ULL, 0x3ee2f100c3ebc082ULL, 0x1a2c4f9517b56e89ULL,
+        0x502f65c4b535c8ffULL, 0x1da5663ab6f96ec0ULL, 0xba1f80b73988152cULL,
+        0x364ff12182ac8dc1ULL, 0xe3457a3c4871db31ULL, 0x6ae9cadf92fd7e84ULL,
+        0x9621ba3d6ca15186ULL, 0x00ff5af878c144ceULL, 0x918464dc130101a4ULL,
+        0x036511e6b187efa6ULL, 0x06667d66550ff260ULL, 0x7fd18913f9b51bc1ULL,
+        0x3740e6b27af77aa8ULL, 0x1f546c2fd358ff8aULL, 0x42f1424e3115c891ULL,
+        0x03767db4e3a1bb33ULL, 0xa171a1c564345060ULL, 0x0afcf632fd7b1324ULL,
+        0xb59508d933ffb7d0ULL, 0x57d766c42071be83ULL, 0x659f0447546114a2ULL,
+        0x4070364481c460aeULL, 0xa2b9752280644d52ULL, 0x04ab884bea5771bdULL,
+        0x87cd135602a232b4ULL, 0x15e54cd9a8155313ULL, 0x1e8005efaa3e1047ULL,
+        0x696b93f4ab15d39fULL, 0x0855a8e540de863aULL, 0x0bb11799e79f9426ULL,
+        0xeffa61e5c1b579baULL, 0x1e060a1d11808219ULL, 0x10e219205667c599ULL,
+        0x2f7b206091c49498ULL, 0xb48854c820064860ULL, 0x21c4aaa3bfbe4a38ULL,
+        0x8f4a032a3fa67e9cULL, 0x3146b3823401e2acULL, 0x3afee26f19d88400ULL,
+        0x167087c485791d38ULL, 0xb67a1ed945b0fb4bULL, 0x02436eb17e27f1c0ULL,
+        0xe05afce2ce2d2790ULL, 0x49c536fc6224cfebULL, 0x178865b3b862b856ULL,
+        0x1ce530de26acde5bULL, 0x87312c0b30a06f38ULL, 0x03e653b578558d76ULL,
+        0x4d3663c21d8b3accULL, 0x038003c23626914aULL, 0xd9d5a2c052a09451ULL,
+        0x39b5acfe08a49384ULL, 0x40f349956d5800e4ULL, 0x0968b6950b1bd8feULL,
+        0xd60b2ca030f3779cULL, 0x7c8bc11a23ce18edULL, 0xcc23374e27630bc2ULL,
+        0x2e38fc2a8bb33210ULL, 0xe421357814ee5c44ULL, 0x315fb65ea71ec671ULL,
+        0xfb1b0223f70ed290ULL, 0x30556c9f983eaf07ULL, 0x8dd438c3d0cd625aULL,
+        0x05a8fd0c7ffde71bULL, 0x764d1313b5aeec7aULL, 0x2036af5de9622f47ULL,
+        0x508a5bfadda292feULL, 0x3f77f04ba2830e90ULL, 0x9047cd9c66ca66d2ULL,
+        0x1168b5318a54eb21ULL, 0xc93462d221da2e15ULL, 0x4c2c7cc54abc066eULL,
+        0x767a56fec478240eULL, 0x095de72546595bd3ULL, 0xc9da535865158558ULL,
+        0x1baccf36f33e73fbULL, 0xf3d7dbe64df77f18ULL, 0x1f8ebbb7be4850b8ULL,
+        0x043c5ed77bce25a1ULL, 0x07d401041b2a178aULL, 0x9181ebb8bd8d5618ULL,
+        0x078b935dc3e4034aULL, 0x7b59c08954214300ULL, 0x03570dc2a4f84421ULL,
+        0xdd8715b82f6b4078ULL, 0x2bb49c8bb544163bULL, 0xc9eb125564d59686ULL,
+        0x5fdc7a38f80b810aULL, 0x3a4a6d8fff686544ULL, 0x28360e2418627d3aULL,
+        0x60874244c95ed992ULL, 0x2115cc1dd9c34ed3ULL, 0xfaa3ef61f55e9efcULL,
+        0x27ac9b1ef1adc7e6ULL, 0x95ea00478fec3f54ULL, 0x5aea808b2d99ab43ULL,
+        0xc8f79e51fe43a580ULL, 0x5dbccd714236ce25ULL, 0x783fa76ed0753458ULL,
+        0x48cb290f19d84655ULL, 0xc86a832f7696099aULL, 0x52f30c6fec0e71d3ULL,
+        0x77d4e91e8cdeb886ULL, 0x7169a703c6a79ccdULL, 0x98208145b9596f74ULL,
+        0x0945695c761c0796ULL, 0x0be897830d17bae0ULL, 0x033ad3924caeeeb4ULL,
+        0xedecb6cfa2d303a8ULL, 0x3f86b074818642e7ULL, 0xeefa7c878a8b03f4ULL,
+        0x093c101b80922551ULL, 0xfb3b4e6c26ac0034ULL, 0x162bf87999b94f5eULL,
+        0xeaedae76e975b17cULL, 0x1852aa090effe18eULL};
+
+    static constexpr uint64_t kCLMulUpper[kCLMulNum] = {
+        0xbb41199b1d587c69ULL, 0x514d94d55894ee29ULL, 0xebc6cd4d2efd5d16ULL,
+        0x042044ad2de477fdULL, 0xb865c8b0fcdf4b15ULL, 0x0724d7e551cc40f3ULL,
+        0xb15a16f39edb0bccULL, 0x37d64419ede7a171ULL, 0x2aa01bb80c753401ULL,
+        0x06ff3f8a95fdaf4dULL, 0x79898cc0838546deULL, 0x776acbd1b237c60aULL,
+        0x4c1753be4f4e0064ULL, 0x0ba9243601206ed3ULL, 0xd567c3b1bf3ec557ULL,
+        0x043fac7bcff61fb3ULL, 0x49356232b159fb2fULL, 0x3910c82038102d4dULL,
+        0x30592fef753eb300ULL, 0x7b2660e0c92a9e9aULL, 0x8246c9248d671ef0ULL,
+        0x5a0dcd95147af5faULL, 0x43fde953909cc0eaULL, 0x06147b972cb96e1bULL,
+        0xd84193a6b2411d80ULL, 0x00cd7711b950196fULL, 0x1088f9f4ade7fa64ULL,
+        0x05a13096ec113cfbULL, 0x958d816d53b00edcULL, 0x3846154a7cdba9cbULL,
+        0x8af516db6b27d1e6ULL, 0x1a1d462ab8a33b13ULL, 0x4040b0ac1b2c754cULL,
+        0x05127fe9af2fe1d6ULL, 0x9f96e79374321fa6ULL, 0x06ff64a4d9c326f3ULL,
+        0x28709566e158ac15ULL, 0x301701d7111ca51cULL, 0x31e0445d1b9d9544ULL,
+        0x0a95aff69bf1d03eULL, 0x7c298c8414ecb879ULL, 0x00801499b4143195ULL,
+        0x91521a00dd676a5cULL, 0x2777526a14c2f723ULL, 0xfa26aac6a6357dddULL,
+        0x1d265889b0187a4bULL, 0xcd6e70fa8ed283e4ULL, 0x18a815aa50ea92caULL,
+        0xc01e082694a263c6ULL, 0x4b40163ba53daf25ULL, 0xbc658caff6501673ULL,
+        0x3ba35359586b9652ULL, 0x74f96acc97a4936cULL, 0x3989dfdb0cf1d2cfULL,
+        0x358a01eaa50dda32ULL, 0x01109a5ed8f0802bULL, 0x55b84922e63c2958ULL,
+        0x55b14843d87551d5ULL, 0x1db8ec61b1b578d8ULL, 0x79a2d49ef8c3658fULL,
+        0xa304516816b3fbe0ULL, 0x163ecc09cc7b82f9ULL, 0xab91e8d22aabef00ULL,
+        0x0ed6b09262de8354ULL, 0xcfd47d34cf73f6f2ULL, 0x7dbd1db2390bc6c3ULL,
+        0x5ae789d3875e7b00ULL, 0x1d60fd0e70fe8fa4ULL, 0x690bc15d5ae4f6f5ULL,
+        0x121ef5565104fb44ULL, 0x6e98e89297353b54ULL, 0x42554949249d62edULL,
+        0xd6d6d16b12df78d2ULL, 0x320b33549b74975dULL, 0xd2a0618763d22e00ULL,
+        0x0808deb93cba2017ULL, 0x01bd3b2302a2cc70ULL, 0x0b7b8dd4d71c8dd6ULL,
+        0x34d60a3382a0756cULL, 0x40984584c8219629ULL, 0xf1152cba10093a66ULL,
+        0x068001c6b2159ccbULL, 0x3d70f13c6cda0800ULL, 0x0e6b6746a322b956ULL,
+        0x83a494319d8c770bULL, 0x0faecf64a8553e9aULL, 0xa34919222c39b1bcULL,
+        0x0c63850d89e71c6fULL, 0x585f0bee92e53dc8ULL, 0x10f222b13b4fa5deULL,
+        0x61573114f94252f2ULL, 0x09d59c311fba6c27ULL, 0x014effa7da49ed4eULL,
+        0x4a400a1bc1c31d26ULL, 0xc9091c047b484972ULL, 0x3989f341ec2230ccULL,
+        0xdcb03a98b3aee41eULL, 0x4a54a676a33a95e1ULL, 0xe499b7753951ef7cULL,
+        0x2f43b1d1061d8b48ULL, 0xc3313bdc68ceb146ULL, 0x5159f6bc0e99227fULL,
+        0x98128e6d9c05efcaULL, 0x15ea32b27f77815bULL, 0xe882c054e2654eecULL,
+        0x003d2cdb8faee8c6ULL, 0xb416dd333a9fe1dfULL, 0x73f6746aefcfc98bULL,
+        0x93dc114c10a38d70ULL, 0x05055941657845eaULL, 0x2ed7351347349334ULL,
+        0x26fb1ee2c69ae690ULL, 0xa4575d10dc5b28e0ULL, 0x3395b11295e485ebULL,
+        0xe840f198a224551cULL, 0x78e6e5a431d941d4ULL, 0xa1fee3ceab27f391ULL,
+        0x07d35b3c5698d0dcULL, 0x983c67fca9174a29ULL, 0x2bb6bbae72b5144aULL,
+        0xa7730b8d13ce58efULL, 0x51b5272883de1998ULL, 0xb334e128bb55e260ULL,
+        0x1cacf5fbbe1b9974ULL, 0x71a9df4bb743de60ULL, 0x5176fe545c2d0d7aULL,
+        0xbe592ecf1a16d672ULL, 0x27aa8a30c3efe460ULL, 0x4c78a32f47991e06ULL,
+        0x383459294312f26aULL, 0x97ba789127f1490cULL, 0x51c9aa8a3abd1ef1ULL,
+        0xcc7355188121e50fULL, 0x0ecb3a178ae334c1ULL, 0x84879a5e574b7160ULL,
+        0x0765298f6389e8f3ULL, 0x5c6750435539bb22ULL, 0x11a05cf056c937b5ULL,
+        0xb5dc2172dbfb7662ULL, 0x3ffc17915d9f40e8ULL, 0xbc7904daf3b431b0ULL,
+        0x71f2088490930a7cULL, 0xa89505fd9efb53c4ULL, 0x02e194afd61c5671ULL,
+        0x99a97f4abf35fcecULL, 0x26830aad30fae96fULL, 0x4b2abc16b25cf0b0ULL,
+        0x07ec6fffa1cafbdbULL, 0xf38188fde97a280cULL, 0x121335701afff64dULL,
+        0xea5ef38b4e672a64ULL, 0x477edbcae3eabf03ULL, 0xa32813cc0e0d244dULL,
+        0x13346d2af4972eefULL, 0xcbc18357af1cfa9aULL, 0x561b630316e73fa6ULL,
+        0xe9dfb53249249305ULL, 0x5d2b9dd1479312eeULL, 0x3458008119b56d04ULL,
+        0x50e6790b49801385ULL, 0x5bb9febe2349492bULL, 0x0c2813954299098fULL,
+        0xf747b0c890a071d5ULL, 0x417e8f82cc028d77ULL, 0xa134fee611d804f8ULL,
+        0x24c99ee9a0408761ULL, 0x3ebb224e727137f3ULL, 0x0686022073ceb846ULL,
+        0xa05e901fb82ad7daULL, 0x0ece7dc43ab470fcULL, 0x2d334ecc58f7d6a3ULL,
+        0x23166fadacc54e40ULL, 0x9c3a4472f839556eULL, 0x071717ab5267a4adULL,
+        0xb6600ac351ba3ea0ULL, 0x30ec748313bb63d4ULL, 0xb5374e39287b23ccULL,
+        0x074d75e784238aebULL, 0x77315879243914a4ULL, 0x3bbb1971490865f1ULL,
+        0xa355c21f4fbe02d3ULL, 0x0027f4bb38c8f402ULL, 0xeef8708e652bc5f0ULL,
+        0x7b9aa56cf9440050ULL, 0x113ac03c16cfc924ULL, 0x395db36d3e4bef9fULL,
+        0x5d826fabcaa597aeULL, 0x2a77d3c58786d7e0ULL, 0x85996859a3ba19d4ULL,
+        0x01e7e3c904c2d97fULL, 0x34f90b9b98d51fd0ULL, 0x243aa97fd2e99bb7ULL,
+        0x40a0cebc4f65c1e8ULL, 0x46d3922ed4a5503eULL, 0x446e7ecaf1f9c0a4ULL,
+        0x49dc11558bc2e6aeULL, 0xe7a9f20881793af8ULL, 0x5771cc4bc98103f1ULL,
+        0x2446ea6e718fce90ULL, 0x25d14aca7f7da198ULL, 0x4347af186f9af964ULL,
+        0x10cb44fc9146363aULL, 0x8a35587afce476b4ULL, 0x575144662fee3d3aULL,
+        0x69f41177a6bc7a05ULL, 0x02ff8c38d6b3c898ULL, 0x57c73589a226ca40ULL,
+        0x732f6b5baae66683ULL, 0x00c008bbedd4bb34ULL, 0x7412ff09524d6cadULL,
+        0xb8fd0b5ad8c145a8ULL, 0x74bd9f94b6cdc7dfULL, 0x68233b317ca6c19cULL,
+        0x314b9c2c08b15c54ULL, 0x5bd1ad72072ebd08ULL, 0x6610e6a6c07030e4ULL,
+        0xa4fc38e885ead7ceULL, 0x36975d1ca439e034ULL, 0xa358f0fe358ffb1aULL,
+        0x38e247ad663acf7dULL, 0x77daed3643b5deb8ULL, 0x5507c2aeae1ec3d0ULL,
+        0xfdec226c73acf775ULL, 0x1b87ff5f5033492dULL, 0xa832dee545d9033fULL,
+        0x1cee43a61e41783bULL, 0xdff82b2e2d822f69ULL, 0x2bbc9a376cb38cf2ULL,
+        0x117b1cdaf765dc02ULL, 0x26a407f5682be270ULL, 0x8eb664cf5634af28ULL,
+        0x17cb4513bec68551ULL, 0xb0df6527900cbfd0ULL, 0x335a2dc79c5afdfcULL,
+        0xa2f0ca4cd38dca88ULL, 0x1c370713b81a2de1ULL, 0x849d5df654d1adfcULL,
+        0x2fd1f7675ae14e44ULL, 0x4ff64dfc02247f7bULL, 0x3a2bcf40e395a48dULL,
+        0x436248c821b187c1ULL, 0x29f4337b1c7104c0ULL, 0xfc317c46e6630ec4ULL,
+        0x2774bccc4e3264c7ULL, 0x2d03218d9d5bee23ULL, 0x36a0ed04d659058aULL,
+        0x452484461573cab6ULL, 0x0708edf87ed6272bULL, 0xf07960a1587446cbULL,
+        0x3660167b067d84e0ULL, 0x65990a6993ddf8c4ULL, 0x0b197cd3d0b40b3fULL,
+        0x1dcec4ab619f3a05ULL, 0x722ab223a84f9182ULL, 0x0822d61a81e7c38fULL,
+        0x3d22ad75da563201ULL, 0x93cef6979fd35e0fULL, 0x05c3c25ae598b14cULL,
+        0x1338df97dd496377ULL, 0x15bc324dc9c20acfULL, 0x96397c6127e6e8cfULL,
+        0x004d01069ef2050fULL, 0x2fcf2e27893fdcbcULL, 0x072f77c3e44f4a5cULL,
+        0x5eb1d80b3fe44918ULL, 0x1f59e7c28cc21f22ULL, 0x3390ce5df055c1f8ULL,
+        0x4c0ef11df92cb6bfULL, 0x50f82f9e0848c900ULL, 0x08d0fde3ffc0ae38ULL,
+        0xbd8d0089a3fbfb73ULL, 0x118ba5b0f311ef59ULL, 0x9be9a8407b926a61ULL,
+        0x4ea04fbb21318f63ULL, 0xa1c8e7bb07b871ffULL, 0x1253a7262d5d3b02ULL,
+        0x13e997a0512e5b29ULL, 0x54318460ce9055baULL, 0x4e1d8a4db0054798ULL,
+        0x0b235226e2cade32ULL, 0x2588732c1476b315ULL, 0x16a378750ba8ac68ULL,
+        0xba0b116c04448731ULL, 0x4dd02bd47694c2f1ULL, 0x16d6797b218b6b25ULL,
+        0x769eb3709cfbf936ULL, 0x197746a0ce396f38ULL, 0x7d17ad8465961d6eULL,
+        0xfe58f4998ae19bb4ULL, 0x36df24305233ce69ULL, 0xb88a4eb008f4ee72ULL,
+        0x302b2eb923334787ULL, 0x15a4e3edbe13d448ULL, 0x39a4bf64dd7730ceULL,
+        0xedf25421b31090c4ULL, 0x4d547fc131be3b69ULL, 0x2b316e120ca3b90eULL,
+        0x0faf2357bf18a169ULL, 0x71f34b54ee2c1d62ULL, 0x18eaf6e5c93a3824ULL,
+        0x7e168ba03c1b4c18ULL, 0x1a534dd586d9e871ULL, 0xa2cccd307f5f8c38ULL,
+        0x2999a6fb4dce30f6ULL, 0x8f6d3b02c1d549a6ULL, 0x5cf7f90d817aac5aULL,
+        0xd2a4ceefe66c8170ULL, 0x11560edc4ca959feULL, 0x89e517e6f0dc464dULL,
+        0x75bb8972dddd2085ULL, 0x13859ed1e459d65aULL, 0x057114653326fa84ULL,
+        0xe2e6f465173cc86cULL, 0x0ada4076497d7de4ULL, 0xa856fa10ec6dbf8aULL,
+        0x41505d9a7c25d875ULL, 0x3091b6278382eccdULL, 0x055737185b2c3f13ULL,
+        0x2f4df8ecd6f9c632ULL, 0x0633e89c33552d98ULL, 0xf7673724d16db440ULL,
+        0x7331bd08e636c391ULL, 0x0252f29672fee426ULL, 0x1fc384946b6b9ddeULL,
+        0x03460c12c901443aULL, 0x003a0792e10abcdaULL, 0x8dbec31f624e37d0ULL,
+        0x667420d5bfe4dcbeULL, 0xfbfa30e874ed7641ULL, 0x46d1ae14db7ecef6ULL,
+        0x216bd7e8f5448768ULL, 0x32bcd40d3d69cc88ULL, 0x2e991dbc39b65abeULL,
+        0x0e8fb123a502f553ULL, 0x3d2d486b2c7560c0ULL, 0x09aba1db3079fe03ULL,
+        0xcb540c59398c9bceULL, 0x363970e5339ed600ULL, 0x2caee457c28af00eULL,
+        0x005e7d7ee47f41a0ULL, 0x69fad3eb10f44100ULL, 0x048109388c75beb3ULL,
+        0x253dddf96c7a6fb8ULL, 0x4c47f705b9d47d09ULL, 0x6cec894228b5e978ULL,
+        0x04044bb9f8ff45c2ULL, 0x079e75704d775caeULL, 0x073bd54d2a9e2c33ULL,
+        0xcec7289270a364fbULL, 0x19e7486f19cd9e4eULL, 0xb50ac15b86b76608ULL,
+        0x0620cf81f165c812ULL, 0x63eaaf13be7b11d4ULL, 0x0e0cf831948248c2ULL,
+        0xf0412df8f46e7957ULL, 0x671c1fe752517e3fULL, 0x8841bfb04dd3f540ULL,
+        0x122de4142249f353ULL, 0x40a4959fb0e76870ULL, 0x25cfd3d4b4bbc459ULL,
+        0x78a07c82930c60d0ULL, 0x12c2de24d4cbc969ULL, 0x85d44866096ad7f4ULL,
+        0x1fd917ca66b2007bULL, 0x01fbbb0751764764ULL, 0x3d2a4953c6fe0fdcULL,
+        0xcc1489c5737afd94ULL, 0x1817c5b6a5346f41ULL, 0xe605a6a7e9985644ULL,
+        0x3c50412328ff1946ULL, 0xd8c7fd65817f1291ULL, 0x0bd66975ab66339bULL,
+        0x2baf8fa1c7d10fa9ULL, 0x24abdf06ddef848dULL, 0x14df0c9b2ea4f6c2ULL,
+        0x2be950edfd2cb1f7ULL, 0x21911e21094178b6ULL, 0x0fa54d518a93b379ULL,
+        0xb52508e0ac01ab42ULL, 0x0e035b5fd8cb79beULL, 0x1c1c6d1a3b3c8648ULL,
+        0x286037b42ea9871cULL, 0xfe67bf311e48a340ULL, 0x02324131e932a472ULL,
+        0x2486dc2dd919e2deULL, 0x008aec7f1da1d2ebULL, 0x63269ba0e8d3eb3aULL,
+        0x23c0f11154adb62fULL, 0xc6052393ecd4c018ULL, 0x523585b7d2f5b9fcULL,
+        0xf7e6f8c1e87564c9ULL, 0x09eb9fe5dd32c1a3ULL, 0x4d4f86886e055472ULL,
+        0x67ea17b58a37966bULL, 0x3d3ce8c23b1ed1a8ULL, 0x0df97c5ac48857ceULL,
+        0x9b6992623759eb12ULL, 0x275aa9551ae091f2ULL, 0x08855e19ac5e62e5ULL,
+        0x1155fffe0ae083ccULL, 0xbc9c78db7c570240ULL, 0x074560c447dd2418ULL,
+        0x3bf78d330bcf1e70ULL, 0x49867cd4b7ed134bULL, 0x8e6eee0cb4470accULL,
+        0x1dabafdf59233dd6ULL, 0xea3a50d844fc3fb8ULL, 0x4f03f4454764cb87ULL,
+        0x1f2f41cc36c9e6ecULL, 0x53cba4df42963441ULL, 0x10883b70a88d91fbULL,
+        0x62b1fc77d4eb9481ULL, 0x893d8f2604b362e1ULL, 0x0933b7855368b440ULL,
+        0x9351b545703b2fceULL, 0x59c1d489b9bdd3b4ULL, 0xe72a9c4311417b18ULL,
+        0x5355df77e88eb226ULL, 0xe802c37aa963d7e1ULL, 0x381c3747bd6c3bc3ULL,
+        0x378565573444258cULL, 0x37848b1e52b43c18ULL, 0x5da2cd32bdce12b6ULL,
+        0x13166c5da615f6fdULL, 0xa51ef95efcc66ac8ULL, 0x640c95e473f1e541ULL,
+        0x6ec68def1f217500ULL, 0x49ce3543c76a4079ULL, 0x5fc6fd3cddc706b5ULL,
+        0x05c3c0f0f6a1fb0dULL, 0xe7820c0996ad1bddULL, 0x21f0d752a088f35cULL,
+        0x755405b51d6fc4a0ULL, 0x7ec7649ca4b0e351ULL, 0x3d2b6a46a251f790ULL,
+        0x23e1176b19f418adULL, 0x06056575efe8ac05ULL, 0x0f75981b6966e477ULL,
+        0x06e87ec41ad437e4ULL, 0x43f6c255d5e1cb84ULL, 0xe4e67d1120ceb580ULL,
+        0x2cd67b9e12c26d7bULL, 0xcd00b5ff7fd187f1ULL, 0x3f6cd40accdc4106ULL,
+        0x3e895c835459b330ULL, 0x0814d53a217c0850ULL, 0xc9111fe78bc3a62dULL,
+        0x719967e351473204ULL, 0xe757707d24282aa4ULL, 0x7226b7f5607f98e6ULL,
+        0x7b268ffae3c08d96ULL, 0x16d3917c8b86020eULL, 0x5128bca51c49ea64ULL,
+        0x345ffea02bb1698dULL, 0x9460f5111fe4fbc8ULL, 0x60dd1aa5762852cbULL,
+        0xbb7440ed3c81667cULL, 0x0a4b12affa7f6f5cULL, 0x95cbcb0ae03861b6ULL,
+        0x07ab3b0591db6070ULL, 0xc6476a4c3de78982ULL, 0x204e82e8623ad725ULL,
+        0x569a5b4e8ac2a5ccULL, 0x425a1d77d72ebae2ULL, 0xcdaad5551ab33830ULL,
+        0x0b7c68fd8422939eULL, 0x46d9a01f53ec3020ULL, 0x102871edbb29e852ULL,
+        0x7a8e8084039075a5ULL, 0x40eaede8615e376aULL, 0x4dc67d757a1c751fULL,
+        0x1176ef33063f9145ULL, 0x4ea230285b1c8156ULL, 0x6b2aa46ce0027392ULL,
+        0x32b13230fba1b068ULL, 0x0e69796851bb984fULL, 0xb749f4542db698c0ULL,
+        0x19ad0241ffffd49cULL, 0x2f41e92ef6caff52ULL, 0x4d0b068576747439ULL,
+        0x14d607aef7463e00ULL, 0x1443d00d85fb440eULL, 0x529b43bf68688780ULL,
+        0x21133a6bc3a3e378ULL, 0x865b6436dae0e7e5ULL, 0x6b4fe83dc1d6defcULL,
+        0x03a5858a0ca0be46ULL, 0x1e841b187e67f312ULL, 0x61ee22ef40a66940ULL,
+        0x0494bd2e9e741ef8ULL, 0x4eb59e323010e72cULL, 0x19f2abcfb749810eULL,
+        0xb30f1e4f994ef9bcULL, 0x53cf6cdd51bd2d96ULL, 0x263943036497a514ULL,
+        0x0d4b52170aa2edbaULL, 0x0c4758a1c7b4f758ULL, 0x178dadb1b502b51aULL,
+        0x1ddbb20a602eb57aULL, 0x1fc2e2564a9f27fdULL, 0xd5f8c50a0e3d6f90ULL,
+        0x0081da3bbe72ac09ULL, 0xcf140d002ccdb200ULL, 0x0ae8389f09b017feULL,
+        0x17cc9ffdc03f4440ULL, 0x04eb921d704bcdddULL, 0x139a0ce4cdc521abULL,
+        0x0bfce00c145cb0f0ULL, 0x99925ff132eff707ULL, 0x063f6e5da50c3d35ULL,
+        0xa0c25dea3f0e6e29ULL, 0x0c7a9048cc8e040fULL,
+    };
+
+    const size_t padded = RoundUpTo(kCLMulNum, N);
+    auto expected_lower = AllocateAligned<T>(padded);
+    auto expected_upper = AllocateAligned<T>(padded);
+    CopyBytes<kCLMulNum * sizeof(T)>(kCLMulLower, expected_lower.get());
+    CopyBytes<kCLMulNum * sizeof(T)>(kCLMulUpper, expected_upper.get());
+    const size_t padding_size = (padded - kCLMulNum) * sizeof(T);
+    memset(expected_lower.get() + kCLMulNum, 0, padding_size);
+    memset(expected_upper.get() + kCLMulNum, 0, padding_size);
+
+    // Random inputs in each lane
+    RandomState rng;
+    for (size_t rep = 0; rep < kCLMulNum / N; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        in1[i] = Random64(&rng);
+        in2[i] = Random64(&rng);
+      }
+
+      const auto a = Load(d, in1.get());
+      const auto b = Load(d, in2.get());
+#if HWY_PRINT_CLMUL_GOLDEN
+      Store(CLMulLower(a, b), d, expected_lower.get() + rep * N);
+      Store(CLMulUpper(a, b), d, expected_upper.get() + rep * N);
+#else
+      HWY_ASSERT_VEC_EQ(d, expected_lower.get() + rep * N, CLMulLower(a, b));
+      HWY_ASSERT_VEC_EQ(d, expected_upper.get() + rep * N, CLMulUpper(a, b));
+#endif
+    }
+
+#if HWY_PRINT_CLMUL_GOLDEN
+    // RVV lacks PRIu64, so print 32-bit halves.
+    for (size_t i = 0; i < kCLMulNum; ++i) {
+      printf("0x%08x%08xULL,", static_cast<uint32_t>(expected_lower[i] >> 32),
+             static_cast<uint32_t>(expected_lower[i] & 0xFFFFFFFFU));
+    }
+    printf("\n");
+    for (size_t i = 0; i < kCLMulNum; ++i) {
+      printf("0x%08x%08xULL,", static_cast<uint32_t>(expected_upper[i] >> 32),
+             static_cast<uint32_t>(expected_upper[i] & 0xFFFFFFFFU));
+    }
+#endif  // HWY_PRINT_CLMUL_GOLDEN
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllCLMul() { ForGEVectors<128, TestCLMul>()(uint64_t()); }
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyCryptoTest);
+HWY_EXPORT_AND_TEST_P(HwyCryptoTest, TestAllAES);
+HWY_EXPORT_AND_TEST_P(HwyCryptoTest, TestAllCLMul);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/demote_test.cc b/third_party/highway/hwy/tests/demote_test.cc
new file mode 100644
index 0000000000..22469113d5
--- /dev/null
+++ b/third_party/highway/hwy/tests/demote_test.cc
@@ -0,0 +1,328 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <cmath>  // std::isfinite
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/demote_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+// Causes build timeout.
+#if !HWY_IS_MSAN
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <typename T, HWY_IF_FLOAT(T)>
+bool IsFiniteT(T t) {
+  return std::isfinite(t);
+}
+// Wrapper avoids calling std::isfinite for integer types (ambiguous).
+template <typename T, HWY_IF_NOT_FLOAT(T)>
+bool IsFiniteT(T /*unused*/) {
+  return true;
+}
+
+template <typename ToT>
+struct TestDemoteTo {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
+    static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output");
+    static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
+    const Rebind<ToT, D> to_d;
+
+    const size_t N = Lanes(from_d);
+    auto from = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<ToT>(N);
+
+    // Narrower range in the wider type, for clamping before we cast
+    const T min = LimitsMin<ToT>();
+    const T max = LimitsMax<ToT>();
+
+    const auto value_ok = [&](T& value) {
+      if (!IsFiniteT(value)) return false;
+      return true;
+    };
+
+    RandomState rng;
+    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        do {
+          const uint64_t bits = rng();
+          CopyBytes<sizeof(T)>(&bits, &from[i]);  // not same size
+        } while (!value_ok(from[i]));
+        expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max));
+      }
+
+      const auto in = Load(from_d, from.get());
+      HWY_ASSERT_VEC_EQ(to_d, expected.get(), DemoteTo(to_d, in));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllDemoteToInt() {
+  ForDemoteVectors<TestDemoteTo<uint8_t>>()(int16_t());
+  ForDemoteVectors<TestDemoteTo<uint8_t>, 2>()(int32_t());
+
+  ForDemoteVectors<TestDemoteTo<int8_t>>()(int16_t());
+  ForDemoteVectors<TestDemoteTo<int8_t>, 2>()(int32_t());
+
+  const ForDemoteVectors<TestDemoteTo<uint16_t>> to_u16;
+  to_u16(int32_t());
+
+  const ForDemoteVectors<TestDemoteTo<int16_t>> to_i16;
+  to_i16(int32_t());
+}
+
+HWY_NOINLINE void TestAllDemoteToMixed() {
+#if HWY_HAVE_FLOAT64
+  const ForDemoteVectors<TestDemoteTo<int32_t>> to_i32;
+  to_i32(double());
+#endif
+}
+
+template <typename ToT>
+struct TestDemoteToFloat {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
+    // For floats, we clamp differently and cannot call LimitsMin.
+    static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output");
+    static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
+    const Rebind<ToT, D> to_d;
+
+    const size_t N = Lanes(from_d);
+    auto from = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<ToT>(N);
+
+    RandomState rng;
+    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        do {
+          const uint64_t bits = rng();
+          CopyBytes<sizeof(T)>(&bits, &from[i]);  // not same size
+        } while (!IsFiniteT(from[i]));
+        const T magn = std::abs(from[i]);
+        const T max_abs = HighestValue<ToT>();
+        // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
+        // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
+        const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]);
+        expected[i] = static_cast<ToT>(clipped);
+      }
+
+      HWY_ASSERT_VEC_EQ(to_d, expected.get(),
+                        DemoteTo(to_d, Load(from_d, from.get())));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllDemoteToFloat() {
+  // Must test f16 separately because we can only load/store/convert them.
+
+#if HWY_HAVE_FLOAT64
+  const ForDemoteVectors<TestDemoteToFloat<float>, 1> to_float;
+  to_float(double());
+#endif
+}
+
+template <class D>
+AlignedFreeUniquePtr<float[]> ReorderBF16TestCases(D d, size_t& padded) {
+  const float test_cases[] = {
+      // Same as BF16TestCases:
+      // +/- 1
+      1.0f,
+      -1.0f,
+      // +/- 0
+      0.0f,
+      -0.0f,
+      // near 0
+      0.25f,
+      -0.25f,
+      // +/- integer
+      4.0f,
+      -32.0f,
+      // positive +/- delta
+      2.015625f,
+      3.984375f,
+      // negative +/- delta
+      -2.015625f,
+      -3.984375f,
+
+      // No huge values - would interfere with sum. But add more to fill 2 * N:
+      -2.0f,
+      -10.0f,
+      0.03125f,
+      1.03125f,
+      1.5f,
+      2.0f,
+      4.0f,
+      5.0f,
+      6.0f,
+      8.0f,
+      10.0f,
+      256.0f,
+      448.0f,
+      2080.0f,
+  };
+  const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
+  const size_t N = Lanes(d);
+  padded = RoundUpTo(kNumTestCases, 2 * N);  // allow loading pairs of vectors
+  auto in = AllocateAligned<float>(padded);
+  auto expected = AllocateAligned<float>(padded);
+  std::copy(test_cases, test_cases + kNumTestCases, in.get());
+  std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
+  return in;
+}
+
+class TestReorderDemote2To {
+  // In-place N^2 selection sort to avoid dependencies
+  void Sort(float* p, size_t count) {
+    for (size_t i = 0; i < count - 1; ++i) {
+      // Find min_element
+      size_t idx_min = i;
+      for (size_t j = i + 1; j < count; j++) {
+        if (p[j] < p[idx_min]) {
+          idx_min = j;
+        }
+      }
+
+      // Swap with current
+      const float tmp = p[i];
+      p[i] = p[idx_min];
+      p[idx_min] = tmp;
+    }
+  }
+
+ public:
+  template <typename TF32, class DF32>
+  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
+#if HWY_TARGET != HWY_SCALAR
+    size_t padded;
+    auto in = ReorderBF16TestCases(d32, padded);
+
+    using TBF16 = bfloat16_t;
+    const Repartition<TBF16, DF32> dbf16;
+    const Half<decltype(dbf16)> dbf16_half;
+    const size_t N = Lanes(d32);
+    auto temp16 = AllocateAligned<TBF16>(2 * N);
+    auto expected = AllocateAligned<float>(2 * N);
+    auto actual = AllocateAligned<float>(2 * N);
+
+    for (size_t i = 0; i < padded; i += 2 * N) {
+      const auto f0 = Load(d32, &in[i + 0]);
+      const auto f1 = Load(d32, &in[i + N]);
+      const auto v16 = ReorderDemote2To(dbf16, f0, f1);
+      Store(v16, dbf16, temp16.get());
+      const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0));
+      const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N));
+
+      // Smoke test: sum should be same (with tolerance for non-associativity)
+      const auto sum_expected = GetLane(SumOfLanes(d32, Add(f0, f1)));
+      const auto sum_actual =
+          GetLane(SumOfLanes(d32, Add(promoted0, promoted1)));
+
+      HWY_ASSERT(sum_expected - 1E-4 <= sum_actual &&
+                 sum_actual <= sum_expected + 1E-4);
+
+      // Ensure values are the same after sorting to undo the Reorder
+      Store(f0, d32, expected.get() + 0);
+      Store(f1, d32, expected.get() + N);
+      Store(promoted0, d32, actual.get() + 0);
+      Store(promoted1, d32, actual.get() + N);
+      Sort(expected.get(), 2 * N);
+      Sort(actual.get(), 2 * N);
+      HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0));
+      HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N));
+    }
+#else  // HWY_SCALAR
+    (void)d32;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllReorderDemote2To() {
+  ForShrinkableVectors<TestReorderDemote2To>()(float());
+}
+
+struct TestI32F64 {
+  template <typename TF, class DF>
+  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
+    using TI = int32_t;
+    const Rebind<TI, DF> di;
+    const size_t N = Lanes(df);
+
+    // Integer positive
+    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0))));
+
+    // Integer negative
+    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N))));
+
+    // Above positive
+    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001))));
+
+    // Below positive
+    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999))));
+
+    const TF eps = static_cast<TF>(0.0001);
+    // Above negative
+    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
+                      DemoteTo(di, Iota(df, -TF(N + 1) + eps)));
+
+    // Below negative
+    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
+                      DemoteTo(di, Iota(df, -TF(N + 1) - eps)));
+
+    // Huge positive float
+    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
+                      DemoteTo(di, Set(df, TF(1E12))));
+
+    // Huge negative float
+    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
+                      DemoteTo(di, Set(df, TF(-1E12))));
+  }
+};
+
+HWY_NOINLINE void TestAllI32F64() {
+#if HWY_HAVE_FLOAT64
+  ForDemoteVectors<TestI32F64>()(double());
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  //  !HWY_IS_MSAN
+
+#if HWY_ONCE
+
+namespace hwy {
+#if !HWY_IS_MSAN
+HWY_BEFORE_TEST(HwyDemoteTest);
+HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToInt);
+HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToMixed);
+HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToFloat);
+HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllReorderDemote2To);
+HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllI32F64);
+#endif  //  !HWY_IS_MSAN
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/float_test.cc b/third_party/highway/hwy/tests/float_test.cc
new file mode 100644
index 0000000000..bc6d9020e6
--- /dev/null
+++ b/third_party/highway/hwy/tests/float_test.cc
@@ -0,0 +1,350 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Tests some ops specific to floating-point types (Div, Round etc.)
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>  // std::copy, std::fill
+#include <limits>
+#include <cmath>  // std::abs, std::isnan, std::isinf, std::ceil, std::floor
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/float_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestDiv {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v = Iota(d, T(-2));
+    const auto v1 = Set(d, T(1));
+
+    // Unchanged after division by 1.
+    HWY_ASSERT_VEC_EQ(d, v, Div(v, v1));
+
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = (T(i) - 2) / T(2);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Div(v, Set(d, T(2))));
+  }
+};
+
+HWY_NOINLINE void TestAllDiv() { ForFloatTypes(ForPartialVectors<TestDiv>()); }
+
+struct TestApproximateReciprocal {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v = Iota(d, T(-2));
+    const auto nonzero = IfThenElse(Eq(v, Zero(d)), Set(d, T(1)), v);
+    const size_t N = Lanes(d);
+    auto input = AllocateAligned<T>(N);
+    Store(nonzero, d, input.get());
+
+    auto actual = AllocateAligned<T>(N);
+    Store(ApproximateReciprocal(nonzero), d, actual.get());
+
+    double max_l1 = 0.0;
+    double worst_expected = 0.0;
+    double worst_actual = 0.0;
+    for (size_t i = 0; i < N; ++i) {
+      const double expected = 1.0 / input[i];
+      const double l1 = std::abs(expected - actual[i]);
+      if (l1 > max_l1) {
+        max_l1 = l1;
+        worst_expected = expected;
+        worst_actual = actual[i];
+      }
+    }
+    const double abs_worst_expected = std::abs(worst_expected);
+    if (abs_worst_expected > 1E-5) {
+      const double max_rel = max_l1 / abs_worst_expected;
+      fprintf(stderr, "max l1 %f rel %f (%f vs %f)\n", max_l1, max_rel,
+              worst_expected, worst_actual);
+      HWY_ASSERT(max_rel < 0.004);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllApproximateReciprocal() {
+  ForPartialVectors<TestApproximateReciprocal>()(float());
+}
+
+struct TestSquareRoot {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto vi = Iota(d, 0);
+    HWY_ASSERT_VEC_EQ(d, vi, Sqrt(Mul(vi, vi)));
+  }
+};
+
+HWY_NOINLINE void TestAllSquareRoot() {
+  ForFloatTypes(ForPartialVectors<TestSquareRoot>());
+}
+
+struct TestReciprocalSquareRoot {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v = Set(d, 123.0f);
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+    Store(ApproximateReciprocalSqrt(v), d, lanes.get());
+    for (size_t i = 0; i < N; ++i) {
+      float err = lanes[i] - 0.090166f;
+      if (err < 0.0f) err = -err;
+      if (err >= 4E-4f) {
+        HWY_ABORT("Lane %d (%d): actual %f err %f\n", static_cast<int>(i),
+                  static_cast<int>(N), lanes[i], err);
+      }
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllReciprocalSquareRoot() {
+  ForPartialVectors<TestReciprocalSquareRoot>()(float());
+}
+
+template <typename T, class D>
+AlignedFreeUniquePtr<T[]> RoundTestCases(T /*unused*/, D d, size_t& padded) {
+  const T eps = std::numeric_limits<T>::epsilon();
+  const T test_cases[] = {
+    // +/- 1
+    T(1),
+    T(-1),
+    // +/- 0
+    T(0),
+    T(-0),
+    // near 0
+    T(0.4),
+    T(-0.4),
+    // +/- integer
+    T(4),
+    T(-32),
+    // positive near limit
+    MantissaEnd<T>() - T(1.5),
+    MantissaEnd<T>() + T(1.5),
+    // negative near limit
+    -MantissaEnd<T>() - T(1.5),
+    -MantissaEnd<T>() + T(1.5),
+    // positive tiebreak
+    T(1.5),
+    T(2.5),
+    // negative tiebreak
+    T(-1.5),
+    T(-2.5),
+    // positive +/- delta
+    T(2.0001),
+    T(3.9999),
+    // negative +/- delta
+    T(-999.9999),
+    T(-998.0001),
+    // positive +/- epsilon
+    T(1) + eps,
+    T(1) - eps,
+    // negative +/- epsilon
+    T(-1) + eps,
+    T(-1) - eps,
+    // +/- huge (but still fits in float)
+    T(1E34),
+    T(-1E35),
+    // +/- infinity
+    std::numeric_limits<T>::infinity(),
+    -std::numeric_limits<T>::infinity(),
+    // qNaN
+    GetLane(NaN(d))
+  };
+  const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
+  const size_t N = Lanes(d);
+  padded = RoundUpTo(kNumTestCases, N);  // allow loading whole vectors
+  auto in = AllocateAligned<T>(padded);
+  auto expected = AllocateAligned<T>(padded);
+  std::copy(test_cases, test_cases + kNumTestCases, in.get());
+  std::fill(in.get() + kNumTestCases, in.get() + padded, T(0));
+  return in;
+}
+
+struct TestRound {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+    size_t padded;
+    auto in = RoundTestCases(t, d, padded);
+    auto expected = AllocateAligned<T>(padded);
+
+    for (size_t i = 0; i < padded; ++i) {
+      // Avoid [std::]round, which does not round to nearest *even*.
+      // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
+      // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
+      expected[i] = static_cast<T>(nearbyint(in[i]));
+    }
+    for (size_t i = 0; i < padded; i += Lanes(d)) {
+      HWY_ASSERT_VEC_EQ(d, &expected[i], Round(Load(d, &in[i])));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllRound() {
+  ForFloatTypes(ForPartialVectors<TestRound>());
+}
+
+struct TestNearestInt {
+  template <typename TF, class DF>
+  HWY_NOINLINE void operator()(TF tf, const DF df) {
+    using TI = MakeSigned<TF>;
+    const RebindToSigned<DF> di;
+
+    size_t padded;
+    auto in = RoundTestCases(tf, df, padded);
+    auto expected = AllocateAligned<TI>(padded);
+
+    constexpr double max = static_cast<double>(LimitsMax<TI>());
+    for (size_t i = 0; i < padded; ++i) {
+      if (std::isnan(in[i])) {
+        // We replace NaN with 0 below (no_nan)
+        expected[i] = 0;
+      } else if (std::isinf(in[i]) || double{std::abs(in[i])} >= max) {
+        // Avoid undefined result for lrintf
+        expected[i] = std::signbit(in[i]) ? LimitsMin<TI>() : LimitsMax<TI>();
+      } else {
+        expected[i] = static_cast<TI>(lrintf(in[i]));
+      }
+    }
+    for (size_t i = 0; i < padded; i += Lanes(df)) {
+      const auto v = Load(df, &in[i]);
+      const auto no_nan = IfThenElse(Eq(v, v), v, Zero(df));
+      HWY_ASSERT_VEC_EQ(di, &expected[i], NearestInt(no_nan));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllNearestInt() {
+  ForPartialVectors<TestNearestInt>()(float());
+}
+
+struct TestTrunc {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+    size_t padded;
+    auto in = RoundTestCases(t, d, padded);
+    auto expected = AllocateAligned<T>(padded);
+
+    for (size_t i = 0; i < padded; ++i) {
+      // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
+      // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
+      expected[i] = static_cast<T>(trunc(in[i]));
+    }
+    for (size_t i = 0; i < padded; i += Lanes(d)) {
+      HWY_ASSERT_VEC_EQ(d, &expected[i], Trunc(Load(d, &in[i])));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllTrunc() {
+  ForFloatTypes(ForPartialVectors<TestTrunc>());
+}
+
+struct TestCeil {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+    size_t padded;
+    auto in = RoundTestCases(t, d, padded);
+    auto expected = AllocateAligned<T>(padded);
+
+    for (size_t i = 0; i < padded; ++i) {
+      expected[i] = std::ceil(in[i]);
+    }
+    for (size_t i = 0; i < padded; i += Lanes(d)) {
+      HWY_ASSERT_VEC_EQ(d, &expected[i], Ceil(Load(d, &in[i])));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllCeil() {
+  ForFloatTypes(ForPartialVectors<TestCeil>());
+}
+
+struct TestFloor {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+    size_t padded;
+    auto in = RoundTestCases(t, d, padded);
+    auto expected = AllocateAligned<T>(padded);
+
+    for (size_t i = 0; i < padded; ++i) {
+      expected[i] = std::floor(in[i]);
+    }
+    for (size_t i = 0; i < padded; i += Lanes(d)) {
+      HWY_ASSERT_VEC_EQ(d, &expected[i], Floor(Load(d, &in[i])));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllFloor() {
+  ForFloatTypes(ForPartialVectors<TestFloor>());
+}
+
+struct TestAbsDiff {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto in_lanes_a = AllocateAligned<T>(N);
+    auto in_lanes_b = AllocateAligned<T>(N);
+    auto out_lanes = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      in_lanes_a[i] = static_cast<T>((i ^ 1u) << i);
+      in_lanes_b[i] = static_cast<T>(i << i);
+      out_lanes[i] = std::abs(in_lanes_a[i] - in_lanes_b[i]);
+    }
+    const auto a = Load(d, in_lanes_a.get());
+    const auto b = Load(d, in_lanes_b.get());
+    const auto expected = Load(d, out_lanes.get());
+    HWY_ASSERT_VEC_EQ(d, expected, AbsDiff(a, b));
+    HWY_ASSERT_VEC_EQ(d, expected, AbsDiff(b, a));
+  }
+};
+
+HWY_NOINLINE void TestAllAbsDiff() {
+  ForPartialVectors<TestAbsDiff>()(float());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyFloatTest);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllDiv);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllApproximateReciprocal);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllSquareRoot);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllReciprocalSquareRoot);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllRound);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllNearestInt);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllTrunc);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllCeil);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllFloor);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllAbsDiff);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/hwy_gtest.h b/third_party/highway/hwy/tests/hwy_gtest.h
new file mode 100644
index 0000000000..a4c21cd171
--- /dev/null
+++ b/third_party/highway/hwy/tests/hwy_gtest.h
@@ -0,0 +1,157 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HWY_TESTS_HWY_GTEST_H_
+#define HWY_TESTS_HWY_GTEST_H_
+
+// Adapters for GUnit to run tests for all targets.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "hwy/highway.h"
+
+namespace hwy {
+
+// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
+// used INSTANTIATE_TEST_CASE_P which is now deprecated.
+#ifdef INSTANTIATE_TEST_SUITE_P
+#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
+#else
+#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
+#endif
+
+// Helper class to run parametric tests using the hwy target as parameter. To
+// use this define the following in your test:
+//   class MyTestSuite : public TestWithParamTarget {
+//    ...
+//   };
+//   HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
+//   TEST_P(MyTestSuite, MyTest) { ... }
+class TestWithParamTarget : public testing::TestWithParam<int64_t> {
+ protected:
+  void SetUp() override { SetSupportedTargetsForTest(GetParam()); }
+
+  void TearDown() override {
+    // Check that the parametric test calls SupportedTargets() when the source
+    // was compiled with more than one target. In the single-target case only
+    // static dispatch will be used anyway.
+#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
+    EXPECT_TRUE(GetChosenTarget().IsInitialized())
+        << "This hwy target parametric test doesn't use dynamic-dispatch and "
+           "doesn't need to be parametric.";
+#endif
+    SetSupportedTargetsForTest(0);
+  }
+};
+
+// Function to convert the test parameter of a TestWithParamTarget for
+// displaying it in the gtest test name.
+static inline std::string TestParamTargetName(
+    const testing::TestParamInfo<int64_t>& info) {
+  return TargetName(info.param);
+}
+
+#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite)              \
+  HWY_GTEST_INSTANTIATE_TEST_SUITE_P(                           \
+      suite##Group, suite,                                      \
+      testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
+      ::hwy::TestParamTargetName)
+
+// Helper class similar to TestWithParamTarget to run parametric tests that
+// depend on the target and another parametric test. If you need to use multiple
+// extra parameters use a std::tuple<> of them and ::testing::Generate(...) as
+// the generator. To use this class define the following in your test:
+//   class MyTestSuite : public TestWithParamTargetT<int> {
+//    ...
+//   };
+//   HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(MyTestSuite, ::testing::Range(0, 9));
+//   TEST_P(MyTestSuite, MyTest) { ... GetParam() .... }
+template <typename T>
+class TestWithParamTargetAndT
+    : public ::testing::TestWithParam<std::tuple<int64_t, T>> {
+ public:
+  // Expose the parametric type here so it can be used by the
+  // HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T macro.
+  using HwyParamType = T;
+
+ protected:
+  void SetUp() override {
+    SetSupportedTargetsForTest(std::get<0>(
+        ::testing::TestWithParam<std::tuple<int64_t, T>>::GetParam()));
+  }
+
+  void TearDown() override {
+    // Check that the parametric test calls SupportedTargets() when the source
+    // was compiled with more than one target. In the single-target case only
+    // static dispatch will be used anyway.
+#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
+    EXPECT_TRUE(GetChosenTarget().IsInitialized())
+        << "This hwy target parametric test doesn't use dynamic-dispatch and "
+           "doesn't need to be parametric.";
+#endif
+    SetSupportedTargetsForTest(0);
+  }
+
+  T GetParam() {
+    return std::get<1>(
+        ::testing::TestWithParam<std::tuple<int64_t, T>>::GetParam());
+  }
+};
+
+template <typename T>
+std::string TestParamTargetNameAndT(
+    const testing::TestParamInfo<std::tuple<int64_t, T>>& info) {
+  return std::string(TargetName(std::get<0>(info.param))) + "_" +
+         ::testing::PrintToString(std::get<1>(info.param));
+}
+
+#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(suite, generator)     \
+  HWY_GTEST_INSTANTIATE_TEST_SUITE_P(                               \
+      suite##Group, suite,                                          \
+      ::testing::Combine(                                           \
+          testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
+          generator),                                               \
+      ::hwy::TestParamTargetNameAndT<suite::HwyParamType>)
+
+// Helper macro to export a function and define a test that tests it. This is
+// equivalent to do a HWY_EXPORT of a void(void) function and run it in a test:
+//   class MyTestSuite : public TestWithParamTarget {
+//    ...
+//   };
+//   HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
+//   HWY_EXPORT_AND_TEST_P(MyTestSuite, MyTest);
+#define HWY_EXPORT_AND_TEST_P(suite, func_name)                   \
+  HWY_EXPORT(func_name);                                          \
+  TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(); } \
+  static_assert(true, "For requiring trailing semicolon")
+
+#define HWY_EXPORT_AND_TEST_P_T(suite, func_name)                           \
+  HWY_EXPORT(func_name);                                                    \
+  TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(GetParam()); } \
+  static_assert(true, "For requiring trailing semicolon")
+
+#define HWY_BEFORE_TEST(suite)                      \
+  class suite : public hwy::TestWithParamTarget {}; \
+  HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite);       \
+  static_assert(true, "For requiring trailing semicolon")
+
+}  // namespace hwy
+
+#endif  // HWY_TESTS_HWY_GTEST_H_
diff --git a/third_party/highway/hwy/tests/if_test.cc b/third_party/highway/hwy/tests/if_test.cc
new file mode 100644
index 0000000000..e44a878a0c
--- /dev/null
+++ b/third_party/highway/hwy/tests/if_test.cc
@@ -0,0 +1,175 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "hwy/aligned_allocator.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/if_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestIfThenElse {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    using TI = MakeSigned<T>;  // For mask > 0 comparison
+    const Rebind<TI, D> di;
+    const size_t N = Lanes(d);
+    auto in1 = AllocateAligned<T>(N);
+    auto in2 = AllocateAligned<T>(N);
+    auto bool_lanes = AllocateAligned<TI>(N);
+    auto expected = AllocateAligned<T>(N);
+
+    // Each lane should have a chance of having mask=true.
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        in1[i] = static_cast<T>(Random32(&rng));
+        in2[i] = static_cast<T>(Random32(&rng));
+        bool_lanes[i] = (Random32(&rng) & 16) ? TI(1) : TI(0);
+      }
+
+      const auto v1 = Load(d, in1.get());
+      const auto v2 = Load(d, in2.get());
+      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+
+      for (size_t i = 0; i < N; ++i) {
+        expected[i] = bool_lanes[i] ? in1[i] : in2[i];
+      }
+      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElse(mask, v1, v2));
+
+      for (size_t i = 0; i < N; ++i) {
+        expected[i] = bool_lanes[i] ? in1[i] : T(0);
+      }
+      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElseZero(mask, v1));
+
+      for (size_t i = 0; i < N; ++i) {
+        expected[i] = bool_lanes[i] ? T(0) : in2[i];
+      }
+      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenZeroElse(mask, v2));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllIfThenElse() {
+  ForAllTypes(ForPartialVectors<TestIfThenElse>());
+}
+
+struct TestIfVecThenElse {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    using TU = MakeUnsigned<T>;  // For all-one mask
+    const Rebind<TU, D> du;
+    const size_t N = Lanes(d);
+    auto in1 = AllocateAligned<T>(N);
+    auto in2 = AllocateAligned<T>(N);
+    auto vec_lanes = AllocateAligned<TU>(N);
+    auto expected = AllocateAligned<T>(N);
+
+    // Each lane should have a chance of having mask=true.
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        in1[i] = static_cast<T>(Random32(&rng));
+        in2[i] = static_cast<T>(Random32(&rng));
+        vec_lanes[i] = (Random32(&rng) & 16) ? static_cast<TU>(~TU(0)) : TU(0);
+      }
+
+      const auto v1 = Load(d, in1.get());
+      const auto v2 = Load(d, in2.get());
+      const auto vec = BitCast(d, Load(du, vec_lanes.get()));
+
+      for (size_t i = 0; i < N; ++i) {
+        expected[i] = vec_lanes[i] ? in1[i] : in2[i];
+      }
+      HWY_ASSERT_VEC_EQ(d, expected.get(), IfVecThenElse(vec, v1, v2));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllIfVecThenElse() {
+  ForAllTypes(ForPartialVectors<TestIfVecThenElse>());
+}
+
+struct TestZeroIfNegative {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vp = Iota(d, 1);
+    const auto vn = Iota(d, T(-1E5));  // assumes N < 10^5
+
+    // Zero and positive remain unchanged
+    HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(v0));
+    HWY_ASSERT_VEC_EQ(d, vp, ZeroIfNegative(vp));
+
+    // Negative are all replaced with zero
+    HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(vn));
+  }
+};
+
+HWY_NOINLINE void TestAllZeroIfNegative() {
+  ForFloatTypes(ForPartialVectors<TestZeroIfNegative>());
+}
+
+struct TestIfNegative {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vp = Iota(d, 1);
+    const auto vn = Or(vp, SignBit(d));
+
+    // Zero and positive remain unchanged
+    HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(v0, vn, v0));
+    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(v0, v0, vn));
+    HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vp, vn, vp));
+    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vp, vp, vn));
+
+    // Negative are replaced with 2nd arg
+    HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(vn, v0, vp));
+    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vn, vn, v0));
+    HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vn, vp, vn));
+  }
+};
+
+HWY_NOINLINE void TestAllIfNegative() {
+  ForFloatTypes(ForPartialVectors<TestIfNegative>());
+  ForSignedTypes(ForPartialVectors<TestIfNegative>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyIfTest);
+HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfThenElse);
+HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfVecThenElse);
+HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllZeroIfNegative);
+HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfNegative);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/interleaved_test.cc b/third_party/highway/hwy/tests/interleaved_test.cc
new file mode 100644
index 0000000000..4d1fbd5ac5
--- /dev/null
+++ b/third_party/highway/hwy/tests/interleaved_test.cc
@@ -0,0 +1,256 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/interleaved_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestLoadStoreInterleaved2 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+
+    RandomState rng;
+
+    // Data to be interleaved
+    auto bytes = AllocateAligned<T>(2 * N);
+    for (size_t i = 0; i < 2 * N; ++i) {
+      bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
+    }
+    const auto in0 = Load(d, &bytes[0 * N]);
+    const auto in1 = Load(d, &bytes[1 * N]);
+
+    // Interleave here, ensure vector results match scalar
+    auto expected = AllocateAligned<T>(3 * N);
+    auto actual_aligned = AllocateAligned<T>(3 * N + 1);
+    T* actual = actual_aligned.get() + 1;
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        expected[2 * i + 0] = bytes[0 * N + i];
+        expected[2 * i + 1] = bytes[1 * N + i];
+        // Ensure we do not write more than 2*N bytes
+        expected[2 * N + i] = actual[2 * N + i] = 0;
+      }
+      StoreInterleaved2(in0, in1, d, actual);
+      size_t pos = 0;
+      if (!BytesEqual(expected.get(), actual, 3 * N * sizeof(T), &pos)) {
+        Print(d, "in0", in0, pos / 4);
+        Print(d, "in1", in1, pos / 4);
+        const size_t i = pos;
+        fprintf(stderr, "interleaved i=%d %f %f %f %f  %f %f %f %f\n",
+                static_cast<int>(i), static_cast<double>(actual[i]),
+                static_cast<double>(actual[i + 1]),
+                static_cast<double>(actual[i + 2]),
+                static_cast<double>(actual[i + 3]),
+                static_cast<double>(actual[i + 4]),
+                static_cast<double>(actual[i + 5]),
+                static_cast<double>(actual[i + 6]),
+                static_cast<double>(actual[i + 7]));
+        HWY_ASSERT(false);
+      }
+
+      Vec<D> out0, out1;
+      LoadInterleaved2(d, actual, out0, out1);
+      HWY_ASSERT_VEC_EQ(d, in0, out0);
+      HWY_ASSERT_VEC_EQ(d, in1, out1);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllLoadStoreInterleaved2() {
+#if HWY_TARGET == HWY_RVV
+  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+  const ForExtendableVectors<TestLoadStoreInterleaved2, 2> test;
+#else
+  const ForPartialVectors<TestLoadStoreInterleaved2> test;
+#endif
+  ForAllTypes(test);
+}
+
+// Workaround for build timeout on GCC 12 aarch64, see #776
+#if HWY_COMPILER_GCC_ACTUAL >= 1200 && HWY_ARCH_ARM_A64
+#define HWY_BROKEN_LOAD34 1
+#else
+#define HWY_BROKEN_LOAD34 0
+#endif
+
+#if !HWY_BROKEN_LOAD34
+
+struct TestLoadStoreInterleaved3 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+
+    RandomState rng;
+
+    // Data to be interleaved
+    auto bytes = AllocateAligned<T>(3 * N);
+    for (size_t i = 0; i < 3 * N; ++i) {
+      bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
+    }
+    const auto in0 = Load(d, &bytes[0 * N]);
+    const auto in1 = Load(d, &bytes[1 * N]);
+    const auto in2 = Load(d, &bytes[2 * N]);
+
+    // Interleave here, ensure vector results match scalar
+    auto expected = AllocateAligned<T>(4 * N);
+    auto actual_aligned = AllocateAligned<T>(4 * N + 1);
+    T* actual = actual_aligned.get() + 1;
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        expected[3 * i + 0] = bytes[0 * N + i];
+        expected[3 * i + 1] = bytes[1 * N + i];
+        expected[3 * i + 2] = bytes[2 * N + i];
+        // Ensure we do not write more than 3*N bytes
+        expected[3 * N + i] = actual[3 * N + i] = 0;
+      }
+      StoreInterleaved3(in0, in1, in2, d, actual);
+      size_t pos = 0;
+      if (!BytesEqual(expected.get(), actual, 4 * N * sizeof(T), &pos)) {
+        Print(d, "in0", in0, pos / 3, N);
+        Print(d, "in1", in1, pos / 3, N);
+        Print(d, "in2", in2, pos / 3, N);
+        const size_t i = pos;
+        fprintf(stderr, "interleaved i=%d %f %f %f  %f %f %f\n",
+                static_cast<int>(i), static_cast<double>(actual[i]),
+                static_cast<double>(actual[i + 1]),
+                static_cast<double>(actual[i + 2]),
+                static_cast<double>(actual[i + 3]),
+                static_cast<double>(actual[i + 4]),
+                static_cast<double>(actual[i + 5]));
+        HWY_ASSERT(false);
+      }
+
+      Vec<D> out0, out1, out2;
+      LoadInterleaved3(d, actual, out0, out1, out2);
+      HWY_ASSERT_VEC_EQ(d, in0, out0);
+      HWY_ASSERT_VEC_EQ(d, in1, out1);
+      HWY_ASSERT_VEC_EQ(d, in2, out2);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllLoadStoreInterleaved3() {
+#if HWY_TARGET == HWY_RVV
+  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+  const ForExtendableVectors<TestLoadStoreInterleaved3, 2> test;
+#else
+  const ForPartialVectors<TestLoadStoreInterleaved3> test;
+#endif
+  ForAllTypes(test);
+}
+
+struct TestLoadStoreInterleaved4 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+
+    RandomState rng;
+
+    // Data to be interleaved
+    auto bytes = AllocateAligned<T>(4 * N);
+
+    for (size_t i = 0; i < 4 * N; ++i) {
+      bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
+    }
+    const auto in0 = Load(d, &bytes[0 * N]);
+    const auto in1 = Load(d, &bytes[1 * N]);
+    const auto in2 = Load(d, &bytes[2 * N]);
+    const auto in3 = Load(d, &bytes[3 * N]);
+
+    // Interleave here, ensure vector results match scalar
+    auto expected = AllocateAligned<T>(5 * N);
+    auto actual_aligned = AllocateAligned<T>(5 * N + 1);
+    T* actual = actual_aligned.get() + 1;
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        expected[4 * i + 0] = bytes[0 * N + i];
+        expected[4 * i + 1] = bytes[1 * N + i];
+        expected[4 * i + 2] = bytes[2 * N + i];
+        expected[4 * i + 3] = bytes[3 * N + i];
+        // Ensure we do not write more than 4*N bytes
+        expected[4 * N + i] = actual[4 * N + i] = 0;
+      }
+      StoreInterleaved4(in0, in1, in2, in3, d, actual);
+      size_t pos = 0;
+      if (!BytesEqual(expected.get(), actual, 5 * N * sizeof(T), &pos)) {
+        Print(d, "in0", in0, pos / 4);
+        Print(d, "in1", in1, pos / 4);
+        Print(d, "in2", in2, pos / 4);
+        Print(d, "in3", in3, pos / 4);
+        const size_t i = pos;
+        fprintf(stderr, "interleaved i=%d %f %f %f %f  %f %f %f %f\n",
+                static_cast<int>(i), static_cast<double>(actual[i]),
+                static_cast<double>(actual[i + 1]),
+                static_cast<double>(actual[i + 2]),
+                static_cast<double>(actual[i + 3]),
+                static_cast<double>(actual[i + 4]),
+                static_cast<double>(actual[i + 5]),
+                static_cast<double>(actual[i + 6]),
+                static_cast<double>(actual[i + 7]));
+        HWY_ASSERT(false);
+      }
+
+      Vec<D> out0, out1, out2, out3;
+      LoadInterleaved4(d, actual, out0, out1, out2, out3);
+      HWY_ASSERT_VEC_EQ(d, in0, out0);
+      HWY_ASSERT_VEC_EQ(d, in1, out1);
+      HWY_ASSERT_VEC_EQ(d, in2, out2);
+      HWY_ASSERT_VEC_EQ(d, in3, out3);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllLoadStoreInterleaved4() {
+#if HWY_TARGET == HWY_RVV
+  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+  const ForExtendableVectors<TestLoadStoreInterleaved4, 2> test;
+#else
+  const ForPartialVectors<TestLoadStoreInterleaved4> test;
+#endif
+  ForAllTypes(test);
+}
+
+#endif  // !HWY_BROKEN_LOAD34
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyInterleavedTest);
+HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved2);
+#if !HWY_BROKEN_LOAD34
+HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved3);
+HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved4);
+#endif
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/list_targets.cc b/third_party/highway/hwy/tests/list_targets.cc
new file mode 100644
index 0000000000..d09ee4fe86
--- /dev/null
+++ b/third_party/highway/hwy/tests/list_targets.cc
@@ -0,0 +1,71 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Simple tool to print the list of targets that were compiled in when building
+// this tool.
+
+#include <stdio.h>
+
+#include "hwy/highway.h"
+
+void PrintTargets(const char* msg, int64_t targets) {
+  fprintf(stderr, "%s", msg);
+  // For each bit:
+  for (int64_t x = targets; x != 0; x = x & (x - 1)) {
+    // Extract value of least-significant bit.
+    fprintf(stderr, " %s", hwy::TargetName(x & (~x + 1)));
+  }
+  fprintf(stderr, "\n");
+}
+
+int main() {
+#ifdef HWY_COMPILE_ONLY_EMU128
+  const int only_emu128 = 1;
+#else
+  const int only_emu128 = 0;
+#endif
+#ifdef HWY_COMPILE_ONLY_SCALAR
+  const int only_scalar = 1;
+#else
+  const int only_scalar = 0;
+#endif
+#ifdef HWY_COMPILE_ONLY_STATIC
+  const int only_static = 1;
+#else
+  const int only_static = 0;
+#endif
+#ifdef HWY_COMPILE_ALL_ATTAINABLE
+  const int all_attain = 1;
+#else
+  const int all_attain = 0;
+#endif
+#ifdef HWY_IS_TEST
+  const int is_test = 1;
+#else
+  const int is_test = 0;
+#endif
+
+  fprintf(stderr,
+          "Config: emu128:%d scalar:%d static:%d all_attain:%d is_test:%d\n",
+          only_emu128, only_scalar, only_static, all_attain, is_test);
+  PrintTargets("Compiled HWY_TARGETS:  ", HWY_TARGETS);
+  PrintTargets("HWY_ATTAINABLE_TARGETS:", HWY_ATTAINABLE_TARGETS);
+  PrintTargets("HWY_BASELINE_TARGETS:  ", HWY_BASELINE_TARGETS);
+  PrintTargets("HWY_STATIC_TARGET:     ", HWY_STATIC_TARGET);
+  PrintTargets("HWY_BROKEN_TARGETS:    ", HWY_BROKEN_TARGETS);
+  PrintTargets("HWY_DISABLED_TARGETS:  ", HWY_DISABLED_TARGETS);
+  PrintTargets("Current CPU supports:  ", hwy::SupportedTargets());
+  return 0;
+}
diff --git a/third_party/highway/hwy/tests/logical_test.cc b/third_party/highway/hwy/tests/logical_test.cc
new file mode 100644
index 0000000000..b646f5ff4b
--- /dev/null
+++ b/third_party/highway/hwy/tests/logical_test.cc
@@ -0,0 +1,246 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memcmp
+
+#include "hwy/aligned_allocator.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/logical_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestNot {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto ones = VecFromMask(d, Eq(v0, v0));
+    const auto v1 = Set(d, 1);
+    const auto vnot1 = Set(d, T(~T(1)));
+
+    HWY_ASSERT_VEC_EQ(d, v0, Not(ones));
+    HWY_ASSERT_VEC_EQ(d, ones, Not(v0));
+    HWY_ASSERT_VEC_EQ(d, v1, Not(vnot1));
+    HWY_ASSERT_VEC_EQ(d, vnot1, Not(v1));
+  }
+};
+
+HWY_NOINLINE void TestAllNot() {
+  ForIntegerTypes(ForPartialVectors<TestNot>());
+}
+
+struct TestLogical {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vi = Iota(d, 0);
+
+    auto v = vi;
+    v = And(v, vi);
+    HWY_ASSERT_VEC_EQ(d, vi, v);
+    v = And(v, v0);
+    HWY_ASSERT_VEC_EQ(d, v0, v);
+
+    v = Or(v, vi);
+    HWY_ASSERT_VEC_EQ(d, vi, v);
+    v = Or(v, v0);
+    HWY_ASSERT_VEC_EQ(d, vi, v);
+
+    v = Xor(v, vi);
+    HWY_ASSERT_VEC_EQ(d, v0, v);
+    v = Xor(v, v0);
+    HWY_ASSERT_VEC_EQ(d, v0, v);
+
+    HWY_ASSERT_VEC_EQ(d, v0, And(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, v0, And(vi, v0));
+    HWY_ASSERT_VEC_EQ(d, vi, And(vi, vi));
+
+    HWY_ASSERT_VEC_EQ(d, vi, Or(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, vi, Or(vi, v0));
+    HWY_ASSERT_VEC_EQ(d, vi, Or(vi, vi));
+
+    HWY_ASSERT_VEC_EQ(d, vi, Xor(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, vi, Xor(vi, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, Xor(vi, vi));
+
+    HWY_ASSERT_VEC_EQ(d, vi, AndNot(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi));
+
+    HWY_ASSERT_VEC_EQ(d, v0, Or3(v0, v0, v0));
+    HWY_ASSERT_VEC_EQ(d, vi, Or3(v0, vi, v0));
+    HWY_ASSERT_VEC_EQ(d, vi, Or3(v0, v0, vi));
+    HWY_ASSERT_VEC_EQ(d, vi, Or3(v0, vi, vi));
+    HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, v0, v0));
+    HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, vi, v0));
+    HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, v0, vi));
+    HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, vi, vi));
+
+    HWY_ASSERT_VEC_EQ(d, v0, Xor3(v0, v0, v0));
+    HWY_ASSERT_VEC_EQ(d, vi, Xor3(v0, vi, v0));
+    HWY_ASSERT_VEC_EQ(d, vi, Xor3(v0, v0, vi));
+    HWY_ASSERT_VEC_EQ(d, v0, Xor3(v0, vi, vi));
+    HWY_ASSERT_VEC_EQ(d, vi, Xor3(vi, v0, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, Xor3(vi, vi, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, Xor3(vi, v0, vi));
+    HWY_ASSERT_VEC_EQ(d, vi, Xor3(vi, vi, vi));
+
+    HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, vi, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, vi));
+    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(v0, vi, vi));
+    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, v0));
+    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, v0));
+    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, vi));
+    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, vi));
+  }
+};
+
+HWY_NOINLINE void TestAllLogical() {
+  ForAllTypes(ForPartialVectors<TestLogical>());
+}
+
+struct TestCopySign {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vp = Iota(d, 1);
+    const auto vn = Iota(d, T(-1E5));  // assumes N < 10^5
+
+    // Zero remains zero regardless of sign
+    HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, vp));
+    HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, vn));
+    HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, vp));
+    HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, vn));
+
+    // Positive input, positive sign => unchanged
+    HWY_ASSERT_VEC_EQ(d, vp, CopySign(vp, vp));
+    HWY_ASSERT_VEC_EQ(d, vp, CopySignToAbs(vp, vp));
+
+    // Positive input, negative sign => negated
+    HWY_ASSERT_VEC_EQ(d, Neg(vp), CopySign(vp, vn));
+    HWY_ASSERT_VEC_EQ(d, Neg(vp), CopySignToAbs(vp, vn));
+
+    // Negative input, negative sign => unchanged
+    HWY_ASSERT_VEC_EQ(d, vn, CopySign(vn, vn));
+
+    // Negative input, positive sign => negated
+    HWY_ASSERT_VEC_EQ(d, Neg(vn), CopySign(vn, vp));
+  }
+};
+
+HWY_NOINLINE void TestAllCopySign() {
+  ForFloatTypes(ForPartialVectors<TestCopySign>());
+}
+
+struct TestBroadcastSignBit {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto s0 = Zero(d);
+    const auto s1 = Set(d, -1);  // all bit set
+    const auto vpos = And(Iota(d, 0), Set(d, LimitsMax<T>()));
+    const auto vneg = Sub(s1, vpos);
+
+    HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(vpos));
+    HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(Set(d, LimitsMax<T>())));
+
+    HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(vneg));
+    HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(Set(d, LimitsMin<T>())));
+    HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(Set(d, LimitsMin<T>() / 2)));
+  }
+};
+
+HWY_NOINLINE void TestAllBroadcastSignBit() {
+  ForSignedTypes(ForPartialVectors<TestBroadcastSignBit>());
+}
+
+struct TestTestBit {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t kNumBits = sizeof(T) * 8;
+    for (size_t i = 0; i < kNumBits; ++i) {
+      const auto bit1 = Set(d, T(1ull << i));
+      const auto bit2 = Set(d, T(1ull << ((i + 1) % kNumBits)));
+      const auto bit3 = Set(d, T(1ull << ((i + 2) % kNumBits)));
+      const auto bits12 = Or(bit1, bit2);
+      const auto bits23 = Or(bit2, bit3);
+      HWY_ASSERT(AllTrue(d, TestBit(bit1, bit1)));
+      HWY_ASSERT(AllTrue(d, TestBit(bits12, bit1)));
+      HWY_ASSERT(AllTrue(d, TestBit(bits12, bit2)));
+
+      HWY_ASSERT(AllFalse(d, TestBit(bits12, bit3)));
+      HWY_ASSERT(AllFalse(d, TestBit(bits23, bit1)));
+      HWY_ASSERT(AllFalse(d, TestBit(bit1, bit2)));
+      HWY_ASSERT(AllFalse(d, TestBit(bit2, bit1)));
+      HWY_ASSERT(AllFalse(d, TestBit(bit1, bit3)));
+      HWY_ASSERT(AllFalse(d, TestBit(bit3, bit1)));
+      HWY_ASSERT(AllFalse(d, TestBit(bit2, bit3)));
+      HWY_ASSERT(AllFalse(d, TestBit(bit3, bit2)));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllTestBit() {
+  ForIntegerTypes(ForPartialVectors<TestTestBit>());
+}
+
+struct TestPopulationCount {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+    size_t N = Lanes(d);
+    auto data = AllocateAligned<T>(N);
+    auto popcnt = AllocateAligned<T>(N);
+    for (size_t i = 0; i < AdjustedReps(1 << 18) / N; i++) {
+      for (size_t i = 0; i < N; i++) {
+        data[i] = static_cast<T>(rng());
+        popcnt[i] = static_cast<T>(PopCount(data[i]));
+      }
+      HWY_ASSERT_VEC_EQ(d, popcnt.get(), PopulationCount(Load(d, data.get())));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllPopulationCount() {
+  ForUnsignedTypes(ForPartialVectors<TestPopulationCount>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyLogicalTest);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllNot);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogical);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllPopulationCount);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/mask_mem_test.cc b/third_party/highway/hwy/tests/mask_mem_test.cc
new file mode 100644
index 0000000000..c44119dcd7
--- /dev/null
+++ b/third_party/highway/hwy/tests/mask_mem_test.cc
@@ -0,0 +1,197 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS  // before inttypes.h
+#endif
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memcmp
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/mask_mem_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestMaskedLoad {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    using TI = MakeSigned<T>;  // For mask > 0 comparison
+    const Rebind<TI, D> di;
+    const size_t N = Lanes(d);
+    auto bool_lanes = AllocateAligned<TI>(N);
+
+    auto lanes = AllocateAligned<T>(N);
+    Store(Iota(d, T{1}), d, lanes.get());
+
+    // Each lane should have a chance of having mask=true.
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
+      }
+
+      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+      const auto expected = IfThenElseZero(mask, Load(d, lanes.get()));
+      const auto actual = MaskedLoad(mask, d, lanes.get());
+      HWY_ASSERT_VEC_EQ(d, expected, actual);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllMaskedLoad() {
+  ForAllTypes(ForPartialVectors<TestMaskedLoad>());
+}
+
+struct TestBlendedStore {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    using TI = MakeSigned<T>;  // For mask > 0 comparison
+    const Rebind<TI, D> di;
+    const size_t N = Lanes(d);
+    auto bool_lanes = AllocateAligned<TI>(N);
+
+    const Vec<D> v = Iota(d, T{1});
+    auto actual = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<T>(N);
+
+    // Each lane should have a chance of having mask=true.
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
+        // Re-initialize to something distinct from v[i].
+        actual[i] = static_cast<T>(127 - (i & 127));
+        expected[i] = bool_lanes[i] ? static_cast<T>(i + 1) : actual[i];
+      }
+
+      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+      BlendedStore(v, mask, d, actual.get());
+      HWY_ASSERT_VEC_EQ(d, expected.get(), Load(d, actual.get()));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllBlendedStore() {
+  ForAllTypes(ForPartialVectors<TestBlendedStore>());
+}
+
+class TestStoreMaskBits {
+ public:
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*t*/, D /*d*/) {
+    RandomState rng;
+    using TI = MakeSigned<T>;  // For mask > 0 comparison
+    const Rebind<TI, D> di;
+    const size_t N = Lanes(di);
+    auto bool_lanes = AllocateAligned<TI>(N);
+
+    const ScalableTag<uint8_t, -3> d_bits;
+    const size_t expected_num_bytes = (N + 7) / 8;
+    auto expected = AllocateAligned<uint8_t>(expected_num_bytes);
+    auto actual = AllocateAligned<uint8_t>(HWY_MAX(8, expected_num_bytes));
+
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      // Generate random mask pattern.
+      for (size_t i = 0; i < N; ++i) {
+        bool_lanes[i] = static_cast<TI>((rng() & 1024) ? 1 : 0);
+      }
+      const auto bools = Load(di, bool_lanes.get());
+      const auto mask = Gt(bools, Zero(di));
+
+      // Requires at least 8 bytes, ensured above.
+      const size_t bytes_written = StoreMaskBits(di, mask, actual.get());
+      if (bytes_written != expected_num_bytes) {
+        fprintf(stderr, "%s expected %" PRIu64 " bytes, actual %" PRIu64 "\n",
+                TypeName(T(), N).c_str(),
+                static_cast<uint64_t>(expected_num_bytes),
+                static_cast<uint64_t>(bytes_written));
+
+        HWY_ASSERT(false);
+      }
+
+      // Requires at least 8 bytes, ensured above.
+      const auto mask2 = LoadMaskBits(di, actual.get());
+      HWY_ASSERT_MASK_EQ(di, mask, mask2);
+
+      memset(expected.get(), 0, expected_num_bytes);
+      for (size_t i = 0; i < N; ++i) {
+        expected[i / 8] =
+            static_cast<uint8_t>(expected[i / 8] | (bool_lanes[i] << (i % 8)));
+      }
+
+      size_t i = 0;
+      // Stored bits must match original mask
+      for (; i < N; ++i) {
+        const TI is_set = (actual[i / 8] & (1 << (i % 8))) ? 1 : 0;
+        if (is_set != bool_lanes[i]) {
+          fprintf(stderr, "%s lane %" PRIu64 ": expected %d, actual %d\n",
+                  TypeName(T(), N).c_str(), static_cast<uint64_t>(i),
+                  static_cast<int>(bool_lanes[i]), static_cast<int>(is_set));
+          Print(di, "bools", bools, 0, N);
+          Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
+                expected_num_bytes);
+          Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
+                expected_num_bytes);
+
+          HWY_ASSERT(false);
+        }
+      }
+      // Any partial bits in the last byte must be zero
+      for (; i < 8 * bytes_written; ++i) {
+        const int bit = (actual[i / 8] & (1 << (i % 8)));
+        if (bit != 0) {
+          fprintf(stderr, "%s: bit #%" PRIu64 " should be zero\n",
+                  TypeName(T(), N).c_str(), static_cast<uint64_t>(i));
+          Print(di, "bools", bools, 0, N);
+          Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
+                expected_num_bytes);
+          Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
+                expected_num_bytes);
+
+          HWY_ASSERT(false);
+        }
+      }
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllStoreMaskBits() {
+  ForAllTypes(ForPartialVectors<TestStoreMaskBits>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyMaskTest);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskedLoad);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllBlendedStore);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllStoreMaskBits);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/mask_test.cc b/third_party/highway/hwy/tests/mask_test.cc
new file mode 100644
index 0000000000..cf0d2d4ee8
--- /dev/null
+++ b/third_party/highway/hwy/tests/mask_test.cc
@@ -0,0 +1,295 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memcmp
+
+#include <algorithm>  // std::fill
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/mask_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// All types.
+struct TestFromVec {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+
+    memset(lanes.get(), 0, N * sizeof(T));
+    const auto actual_false = MaskFromVec(Load(d, lanes.get()));
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), actual_false);
+
+    memset(lanes.get(), 0xFF, N * sizeof(T));
+    const auto actual_true = MaskFromVec(Load(d, lanes.get()));
+    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), actual_true);
+  }
+};
+
+HWY_NOINLINE void TestAllFromVec() {
+  ForAllTypes(ForPartialVectors<TestFromVec>());
+}
+
+struct TestFirstN {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto bool_lanes = AllocateAligned<T>(N);
+
+    using TN = SignedFromSize<HWY_MIN(sizeof(size_t), sizeof(T))>;
+    const size_t max_len = static_cast<size_t>(LimitsMax<TN>());
+
+    const size_t max_lanes = HWY_MIN(2 * N, AdjustedReps(512));
+    for (size_t len = 0; len <= HWY_MIN(max_lanes, max_len); ++len) {
+      // Loop instead of Iota+Lt to avoid wraparound for 8-bit T.
+      for (size_t i = 0; i < N; ++i) {
+        bool_lanes[i] = (i < len) ? T{1} : 0;
+      }
+      const auto expected = Eq(Load(d, bool_lanes.get()), Set(d, T{1}));
+      HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, len));
+    }
+
+    // Also ensure huge values yield all-true (unless the vector is actually
+    // larger than max_len).
+    for (size_t i = 0; i < N; ++i) {
+      bool_lanes[i] = (i < max_len) ? T{1} : 0;
+    }
+    const auto expected = Eq(Load(d, bool_lanes.get()), Set(d, T{1}));
+    HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, max_len));
+  }
+};
+
+HWY_NOINLINE void TestAllFirstN() {
+  ForAllTypes(ForPartialVectors<TestFirstN>());
+}
+
+struct TestMaskVec {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    using TI = MakeSigned<T>;  // For mask > 0 comparison
+    const Rebind<TI, D> di;
+    const size_t N = Lanes(d);
+    auto bool_lanes = AllocateAligned<TI>(N);
+
+    // Each lane should have a chance of having mask=true.
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
+      }
+
+      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+      HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask)));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllMaskVec() {
+  const ForPartialVectors<TestMaskVec> test;
+
+  test(uint16_t());
+  test(int16_t());
+  // TODO(janwas): float16_t - cannot compare yet
+
+  ForUIF3264(test);
+}
+
+struct TestAllTrueFalse {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto zero = Zero(d);
+    auto v = zero;
+
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+    std::fill(lanes.get(), lanes.get() + N, T(0));
+
+    HWY_ASSERT(AllTrue(d, Eq(v, zero)));
+    HWY_ASSERT(!AllFalse(d, Eq(v, zero)));
+
+    // Single lane implies AllFalse = !AllTrue. Otherwise, there are multiple
+    // lanes and one is nonzero.
+    const bool expected_all_false = (N != 1);
+
+    // Set each lane to nonzero and back to zero
+    for (size_t i = 0; i < N; ++i) {
+      lanes[i] = T(1);
+      v = Load(d, lanes.get());
+
+      HWY_ASSERT(!AllTrue(d, Eq(v, zero)));
+
+      HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero)));
+
+      lanes[i] = T(-1);
+      v = Load(d, lanes.get());
+      HWY_ASSERT(!AllTrue(d, Eq(v, zero)));
+      HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero)));
+
+      // Reset to all zero
+      lanes[i] = T(0);
+      v = Load(d, lanes.get());
+      HWY_ASSERT(AllTrue(d, Eq(v, zero)));
+      HWY_ASSERT(!AllFalse(d, Eq(v, zero)));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllAllTrueFalse() {
+  ForAllTypes(ForPartialVectors<TestAllTrueFalse>());
+}
+
+struct TestCountTrue {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using TI = MakeSigned<T>;  // For mask > 0 comparison
+    const Rebind<TI, D> di;
+    const size_t N = Lanes(di);
+    auto bool_lanes = AllocateAligned<TI>(N);
+    memset(bool_lanes.get(), 0, N * sizeof(TI));
+
+    // For all combinations of zero/nonzero state of subset of lanes:
+    const size_t max_lanes = HWY_MIN(N, size_t(10));
+
+    for (size_t code = 0; code < (1ull << max_lanes); ++code) {
+      // Number of zeros written = number of mask lanes that are true.
+      size_t expected = 0;
+      for (size_t i = 0; i < max_lanes; ++i) {
+        const bool is_true = (code & (1ull << i)) != 0;
+        bool_lanes[i] = is_true ? TI(1) : TI(0);
+        expected += is_true;
+      }
+
+      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+      const size_t actual = CountTrue(d, mask);
+      HWY_ASSERT_EQ(expected, actual);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllCountTrue() {
+  ForAllTypes(ForPartialVectors<TestCountTrue>());
+}
+
+struct TestFindFirstTrue {  // Also FindKnownFirstTrue
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using TI = MakeSigned<T>;  // For mask > 0 comparison
+    const Rebind<TI, D> di;
+    const size_t N = Lanes(di);
+    auto bool_lanes = AllocateAligned<TI>(N);
+    memset(bool_lanes.get(), 0, N * sizeof(TI));
+
+    // For all combinations of zero/nonzero state of subset of lanes:
+    const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(9)));
+
+    HWY_ASSERT_EQ(intptr_t(-1), FindFirstTrue(d, MaskFalse(d)));
+    HWY_ASSERT_EQ(intptr_t(0), FindFirstTrue(d, MaskTrue(d)));
+    HWY_ASSERT_EQ(size_t(0), FindKnownFirstTrue(d, MaskTrue(d)));
+
+    for (size_t code = 1; code < (1ull << max_lanes); ++code) {
+      for (size_t i = 0; i < max_lanes; ++i) {
+        bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);
+      }
+
+      const size_t expected =
+          Num0BitsBelowLS1Bit_Nonzero32(static_cast<uint32_t>(code));
+      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+      HWY_ASSERT_EQ(static_cast<intptr_t>(expected), FindFirstTrue(d, mask));
+      HWY_ASSERT_EQ(expected, FindKnownFirstTrue(d, mask));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllFindFirstTrue() {
+  ForAllTypes(ForPartialVectors<TestFindFirstTrue>());
+}
+
+struct TestLogicalMask {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto m0 = MaskFalse(d);
+    const auto m_all = MaskTrue(d);
+
+    using TI = MakeSigned<T>;  // For mask > 0 comparison
+    const Rebind<TI, D> di;
+    const size_t N = Lanes(di);
+    auto bool_lanes = AllocateAligned<TI>(N);
+    memset(bool_lanes.get(), 0, N * sizeof(TI));
+
+    HWY_ASSERT_MASK_EQ(d, m0, Not(m_all));
+    HWY_ASSERT_MASK_EQ(d, m_all, Not(m0));
+
+    Print(d, ".", VecFromMask(d, ExclusiveNeither(m0, m0)));
+    HWY_ASSERT_MASK_EQ(d, m_all, ExclusiveNeither(m0, m0));
+    HWY_ASSERT_MASK_EQ(d, m0, ExclusiveNeither(m_all, m0));
+    HWY_ASSERT_MASK_EQ(d, m0, ExclusiveNeither(m0, m_all));
+
+    // For all combinations of zero/nonzero state of subset of lanes:
+    const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6)));
+    for (size_t code = 0; code < (1ull << max_lanes); ++code) {
+      for (size_t i = 0; i < max_lanes; ++i) {
+        bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);
+      }
+
+      const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+
+      HWY_ASSERT_MASK_EQ(d, m0, Xor(m, m));
+      HWY_ASSERT_MASK_EQ(d, m0, AndNot(m, m));
+      HWY_ASSERT_MASK_EQ(d, m0, AndNot(m_all, m));
+
+      HWY_ASSERT_MASK_EQ(d, m, Or(m, m));
+      HWY_ASSERT_MASK_EQ(d, m, Or(m0, m));
+      HWY_ASSERT_MASK_EQ(d, m, Or(m, m0));
+      HWY_ASSERT_MASK_EQ(d, m, Xor(m0, m));
+      HWY_ASSERT_MASK_EQ(d, m, Xor(m, m0));
+      HWY_ASSERT_MASK_EQ(d, m, And(m, m));
+      HWY_ASSERT_MASK_EQ(d, m, And(m_all, m));
+      HWY_ASSERT_MASK_EQ(d, m, And(m, m_all));
+      HWY_ASSERT_MASK_EQ(d, m, AndNot(m0, m));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllLogicalMask() {
+  ForAllTypes(ForPartialVectors<TestLogicalMask>());
+}
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyMaskTest);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFromVec);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFirstN);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskVec);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllAllTrueFalse);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllCountTrue);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindFirstTrue);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllLogicalMask);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/memory_test.cc b/third_party/highway/hwy/tests/memory_test.cc
new file mode 100644
index 0000000000..d17addf544
--- /dev/null
+++ b/third_party/highway/hwy/tests/memory_test.cc
@@ -0,0 +1,343 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Ensure incompabilities with Windows macros (e.g. #define StoreFence) are
+// detected. Must come before Highway headers.
+#include "hwy/base.h"
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>  // std::fill
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/memory_test.cc"
+#include "hwy/cache_control.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestLoadStore {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const auto hi = Iota(d, static_cast<T>(1 + N));
+    const auto lo = Iota(d, 1);
+    auto lanes = AllocateAligned<T>(2 * N);
+    Store(hi, d, &lanes[N]);
+    Store(lo, d, &lanes[0]);
+
+    // Aligned load
+    const auto lo2 = Load(d, &lanes[0]);
+    HWY_ASSERT_VEC_EQ(d, lo2, lo);
+
+    // Aligned store
+    auto lanes2 = AllocateAligned<T>(2 * N);
+    Store(lo2, d, &lanes2[0]);
+    Store(hi, d, &lanes2[N]);
+    for (size_t i = 0; i < 2 * N; ++i) {
+      HWY_ASSERT_EQ(lanes[i], lanes2[i]);
+    }
+
+    // Unaligned load
+    const auto vu = LoadU(d, &lanes[1]);
+    auto lanes3 = AllocateAligned<T>(N);
+    Store(vu, d, lanes3.get());
+    for (size_t i = 0; i < N; ++i) {
+      HWY_ASSERT_EQ(T(i + 2), lanes3[i]);
+    }
+
+    // Unaligned store
+    StoreU(lo2, d, &lanes2[N / 2]);
+    size_t i = 0;
+    for (; i < N / 2; ++i) {
+      HWY_ASSERT_EQ(lanes[i], lanes2[i]);
+    }
+    for (; i < 3 * N / 2; ++i) {
+      HWY_ASSERT_EQ(T(i - N / 2 + 1), lanes2[i]);
+    }
+    // Subsequent values remain unchanged.
+    for (; i < 2 * N; ++i) {
+      HWY_ASSERT_EQ(T(i + 1), lanes2[i]);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllLoadStore() {
+  ForAllTypes(ForPartialVectors<TestLoadStore>());
+}
+
+struct TestSafeCopyN {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const auto v = Iota(d, 1);
+    auto from = AllocateAligned<T>(N + 2);
+    auto to = AllocateAligned<T>(N + 2);
+    Store(v, d, from.get());
+
+    // 0: nothing changes
+    to[0] = T();
+    SafeCopyN(0, d, from.get(), to.get());
+    HWY_ASSERT_EQ(T(), to[0]);
+
+    // 1: only first changes
+    to[1] = T();
+    SafeCopyN(1, d, from.get(), to.get());
+    HWY_ASSERT_EQ(static_cast<T>(1), to[0]);
+    HWY_ASSERT_EQ(T(), to[1]);
+
+    // N-1: last does not change
+    to[N - 1] = T();
+    SafeCopyN(N - 1, d, from.get(), to.get());
+    HWY_ASSERT_EQ(T(), to[N - 1]);
+    // Also check preceding lanes
+    to[N - 1] = static_cast<T>(N);
+    HWY_ASSERT_VEC_EQ(d, to.get(), v);
+
+    // N: all change
+    to[N] = T();
+    SafeCopyN(N, d, from.get(), to.get());
+    HWY_ASSERT_VEC_EQ(d, to.get(), v);
+    HWY_ASSERT_EQ(T(), to[N]);
+
+    // N+1: subsequent lane does not change if using masked store
+    to[N + 1] = T();
+    SafeCopyN(N + 1, d, from.get(), to.get());
+    HWY_ASSERT_VEC_EQ(d, to.get(), v);
+#if !HWY_MEM_OPS_MIGHT_FAULT
+    HWY_ASSERT_EQ(T(), to[N + 1]);
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllSafeCopyN() {
+  ForAllTypes(ForPartialVectors<TestSafeCopyN>());
+}
+
+struct TestLoadDup128 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Scalar does not define LoadDup128.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+    constexpr size_t N128 = 16 / sizeof(T);
+    alignas(16) T lanes[N128];
+    for (size_t i = 0; i < N128; ++i) {
+      lanes[i] = static_cast<T>(1 + i);
+    }
+
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>(i % N128 + 1);
+    }
+
+    HWY_ASSERT_VEC_EQ(d, expected.get(), LoadDup128(d, lanes));
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllLoadDup128() {
+  ForAllTypes(ForGEVectors<128, TestLoadDup128>());
+}
+
+struct TestStream {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v = Iota(d, T(1));
+    const size_t affected_bytes =
+        (Lanes(d) * sizeof(T) + HWY_STREAM_MULTIPLE - 1) &
+        ~size_t(HWY_STREAM_MULTIPLE - 1);
+    const size_t affected_lanes = affected_bytes / sizeof(T);
+    auto out = AllocateAligned<T>(2 * affected_lanes);
+    std::fill(out.get(), out.get() + 2 * affected_lanes, T(0));
+
+    Stream(v, d, out.get());
+    FlushStream();
+    const auto actual = Load(d, out.get());
+    HWY_ASSERT_VEC_EQ(d, v, actual);
+    // Ensure Stream didn't modify more memory than expected
+    for (size_t i = affected_lanes; i < 2 * affected_lanes; ++i) {
+      HWY_ASSERT_EQ(T(0), out[i]);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllStream() {
+  const ForPartialVectors<TestStream> test;
+  // No u8,u16.
+  test(uint32_t());
+  test(uint64_t());
+  // No i8,i16.
+  test(int32_t());
+  test(int64_t());
+  ForFloatTypes(test);
+}
+
+// Assumes little-endian byte order!
+struct TestScatter {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using Offset = MakeSigned<T>;
+
+    const size_t N = Lanes(d);
+    const size_t range = 4 * N;                  // number of items to scatter
+    const size_t max_bytes = range * sizeof(T);  // upper bound on offset
+
+    RandomState rng;
+
+    // Data to be scattered
+    auto bytes = AllocateAligned<uint8_t>(max_bytes);
+    for (size_t i = 0; i < max_bytes; ++i) {
+      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
+    }
+    const auto data = Load(d, reinterpret_cast<const T*>(bytes.get()));
+
+    // Scatter into these regions, ensure vector results match scalar
+    auto expected = AllocateAligned<T>(range);
+    auto actual = AllocateAligned<T>(range);
+
+    const Rebind<Offset, D> d_offsets;
+    auto offsets = AllocateAligned<Offset>(N);  // or indices
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      // Byte offsets
+      std::fill(expected.get(), expected.get() + range, T(0));
+      std::fill(actual.get(), actual.get() + range, T(0));
+      for (size_t i = 0; i < N; ++i) {
+        // Must be aligned
+        offsets[i] = static_cast<Offset>((Random32(&rng) % range) * sizeof(T));
+        CopyBytes<sizeof(T)>(
+            bytes.get() + i * sizeof(T),
+            reinterpret_cast<uint8_t*>(expected.get()) + offsets[i]);
+      }
+      const auto voffsets = Load(d_offsets, offsets.get());
+      ScatterOffset(data, d, actual.get(), voffsets);
+      if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
+        Print(d, "Data", data);
+        Print(d_offsets, "Offsets", voffsets);
+        HWY_ASSERT(false);
+      }
+
+      // Indices
+      std::fill(expected.get(), expected.get() + range, T(0));
+      std::fill(actual.get(), actual.get() + range, T(0));
+      for (size_t i = 0; i < N; ++i) {
+        offsets[i] = static_cast<Offset>(Random32(&rng) % range);
+        CopyBytes<sizeof(T)>(bytes.get() + i * sizeof(T),
+                             &expected[size_t(offsets[i])]);
+      }
+      const auto vindices = Load(d_offsets, offsets.get());
+      ScatterIndex(data, d, actual.get(), vindices);
+      if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
+        Print(d, "Data", data);
+        Print(d_offsets, "Indices", vindices);
+        HWY_ASSERT(false);
+      }
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllScatter() {
+  ForUIF3264(ForPartialVectors<TestScatter>());
+}
+
+struct TestGather {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using Offset = MakeSigned<T>;
+
+    const size_t N = Lanes(d);
+    const size_t range = 4 * N;                  // number of items to gather
+    const size_t max_bytes = range * sizeof(T);  // upper bound on offset
+
+    RandomState rng;
+
+    // Data to be gathered from
+    auto bytes = AllocateAligned<uint8_t>(max_bytes);
+    for (size_t i = 0; i < max_bytes; ++i) {
+      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
+    }
+
+    auto expected = AllocateAligned<T>(N);
+    auto offsets = AllocateAligned<Offset>(N);
+    auto indices = AllocateAligned<Offset>(N);
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      // Offsets
+      for (size_t i = 0; i < N; ++i) {
+        // Must be aligned
+        offsets[i] = static_cast<Offset>((Random32(&rng) % range) * sizeof(T));
+        CopyBytes<sizeof(T)>(bytes.get() + offsets[i], &expected[i]);
+      }
+
+      const Rebind<Offset, D> d_offset;
+      const T* base = reinterpret_cast<const T*>(bytes.get());
+      auto actual = GatherOffset(d, base, Load(d_offset, offsets.get()));
+      HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
+
+      // Indices
+      for (size_t i = 0; i < N; ++i) {
+        indices[i] =
+            static_cast<Offset>(Random32(&rng) % (max_bytes / sizeof(T)));
+        CopyBytes<sizeof(T)>(base + indices[i], &expected[i]);
+      }
+      actual = GatherIndex(d, base, Load(d_offset, indices.get()));
+      HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllGather() {
+  ForUIF3264(ForPartialVectors<TestGather>());
+}
+
+HWY_NOINLINE void TestAllCache() {
+  LoadFence();
+  FlushStream();
+  int test = 0;
+  Prefetch(&test);
+  FlushCacheline(&test);
+  Pause();
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyMemoryTest);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllSafeCopyN);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/mul_test.cc b/third_party/highway/hwy/tests/mul_test.cc
new file mode 100644
index 0000000000..5622983cee
--- /dev/null
+++ b/third_party/highway/hwy/tests/mul_test.cc
@@ -0,0 +1,526 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/mul_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <size_t kBits>
+constexpr uint64_t FirstBits() {
+  return (1ull << kBits) - 1;
+}
+template <>
+constexpr uint64_t FirstBits<64>() {
+  return ~uint64_t{0};
+}
+
+struct TestUnsignedMul {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto v1 = Set(d, T(1));
+    const auto vi = Iota(d, 1);
+    const auto vj = Iota(d, 3);
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+
+    HWY_ASSERT_VEC_EQ(d, v0, Mul(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, v1, Mul(v1, v1));
+    HWY_ASSERT_VEC_EQ(d, vi, Mul(v1, vi));
+    HWY_ASSERT_VEC_EQ(d, vi, Mul(vi, v1));
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>((1 + i) * (1 + i));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi));
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>((1 + i) * (3 + i));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vj));
+
+    const T max = LimitsMax<T>();
+    const auto vmax = Set(d, max);
+    HWY_ASSERT_VEC_EQ(d, vmax, Mul(vmax, v1));
+    HWY_ASSERT_VEC_EQ(d, vmax, Mul(v1, vmax));
+
+    constexpr uint64_t kMask = FirstBits<sizeof(T) * 8>();
+    const T max2 = (static_cast<uint64_t>(max) * max) & kMask;
+    HWY_ASSERT_VEC_EQ(d, Set(d, max2), Mul(vmax, vmax));
+  }
+};
+
+struct TestSignedMul {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+
+    const auto v0 = Zero(d);
+    const auto v1 = Set(d, T(1));
+    const auto vi = Iota(d, 1);
+    const auto vn = Iota(d, -T(N));  // no i8 supported, so no wraparound
+    HWY_ASSERT_VEC_EQ(d, v0, Mul(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, v1, Mul(v1, v1));
+    HWY_ASSERT_VEC_EQ(d, vi, Mul(v1, vi));
+    HWY_ASSERT_VEC_EQ(d, vi, Mul(vi, v1));
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>((1 + i) * (1 + i));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi));
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>((-T(N) + T(i)) * T(1u + i));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vn, vi));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vn));
+  }
+};
+
+struct TestMulOverflow {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto vMax = Set(d, LimitsMax<T>());
+    HWY_ASSERT_VEC_EQ(d, Mul(vMax, vMax), Mul(vMax, vMax));
+  }
+};
+
+struct TestDivOverflow {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto vZero = Set(d, T(0));
+    const auto v1 = Set(d, T(1));
+    HWY_ASSERT_VEC_EQ(d, Div(v1, vZero), Div(v1, vZero));
+  }
+};
+
+HWY_NOINLINE void TestAllMul() {
+  const ForPartialVectors<TestUnsignedMul> test_unsigned;
+  // No u8.
+  test_unsigned(uint16_t());
+  test_unsigned(uint32_t());
+  test_unsigned(uint64_t());
+
+  const ForPartialVectors<TestSignedMul> test_signed;
+  // No i8.
+  test_signed(int16_t());
+  test_signed(int32_t());
+  test_signed(int64_t());
+
+  const ForPartialVectors<TestMulOverflow> test_mul_overflow;
+  test_mul_overflow(int16_t());
+  test_mul_overflow(int32_t());
+#if HWY_HAVE_INTEGER64
+  test_mul_overflow(int64_t());
+#endif
+
+  const ForPartialVectors<TestDivOverflow> test_div_overflow;
+  test_div_overflow(float());
+#if HWY_HAVE_FLOAT64
+  test_div_overflow(double());
+#endif
+}
+
+struct TestMulHigh {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using Wide = MakeWide<T>;
+    const size_t N = Lanes(d);
+    auto in_lanes = AllocateAligned<T>(N);
+    auto expected_lanes = AllocateAligned<T>(N);
+
+    const auto vi = Iota(d, 1);
+    // no i8 supported, so no wraparound
+    const auto vni = Iota(d, T(static_cast<T>(~N + 1)));
+
+    const auto v0 = Zero(d);
+    HWY_ASSERT_VEC_EQ(d, v0, MulHigh(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, MulHigh(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, v0, MulHigh(vi, v0));
+
+    // Large positive squared
+    for (size_t i = 0; i < N; ++i) {
+      in_lanes[i] = T(LimitsMax<T>() >> i);
+      expected_lanes[i] = T((Wide(in_lanes[i]) * in_lanes[i]) >> 16);
+    }
+    auto v = Load(d, in_lanes.get());
+    HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, v));
+
+    // Large positive * small positive
+    for (size_t i = 0; i < N; ++i) {
+      expected_lanes[i] = T((Wide(in_lanes[i]) * T(1u + i)) >> 16);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vi));
+    HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vi, v));
+
+    // Large positive * small negative
+    for (size_t i = 0; i < N; ++i) {
+      expected_lanes[i] = T((Wide(in_lanes[i]) * T(i - N)) >> 16);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vni));
+    HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vni, v));
+  }
+};
+
+HWY_NOINLINE void TestAllMulHigh() {
+  ForPartialVectors<TestMulHigh> test;
+  test(int16_t());
+  test(uint16_t());
+}
+
+struct TestMulFixedPoint15 {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    HWY_ASSERT_VEC_EQ(d, v0, MulFixedPoint15(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, MulFixedPoint15(v0, v0));
+
+    const size_t N = Lanes(d);
+    auto in1 = AllocateAligned<T>(N);
+    auto in2 = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<T>(N);
+
+    // Random inputs in each lane
+    RandomState rng;
+    for (size_t rep = 0; rep < AdjustedReps(10000); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        in1[i] = static_cast<T>(Random64(&rng) & 0xFFFF);
+        in2[i] = static_cast<T>(Random64(&rng) & 0xFFFF);
+      }
+
+      for (size_t i = 0; i < N; ++i) {
+        // There are three ways to compute the results. x86 and ARM are defined
+        // using 32-bit multiplication results:
+        const int arm = (2 * in1[i] * in2[i] + 0x8000) >> 16;
+        const int x86 = (((in1[i] * in2[i]) >> 14) + 1) >> 1;
+        // On other platforms, split the result into upper and lower 16 bits.
+        const auto v1 = Set(d, in1[i]);
+        const auto v2 = Set(d, in2[i]);
+        const int hi = GetLane(MulHigh(v1, v2));
+        const int lo = GetLane(Mul(v1, v2)) & 0xFFFF;
+        const int split = 2 * hi + ((lo + 0x4000) >> 15);
+        expected[i] = static_cast<T>(arm);
+        if (in1[i] != -32768 || in2[i] != -32768) {
+          HWY_ASSERT_EQ(arm, x86);
+          HWY_ASSERT_EQ(arm, split);
+        }
+      }
+
+      const auto a = Load(d, in1.get());
+      const auto b = Load(d, in2.get());
+      HWY_ASSERT_VEC_EQ(d, expected.get(), MulFixedPoint15(a, b));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllMulFixedPoint15() {
+  ForPartialVectors<TestMulFixedPoint15>()(int16_t());
+}
+
+struct TestMulEven {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using Wide = MakeWide<T>;
+    const Repartition<Wide, D> d2;
+    const auto v0 = Zero(d);
+    HWY_ASSERT_VEC_EQ(d2, Zero(d2), MulEven(v0, v0));
+
+    const size_t N = Lanes(d);
+    auto in_lanes = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<Wide>(Lanes(d2));
+    for (size_t i = 0; i < N; i += 2) {
+      in_lanes[i + 0] = LimitsMax<T>() >> i;
+      if (N != 1) {
+        in_lanes[i + 1] = 1;  // unused
+      }
+      expected[i / 2] = Wide(in_lanes[i + 0]) * in_lanes[i + 0];
+    }
+
+    const auto v = Load(d, in_lanes.get());
+    HWY_ASSERT_VEC_EQ(d2, expected.get(), MulEven(v, v));
+  }
+};
+
+struct TestMulEvenOdd64 {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    const auto v0 = Zero(d);
+    HWY_ASSERT_VEC_EQ(d, Zero(d), MulEven(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, Zero(d), MulOdd(v0, v0));
+
+    const size_t N = Lanes(d);
+    if (N == 1) return;
+
+    auto in1 = AllocateAligned<T>(N);
+    auto in2 = AllocateAligned<T>(N);
+    auto expected_even = AllocateAligned<T>(N);
+    auto expected_odd = AllocateAligned<T>(N);
+
+    // Random inputs in each lane
+    RandomState rng;
+    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        in1[i] = Random64(&rng);
+        in2[i] = Random64(&rng);
+      }
+
+      for (size_t i = 0; i < N; i += 2) {
+        expected_even[i] = Mul128(in1[i], in2[i], &expected_even[i + 1]);
+        expected_odd[i] = Mul128(in1[i + 1], in2[i + 1], &expected_odd[i + 1]);
+      }
+
+      const auto a = Load(d, in1.get());
+      const auto b = Load(d, in2.get());
+      HWY_ASSERT_VEC_EQ(d, expected_even.get(), MulEven(a, b));
+      HWY_ASSERT_VEC_EQ(d, expected_odd.get(), MulOdd(a, b));
+    }
+#else
+    (void)d;
+#endif  // HWY_TARGET != HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllMulEven() {
+  ForGEVectors<64, TestMulEven> test;
+  test(int32_t());
+  test(uint32_t());
+
+  ForGEVectors<128, TestMulEvenOdd64>()(uint64_t());
+}
+
+#ifndef HWY_NATIVE_FMA
+#error "Bug in set_macros-inl.h, did not set HWY_NATIVE_FMA"
+#endif
+
+struct TestMulAdd {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto k0 = Zero(d);
+    const auto kNeg0 = Set(d, T(-0.0));
+    const auto v1 = Iota(d, 1);
+    const auto v2 = Iota(d, 2);
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    HWY_ASSERT_VEC_EQ(d, k0, MulAdd(k0, k0, k0));
+    HWY_ASSERT_VEC_EQ(d, v2, MulAdd(k0, v1, v2));
+    HWY_ASSERT_VEC_EQ(d, v2, MulAdd(v1, k0, v2));
+    HWY_ASSERT_VEC_EQ(d, k0, NegMulAdd(k0, k0, k0));
+    HWY_ASSERT_VEC_EQ(d, v2, NegMulAdd(k0, v1, v2));
+    HWY_ASSERT_VEC_EQ(d, v2, NegMulAdd(v1, k0, v2));
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>((i + 1) * (i + 2));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v2, v1, k0));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v1, v2, k0));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(Neg(v2), v1, k0));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(v1, Neg(v2), k0));
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>((i + 2) * (i + 2) + (i + 1));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v2, v2, v1));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(Neg(v2), v2, v1));
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] =
+          T(-T(i + 2u) * static_cast<T>(i + 2) + static_cast<T>(1 + i));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(v2, v2, v1));
+
+    HWY_ASSERT_VEC_EQ(d, k0, MulSub(k0, k0, k0));
+    HWY_ASSERT_VEC_EQ(d, kNeg0, NegMulSub(k0, k0, k0));
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = -T(i + 2);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(k0, v1, v2));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v1, k0, v2));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(k0), v1, v2));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(v1, Neg(k0), v2));
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>((i + 1) * (i + 2));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v1, v2, k0));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v2, v1, k0));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(v1), v2, k0));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(v2, Neg(v1), k0));
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>((i + 2) * (i + 2) - (1 + i));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v2, v2, v1));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(v2), v2, v1));
+  }
+};
+
+HWY_NOINLINE void TestAllMulAdd() {
+  ForFloatTypes(ForPartialVectors<TestMulAdd>());
+}
+
+struct TestReorderWidenMulAccumulate {
+  template <typename TN, class DN>
+  HWY_NOINLINE void operator()(TN /*unused*/, DN dn) {
+    using TW = MakeWide<TN>;
+    const RepartitionToWide<DN> dw;
+    const Half<DN> dnh;
+    using VW = Vec<decltype(dw)>;
+    using VN = Vec<decltype(dn)>;
+    const size_t NN = Lanes(dn);
+
+    const VW f0 = Zero(dw);
+    const VW f1 = Set(dw, TW{1});
+    const VN bf0 = Zero(dn);
+    // Cannot Set() bfloat16_t directly.
+    const VN bf1 = ReorderDemote2To(dn, f1, f1);
+
+    // Any input zero => both outputs zero
+    VW sum1 = f0;
+    HWY_ASSERT_VEC_EQ(dw, f0,
+                      ReorderWidenMulAccumulate(dw, bf0, bf0, f0, sum1));
+    HWY_ASSERT_VEC_EQ(dw, f0, sum1);
+    HWY_ASSERT_VEC_EQ(dw, f0,
+                      ReorderWidenMulAccumulate(dw, bf0, bf1, f0, sum1));
+    HWY_ASSERT_VEC_EQ(dw, f0, sum1);
+    HWY_ASSERT_VEC_EQ(dw, f0,
+                      ReorderWidenMulAccumulate(dw, bf1, bf0, f0, sum1));
+    HWY_ASSERT_VEC_EQ(dw, f0, sum1);
+
+    // delta[p] := 1, all others zero. For each p: Dot(delta, all-ones) == 1.
+    auto delta_w = AllocateAligned<TW>(NN);
+    for (size_t p = 0; p < NN; ++p) {
+      // Workaround for incorrect Clang wasm codegen: re-initialize the entire
+      // array rather than zero-initialize once and then toggle lane p.
+      for (size_t i = 0; i < NN; ++i) {
+        delta_w[i] = static_cast<TW>(i == p);
+      }
+      const VW delta0 = Load(dw, delta_w.get());
+      const VW delta1 = Load(dw, delta_w.get() + NN / 2);
+      const VN delta = ReorderDemote2To(dn, delta0, delta1);
+
+      {
+        sum1 = f0;
+        const VW sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, f0, sum1);
+        HWY_ASSERT_EQ(TW{1}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
+      }
+      // Swapped arg order
+      {
+        sum1 = f0;
+        const VW sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, f0, sum1);
+        HWY_ASSERT_EQ(TW{1}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
+      }
+      // Start with nonzero sum0 or sum1
+      {
+        VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta));
+        sum1 = PromoteTo(dw, UpperHalf(dnh, delta));
+        sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, sum0, sum1);
+        HWY_ASSERT_EQ(TW{2}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
+      }
+      // Start with nonzero sum0 or sum1, and swap arg order
+      {
+        VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta));
+        sum1 = PromoteTo(dw, UpperHalf(dnh, delta));
+        sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, sum0, sum1);
+        HWY_ASSERT_EQ(TW{2}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
+      }
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllReorderWidenMulAccumulate() {
+  ForShrinkableVectors<TestReorderWidenMulAccumulate>()(bfloat16_t());
+  ForShrinkableVectors<TestReorderWidenMulAccumulate>()(int16_t());
+}
+
+struct TestRearrangeToOddPlusEven {
+  template <typename TN, class DN>
+  HWY_NOINLINE void operator()(TN /*unused*/, DN dn) {
+    using TW = MakeWide<TN>;
+    const RebindToUnsigned<DN> du;
+    const RepartitionToWide<DN> dw;
+    const Half<DN> dnh;
+    const RebindToUnsigned<decltype(dnh)> duh;
+    using VW = Vec<decltype(dw)>;
+    using VN = Vec<decltype(dn)>;
+    const size_t NW = Lanes(dw);
+
+    const VW up0 = Iota(dw, TW{1});
+    const VW up1 = Iota(dw, static_cast<TW>(1 + NW));
+    // We will compute i * (N-i) to avoid per-lane overflow.
+    const VW down0 = Reverse(dw, up1);
+    const VW down1 = Reverse(dw, up0);
+
+    // Combine is not available for bf16, so cast to u16.
+    const auto a0 = BitCast(duh, DemoteTo(dnh, up0));
+    const auto a1 = BitCast(duh, DemoteTo(dnh, up1));
+    const VN a = BitCast(dn, Combine(du, a1, a0));
+    const auto b0 = BitCast(duh, DemoteTo(dnh, down0));
+    const auto b1 = BitCast(duh, DemoteTo(dnh, down1));
+    const VN b = BitCast(dn, Combine(du, b1, b0));
+
+    const auto expected = AllocateAligned<TW>(NW);
+    for (size_t iw = 0; iw < NW; ++iw) {
+      const size_t in = iw * 2;  // even, odd is +1
+      const size_t a0 = 1 + in;
+      const size_t b0 = 1 + 2 * NW - a0;
+      const size_t a1 = a0 + 1;
+      const size_t b1 = b0 - 1;
+      expected[iw] = static_cast<TW>(a0 * b0 + a1 * b1);
+    }
+
+    VW sum1 = Zero(dw);
+    const VW sum0 = ReorderWidenMulAccumulate(dw, a, b, Zero(dw), sum1);
+    const VW sum_odd_even = RearrangeToOddPlusEven(sum0, sum1);
+    HWY_ASSERT_VEC_EQ(dw, expected.get(), sum_odd_even);
+  }
+};
+
+HWY_NOINLINE void TestAllRearrangeToOddPlusEven() {
+  ForShrinkableVectors<TestRearrangeToOddPlusEven>()(bfloat16_t());
+  ForShrinkableVectors<TestRearrangeToOddPlusEven>()(int16_t());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyMulTest);
+HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMul);
+HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulHigh);
+HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulFixedPoint15);
+HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulEven);
+HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulAdd);
+HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllReorderWidenMulAccumulate);
+HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllRearrangeToOddPlusEven);
+
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/reduction_test.cc b/third_party/highway/hwy/tests/reduction_test.cc
new file mode 100644
index 0000000000..5cc051ef1c
--- /dev/null
+++ b/third_party/highway/hwy/tests/reduction_test.cc
@@ -0,0 +1,261 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/reduction_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestSumOfLanes {
+  template <typename T, size_t N, int P,
+            hwy::EnableIf<!IsSigned<T>() || ((N & 1) != 0)>* = nullptr>
+  HWY_NOINLINE void SignedEvenLengthVectorTests(Simd<T, N, P>) {
+    // do nothing
+  }
+  template <typename T, size_t N, int P,
+            hwy::EnableIf<IsSigned<T>() && ((N & 1) == 0)>* = nullptr>
+  HWY_NOINLINE void SignedEvenLengthVectorTests(Simd<T, N, P> d) {
+    const T pairs = static_cast<T>(Lanes(d) / 2);
+
+    // Lanes are the repeated sequence -2, 1, [...]; each pair sums to -1,
+    // so the eventual total is just -(N/2).
+    Vec<decltype(d)> v =
+        InterleaveLower(Set(d, static_cast<T>(-2)), Set(d, T{1}));
+    HWY_ASSERT_VEC_EQ(d, Set(d, static_cast<T>(-pairs)), SumOfLanes(d, v));
+
+    // Similar test with a positive result.
+    v = InterleaveLower(Set(d, static_cast<T>(-2)), Set(d, T{4}));
+    HWY_ASSERT_VEC_EQ(d, Set(d, static_cast<T>(pairs * 2)), SumOfLanes(d, v));
+  }
+
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto in_lanes = AllocateAligned<T>(N);
+
+    // Lane i = bit i, higher lanes 0
+    double sum = 0.0;
+    // Avoid setting sign bit and cap at double precision
+    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
+    for (size_t i = 0; i < N; ++i) {
+      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
+      sum += static_cast<double>(in_lanes[i]);
+    }
+    HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)),
+                      SumOfLanes(d, Load(d, in_lanes.get())));
+
+    // Lane i = i (iota) to include upper lanes
+    sum = 0.0;
+    for (size_t i = 0; i < N; ++i) {
+      sum += static_cast<double>(i);
+    }
+    HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(d, Iota(d, 0)));
+
+    // Run more tests only for signed types with even vector lengths. Some of
+    // this code may not otherwise compile, so put it in a templated function.
+    SignedEvenLengthVectorTests(d);
+  }
+};
+
+HWY_NOINLINE void TestAllSumOfLanes() {
+  ForUIF3264(ForPartialVectors<TestSumOfLanes>());
+  ForUI16(ForPartialVectors<TestSumOfLanes>());
+
+#if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_SSE4 || HWY_TARGET == HWY_SSSE3
+  ForUI8(ForGEVectors<64, TestSumOfLanes>());
+#endif
+}
+
+struct TestMinOfLanes {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto in_lanes = AllocateAligned<T>(N);
+
+    // Lane i = bit i, higher lanes = 2 (not the minimum)
+    T min = HighestValue<T>();
+    // Avoid setting sign bit and cap at double precision
+    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
+    for (size_t i = 0; i < N; ++i) {
+      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 2;
+      min = HWY_MIN(min, in_lanes[i]);
+    }
+    HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
+
+    // Lane i = N - i to include upper lanes
+    min = HighestValue<T>();
+    for (size_t i = 0; i < N; ++i) {
+      in_lanes[i] = static_cast<T>(N - i);  // no 8-bit T so no wraparound
+      min = HWY_MIN(min, in_lanes[i]);
+    }
+    HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
+
+    // Bug #910: also check negative values
+    min = HighestValue<T>();
+    const T input_copy[] = {static_cast<T>(-1),
+                            static_cast<T>(-2),
+                            1,
+                            2,
+                            3,
+                            4,
+                            5,
+                            6,
+                            7,
+                            8,
+                            9,
+                            10,
+                            11,
+                            12,
+                            13,
+                            14};
+    size_t i = 0;
+    for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) {
+      in_lanes[i] = input_copy[i];
+      min = HWY_MIN(min, input_copy[i]);
+    }
+    // Pad with neutral element to full vector (so we can load)
+    for (; i < N; ++i) {
+      in_lanes[i] = min;
+    }
+    HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
+  }
+};
+
+struct TestMaxOfLanes {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto in_lanes = AllocateAligned<T>(N);
+
+    T max = LowestValue<T>();
+    // Avoid setting sign bit and cap at double precision
+    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
+    for (size_t i = 0; i < N; ++i) {
+      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
+      max = HWY_MAX(max, in_lanes[i]);
+    }
+    HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
+
+    // Lane i = i to include upper lanes
+    max = LowestValue<T>();
+    for (size_t i = 0; i < N; ++i) {
+      in_lanes[i] = static_cast<T>(i);  // no 8-bit T so no wraparound
+      max = HWY_MAX(max, in_lanes[i]);
+    }
+    HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
+
+    // Bug #910: also check negative values
+    max = LowestValue<T>();
+    const T input_copy[] = {static_cast<T>(-1),
+                            static_cast<T>(-2),
+                            1,
+                            2,
+                            3,
+                            4,
+                            5,
+                            6,
+                            7,
+                            8,
+                            9,
+                            10,
+                            11,
+                            12,
+                            13,
+                            14};
+    size_t i = 0;
+    for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) {
+      in_lanes[i] = input_copy[i];
+      max = HWY_MAX(max, in_lanes[i]);
+    }
+    // Pad with neutral element to full vector (so we can load)
+    for (; i < N; ++i) {
+      in_lanes[i] = max;
+    }
+    HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
+  }
+};
+
+HWY_NOINLINE void TestAllMinMaxOfLanes() {
+  const ForPartialVectors<TestMinOfLanes> test_min;
+  const ForPartialVectors<TestMaxOfLanes> test_max;
+  ForUIF3264(test_min);
+  ForUIF3264(test_max);
+  ForUI16(test_min);
+  ForUI16(test_max);
+
+#if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_SSE4 || HWY_TARGET == HWY_SSSE3
+  ForUI8(ForGEVectors<64, TestMinOfLanes>());
+  ForUI8(ForGEVectors<64, TestMaxOfLanes>());
+#endif
+}
+
+struct TestSumsOf8 {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    const size_t N = Lanes(d);
+    if (N < 8) return;
+    const Repartition<uint64_t, D> du64;
+
+    auto in_lanes = AllocateAligned<T>(N);
+    auto sum_lanes = AllocateAligned<uint64_t>(N / 8);
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        in_lanes[i] = Random64(&rng) & 0xFF;
+      }
+
+      for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) {
+        uint64_t sum = 0;
+        for (size_t i = 0; i < 8; ++i) {
+          sum += in_lanes[idx_sum * 8 + i];
+        }
+        sum_lanes[idx_sum] = sum;
+      }
+
+      const Vec<D> in = Load(d, in_lanes.get());
+      HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8(in));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllSumsOf8() {
+  ForGEVectors<64, TestSumsOf8>()(uint8_t());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyReductionTest);
+HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumOfLanes);
+HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMinMaxOfLanes);
+HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf8);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/reverse_test.cc b/third_party/highway/hwy/tests/reverse_test.cc
new file mode 100644
index 0000000000..b1572c03fe
--- /dev/null
+++ b/third_party/highway/hwy/tests/reverse_test.cc
@@ -0,0 +1,186 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+
+#include "hwy/base.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/reverse_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestReverse {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
+    const auto v = BitCast(d, Iota(du, 1));
+    auto expected = AllocateAligned<T>(N);
+
+    // Can't set float16_t value directly, need to permute in memory.
+    auto copy = AllocateAligned<T>(N);
+    Store(v, d, copy.get());
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = copy[N - 1 - i];
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse(d, v));
+  }
+};
+
+struct TestReverse2 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
+    const auto v = BitCast(d, Iota(du, 1));
+    auto expected = AllocateAligned<T>(N);
+    if (N == 1) {
+      Store(v, d, expected.get());
+      HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse2(d, v));
+      return;
+    }
+
+    // Can't set float16_t value directly, need to permute in memory.
+    auto copy = AllocateAligned<T>(N);
+    Store(v, d, copy.get());
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = copy[i ^ 1];
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse2(d, v));
+  }
+};
+
+struct TestReverse4 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
+    const auto v = BitCast(d, Iota(du, 1));
+    auto expected = AllocateAligned<T>(N);
+
+    // Can't set float16_t value directly, need to permute in memory.
+    auto copy = AllocateAligned<T>(N);
+    Store(v, d, copy.get());
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = copy[i ^ 3];
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse4(d, v));
+  }
+};
+
+struct TestReverse8 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
+    const auto v = BitCast(d, Iota(du, 1));
+    auto expected = AllocateAligned<T>(N);
+
+    // Can't set float16_t value directly, need to permute in memory.
+    auto copy = AllocateAligned<T>(N);
+    Store(v, d, copy.get());
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = copy[i ^ 7];
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse8(d, v));
+  }
+};
+
+HWY_NOINLINE void TestAllReverse() {
+  // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
+  // which requires 16 bits.
+  ForUIF163264(ForPartialVectors<TestReverse>());
+}
+
+HWY_NOINLINE void TestAllReverse2() {
+  // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
+  // which requires 16 bits.
+  ForUIF64(ForGEVectors<128, TestReverse2>());
+  ForUIF32(ForGEVectors<64, TestReverse2>());
+  ForUIF16(ForGEVectors<32, TestReverse2>());
+
+#if HWY_TARGET == HWY_SSSE3
+  // Implemented mainly for internal use.
+  ForUI8(ForPartialVectors<TestReverse2>());
+#endif
+}
+
+HWY_NOINLINE void TestAllReverse4() {
+  // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
+  // which requires 16 bits.
+  ForUIF64(ForGEVectors<256, TestReverse4>());
+  ForUIF32(ForGEVectors<128, TestReverse4>());
+  ForUIF16(ForGEVectors<64, TestReverse4>());
+}
+
+HWY_NOINLINE void TestAllReverse8() {
+  // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
+  // which requires 16 bits.
+  ForUIF64(ForGEVectors<512, TestReverse8>());
+  ForUIF32(ForGEVectors<256, TestReverse8>());
+  ForUIF16(ForGEVectors<128, TestReverse8>());
+}
+
+struct TestReverseBlocks {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
+    const auto v = BitCast(d, Iota(du, 1));
+    auto expected = AllocateAligned<T>(N);
+
+    constexpr size_t kLanesPerBlock = 16 / sizeof(T);
+    const size_t num_blocks = N / kLanesPerBlock;
+    HWY_ASSERT(num_blocks != 0);
+
+    // Can't set float16_t value directly, need to permute in memory.
+    auto copy = AllocateAligned<T>(N);
+    Store(v, d, copy.get());
+    for (size_t i = 0; i < N; ++i) {
+      const size_t idx_block = i / kLanesPerBlock;
+      const size_t base = (num_blocks - 1 - idx_block) * kLanesPerBlock;
+      expected[i] = copy[base + (i % kLanesPerBlock)];
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ReverseBlocks(d, v));
+  }
+};
+
+HWY_NOINLINE void TestAllReverseBlocks() {
+  ForAllTypes(ForGEVectors<128, TestReverseBlocks>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyReverseTest);
+HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse);
+HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse2);
+HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse4);
+HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse8);
+HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverseBlocks);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/shift_test.cc b/third_party/highway/hwy/tests/shift_test.cc
new file mode 100644
index 0000000000..585eba761c
--- /dev/null
+++ b/third_party/highway/hwy/tests/shift_test.cc
@@ -0,0 +1,428 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <limits>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/shift_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <bool kSigned>
+struct TestLeftShifts {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+    if (kSigned) {
+      // Also test positive values
+      TestLeftShifts</*kSigned=*/false>()(t, d);
+    }
+
+    using TI = MakeSigned<T>;
+    using TU = MakeUnsigned<T>;
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+
+    // Values to shift
+    const auto values = Iota(d, static_cast<T>(kSigned ? -TI(N) : TI(0)));
+    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+
+    // 0
+    HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values));
+    HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0));
+
+    // 1
+    for (size_t i = 0; i < N; ++i) {
+      const T value = kSigned ? T(T(i) - T(N)) : T(i);
+      expected[i] = T(TU(value) << 1);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1));
+
+    // max
+    for (size_t i = 0; i < N; ++i) {
+      const T value = kSigned ? T(T(i) - T(N)) : T(i);
+      expected[i] = T(TU(value) << kMaxShift);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift));
+  }
+};
+
+template <bool kSigned>
+struct TestVariableLeftShifts {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+    if (kSigned) {
+      // Also test positive values
+      TestVariableLeftShifts</*kSigned=*/false>()(t, d);
+    }
+
+    using TI = MakeSigned<T>;
+    using TU = MakeUnsigned<T>;
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+
+    const auto v0 = Zero(d);
+    const auto v1 = Set(d, 1);
+    const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift
+
+    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+    const auto max_shift = Set(d, kMaxShift);
+    const auto small_shifts = And(Iota(d, 0), max_shift);
+    const auto large_shifts = max_shift - small_shifts;
+
+    // Same: 0
+    HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0));
+
+    // Same: 1
+    for (size_t i = 0; i < N; ++i) {
+      const T value = kSigned ? T(i) - T(N) : T(i);
+      expected[i] = T(TU(value) << 1);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1));
+
+    // Same: max
+    for (size_t i = 0; i < N; ++i) {
+      const T value = kSigned ? T(i) - T(N) : T(i);
+      expected[i] = T(TU(value) << kMaxShift);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift));
+
+    // Variable: small
+    for (size_t i = 0; i < N; ++i) {
+      const T value = kSigned ? T(i) - T(N) : T(i);
+      expected[i] = T(TU(value) << (i & kMaxShift));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, small_shifts));
+
+    // Variable: large
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = T(TU(1) << (kMaxShift - (i & kMaxShift)));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(v1, large_shifts));
+  }
+};
+
+struct TestUnsignedRightShifts {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+
+    const auto values = Iota(d, 0);
+
+    const T kMax = LimitsMax<T>();
+    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+
+    // Shift by 0
+    HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
+    HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
+
+    // Shift by 1
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = T(T(i & kMax) >> 1);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
+
+    // max
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = T(T(i & kMax) >> kMaxShift);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<kMaxShift>(values));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift));
+  }
+};
+
+struct TestRotateRight {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+
+    constexpr size_t kBits = sizeof(T) * 8;
+    const auto mask_shift = Set(d, T{kBits});
+    // Cover as many bit positions as possible to test shifting out
+    const auto values = Shl(Set(d, T{1}), And(Iota(d, 0), mask_shift));
+
+    // Rotate by 0
+    HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values));
+
+    // Rotate by 1
+    Store(values, d, expected.get());
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = (expected[i] >> 1) | (expected[i] << (kBits - 1));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values));
+
+    // Rotate by half
+    Store(values, d, expected.get());
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = (expected[i] >> (kBits / 2)) | (expected[i] << (kBits / 2));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values));
+
+    // Rotate by max
+    Store(values, d, expected.get());
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = (expected[i] >> (kBits - 1)) | (expected[i] << 1);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits - 1>(values));
+  }
+};
+
+struct TestVariableUnsignedRightShifts {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+
+    const auto v0 = Zero(d);
+    const auto v1 = Set(d, 1);
+    const auto values = Iota(d, 0);
+
+    const T kMax = LimitsMax<T>();
+    const auto max = Set(d, kMax);
+
+    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+    const auto max_shift = Set(d, kMaxShift);
+    const auto small_shifts = And(Iota(d, 0), max_shift);
+    const auto large_shifts = max_shift - small_shifts;
+
+    // Same: 0
+    HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0));
+
+    // Same: 1
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = T(T(i & kMax) >> 1);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1));
+
+    // Same: max
+    HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift));
+
+    // Variable: small
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = T(i) >> (i & kMaxShift);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, small_shifts));
+
+    // Variable: Large
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = kMax >> (kMaxShift - (i & kMaxShift));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(max, large_shifts));
+  }
+};
+
+template <int kAmount, typename T>
+T RightShiftNegative(T val) {
+  // C++ shifts are implementation-defined for negative numbers, and we have
+  // seen divisions replaced with shifts, so resort to bit operations.
+  using TU = hwy::MakeUnsigned<T>;
+  TU bits;
+  CopySameSize(&val, &bits);
+
+  const TU shifted = TU(bits >> kAmount);
+
+  const TU all = TU(~TU(0));
+  const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount;
+  const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());
+
+  bits = shifted | sign_extended;
+  CopySameSize(&bits, &val);
+  return val;
+}
+
+class TestSignedRightShifts {
+ public:
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    constexpr T kMin = LimitsMin<T>();
+    constexpr T kMax = LimitsMax<T>();
+    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+
+    // First test positive values, negative are checked below.
+    const auto v0 = Zero(d);
+    const auto values = And(Iota(d, 0), Set(d, kMax));
+
+    // Shift by 0
+    HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
+    HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
+
+    // Shift by 1
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = T(T(i & kMax) >> 1);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
+
+    // max
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift));
+
+    // Even negative value
+    Test<0>(kMin, d, __LINE__);
+    Test<1>(kMin, d, __LINE__);
+    Test<2>(kMin, d, __LINE__);
+    Test<kMaxShift>(kMin, d, __LINE__);
+
+    const T odd = static_cast<T>(kMin + 1);
+    Test<0>(odd, d, __LINE__);
+    Test<1>(odd, d, __LINE__);
+    Test<2>(odd, d, __LINE__);
+    Test<kMaxShift>(odd, d, __LINE__);
+  }
+
+ private:
+  template <int kAmount, typename T, class D>
+  void Test(T val, D d, int line) {
+    const auto expected = Set(d, RightShiftNegative<kAmount>(val));
+    const auto in = Set(d, val);
+    const char* file = __FILE__;
+    AssertVecEqual(d, expected, ShiftRight<kAmount>(in), file, line);
+    AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line);
+  }
+};
+
+struct TestVariableSignedRightShifts {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using TU = MakeUnsigned<T>;
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+
+    constexpr T kMin = LimitsMin<T>();
+    constexpr T kMax = LimitsMax<T>();
+
+    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+
+    // First test positive values, negative are checked below.
+    const auto v0 = Zero(d);
+    const auto positive = Iota(d, 0) & Set(d, kMax);
+
+    // Shift by 0
+    HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive));
+    HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0));
+
+    // Shift by 1
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = T(T(i & kMax) >> 1);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1));
+
+    // max
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(positive));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift));
+
+    const auto max_shift = Set(d, kMaxShift);
+    const auto small_shifts = And(Iota(d, 0), max_shift);
+    const auto large_shifts = max_shift - small_shifts;
+
+    const auto negative = Iota(d, kMin);
+
+    // Test varying negative to shift
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = RightShiftNegative<1>(static_cast<T>(kMin + i));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1)));
+
+    // Shift MSB right by small amounts
+    for (size_t i = 0; i < N; ++i) {
+      const size_t amount = i & kMaxShift;
+      const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
+      CopySameSize(&shifted, &expected[i]);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), small_shifts));
+
+    // Shift MSB right by large amounts
+    for (size_t i = 0; i < N; ++i) {
+      const size_t amount = kMaxShift - (i & kMaxShift);
+      const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
+      CopySameSize(&shifted, &expected[i]);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), large_shifts));
+  }
+};
+
+HWY_NOINLINE void TestAllShifts() {
+  ForUnsignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/false>>());
+  ForSignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/true>>());
+  ForUnsignedTypes(ForPartialVectors<TestUnsignedRightShifts>());
+  ForSignedTypes(ForPartialVectors<TestSignedRightShifts>());
+}
+
+HWY_NOINLINE void TestAllVariableShifts() {
+  const ForPartialVectors<TestLeftShifts</*kSigned=*/false>> shl_u;
+  const ForPartialVectors<TestLeftShifts</*kSigned=*/true>> shl_s;
+  const ForPartialVectors<TestUnsignedRightShifts> shr_u;
+  const ForPartialVectors<TestSignedRightShifts> shr_s;
+
+  shl_u(uint16_t());
+  shr_u(uint16_t());
+
+  shl_u(uint32_t());
+  shr_u(uint32_t());
+
+  shl_s(int16_t());
+  shr_s(int16_t());
+
+  shl_s(int32_t());
+  shr_s(int32_t());
+
+#if HWY_HAVE_INTEGER64
+  shl_u(uint64_t());
+  shr_u(uint64_t());
+
+  shl_s(int64_t());
+  shr_s(int64_t());
+#endif
+}
+
+HWY_NOINLINE void TestAllRotateRight() {
+  const ForPartialVectors<TestRotateRight> test;
+  test(uint32_t());
+#if HWY_HAVE_INTEGER64
+  test(uint64_t());
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyShiftTest);
+HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllShifts);
+HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllVariableShifts);
+HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllRotateRight);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/swizzle_test.cc b/third_party/highway/hwy/tests/swizzle_test.cc
new file mode 100644
index 0000000000..f447f7a800
--- /dev/null
+++ b/third_party/highway/hwy/tests/swizzle_test.cc
@@ -0,0 +1,272 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <string.h>  // memset
+
+#include "hwy/base.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/swizzle_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestGetLane {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v = Iota(d, T(1));
+    HWY_ASSERT_EQ(T(1), GetLane(v));
+  }
+};
+
+HWY_NOINLINE void TestAllGetLane() {
+  ForAllTypes(ForPartialVectors<TestGetLane>());
+}
+
+struct TestExtractLane {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v = Iota(d, T(1));
+    for (size_t i = 0; i < Lanes(d); ++i) {
+      const T actual = ExtractLane(v, i);
+      HWY_ASSERT_EQ(static_cast<T>(i + 1), actual);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllExtractLane() {
+  ForAllTypes(ForPartialVectors<TestExtractLane>());
+}
+
+struct TestInsertLane {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using V = Vec<D>;
+    const V v = Iota(d, T(1));
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+    Store(v, d, lanes.get());
+
+    for (size_t i = 0; i < Lanes(d); ++i) {
+      lanes[i] = T{0};
+      const V actual = InsertLane(v, i, static_cast<T>(i + 1));
+      HWY_ASSERT_VEC_EQ(d, v, actual);
+      Store(v, d, lanes.get());  // restore lane i
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllInsertLane() {
+  ForAllTypes(ForPartialVectors<TestInsertLane>());
+}
+
+struct TestDupEven {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>((static_cast<int>(i) & ~1) + 1);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), DupEven(Iota(d, 1)));
+  }
+};
+
+HWY_NOINLINE void TestAllDupEven() {
+  ForUIF3264(ForShrinkableVectors<TestDupEven>());
+}
+
+struct TestDupOdd {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>((static_cast<int>(i) & ~1) + 2);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), DupOdd(Iota(d, 1)));
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllDupOdd() {
+  ForUIF3264(ForShrinkableVectors<TestDupOdd>());
+}
+
+struct TestOddEven {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const auto even = Iota(d, 1);
+    const auto odd = Iota(d, static_cast<T>(1 + N));
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>(1 + i + ((i & 1) ? N : 0));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), OddEven(odd, even));
+  }
+};
+
+HWY_NOINLINE void TestAllOddEven() {
+  ForAllTypes(ForShrinkableVectors<TestOddEven>());
+}
+
+struct TestOddEvenBlocks {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const auto even = Iota(d, 1);
+    const auto odd = Iota(d, static_cast<T>(1 + N));
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      const size_t idx_block = i / (16 / sizeof(T));
+      expected[i] = static_cast<T>(1 + i + ((idx_block & 1) ? N : 0));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), OddEvenBlocks(odd, even));
+  }
+};
+
+HWY_NOINLINE void TestAllOddEvenBlocks() {
+  ForAllTypes(ForGEVectors<128, TestOddEvenBlocks>());
+}
+
+struct TestSwapAdjacentBlocks {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    constexpr size_t kLanesPerBlock = 16 / sizeof(T);
+    if (N < 2 * kLanesPerBlock) return;
+    const auto vi = Iota(d, 1);
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      const size_t idx_block = i / kLanesPerBlock;
+      const size_t base = (idx_block ^ 1) * kLanesPerBlock;
+      const size_t mod = i % kLanesPerBlock;
+      expected[i] = static_cast<T>(1 + base + mod);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), SwapAdjacentBlocks(vi));
+  }
+};
+
+HWY_NOINLINE void TestAllSwapAdjacentBlocks() {
+  ForAllTypes(ForGEVectors<128, TestSwapAdjacentBlocks>());
+}
+
+struct TestTableLookupLanes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const RebindToSigned<D> di;
+    using TI = TFromD<decltype(di)>;
+#if HWY_TARGET != HWY_SCALAR
+    const size_t N = Lanes(d);
+    auto idx = AllocateAligned<TI>(N);
+    memset(idx.get(), 0, N * sizeof(TI));
+    auto expected = AllocateAligned<T>(N);
+    const auto v = Iota(d, 1);
+
+    if (N <= 8) {  // Test all permutations
+      for (size_t i0 = 0; i0 < N; ++i0) {
+        idx[0] = static_cast<TI>(i0);
+
+        for (size_t i1 = 0; i1 < N; ++i1) {
+          if (N >= 2) idx[1] = static_cast<TI>(i1);
+          for (size_t i2 = 0; i2 < N; ++i2) {
+            if (N >= 4) idx[2] = static_cast<TI>(i2);
+            for (size_t i3 = 0; i3 < N; ++i3) {
+              if (N >= 4) idx[3] = static_cast<TI>(i3);
+
+              for (size_t i = 0; i < N; ++i) {
+                expected[i] = static_cast<T>(idx[i] + 1);  // == v[idx[i]]
+              }
+
+              const auto opaque1 = IndicesFromVec(d, Load(di, idx.get()));
+              const auto actual1 = TableLookupLanes(v, opaque1);
+              HWY_ASSERT_VEC_EQ(d, expected.get(), actual1);
+
+              const auto opaque2 = SetTableIndices(d, idx.get());
+              const auto actual2 = TableLookupLanes(v, opaque2);
+              HWY_ASSERT_VEC_EQ(d, expected.get(), actual2);
+            }
+          }
+        }
+      }
+    } else {
+      // Too many permutations to test exhaustively; choose one with repeated
+      // and cross-block indices and ensure indices do not exceed #lanes.
+      // For larger vectors, upper lanes will be zero.
+      HWY_ALIGN TI idx_source[16] = {1,  3,  2,  2,  8, 1, 7, 6,
+                                     15, 14, 14, 15, 4, 9, 8, 5};
+      for (size_t i = 0; i < N; ++i) {
+        idx[i] = (i < 16) ? idx_source[i] : 0;
+        // Avoid undefined results / asan error for scalar by capping indices.
+        if (idx[i] >= static_cast<TI>(N)) {
+          idx[i] = static_cast<TI>(N - 1);
+        }
+        expected[i] = static_cast<T>(idx[i] + 1);  // == v[idx[i]]
+      }
+
+      const auto opaque1 = IndicesFromVec(d, Load(di, idx.get()));
+      const auto actual1 = TableLookupLanes(v, opaque1);
+      HWY_ASSERT_VEC_EQ(d, expected.get(), actual1);
+
+      const auto opaque2 = SetTableIndices(d, idx.get());
+      const auto actual2 = TableLookupLanes(v, opaque2);
+      HWY_ASSERT_VEC_EQ(d, expected.get(), actual2);
+    }
+#else
+    const TI index = 0;
+    const auto v = Set(d, 1);
+    const auto opaque1 = SetTableIndices(d, &index);
+    HWY_ASSERT_VEC_EQ(d, v, TableLookupLanes(v, opaque1));
+    const auto opaque2 = IndicesFromVec(d, Zero(di));
+    HWY_ASSERT_VEC_EQ(d, v, TableLookupLanes(v, opaque2));
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllTableLookupLanes() {
+  ForUIF3264(ForPartialVectors<TestTableLookupLanes>());
+}
+
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwySwizzleTest);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllGetLane);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllExtractLane);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllInsertLane);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupEven);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupOdd);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEvenBlocks);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllSwapAdjacentBlocks);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupLanes);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/test_util-inl.h b/third_party/highway/hwy/tests/test_util-inl.h
new file mode 100644
index 0000000000..972b3361e0
--- /dev/null
+++ b/third_party/highway/hwy/tests/test_util-inl.h
@@ -0,0 +1,665 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Target-specific helper functions for use by *_test.cc.
+
+#include <stdint.h>
+
+#include "hwy/base.h"
+#include "hwy/tests/hwy_gtest.h"
+#include "hwy/tests/test_util.h"
+
+// After test_util (also includes highway.h)
+#include "hwy/print-inl.h"
+
+// Per-target include guard
+#if defined(HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
+#undef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
+#else
+#define HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Compare expected vector to vector.
+// HWY_INLINE works around a Clang SVE compiler bug where all but the first
+// 128 bits (the NEON register) of actual are zero.
+template <class D, typename T = TFromD<D>, class V = Vec<D>>
+HWY_INLINE void AssertVecEqual(D d, const T* expected, VecArg<V> actual,
+                               const char* filename, const int line) {
+  const size_t N = Lanes(d);
+  auto actual_lanes = AllocateAligned<T>(N);
+  Store(actual, d, actual_lanes.get());
+
+  const auto info = hwy::detail::MakeTypeInfo<T>();
+  const char* target_name = hwy::TargetName(HWY_TARGET);
+  hwy::detail::AssertArrayEqual(info, expected, actual_lanes.get(), N,
+                                target_name, filename, line);
+}
+
+// Compare expected lanes to vector.
+// HWY_INLINE works around a Clang SVE compiler bug where all but the first
+// 128 bits (the NEON register) of actual are zero.
+template <class D, typename T = TFromD<D>, class V = Vec<D>>
+HWY_INLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual,
+                               const char* filename, int line) {
+  auto expected_lanes = AllocateAligned<T>(Lanes(d));
+  Store(expected, d, expected_lanes.get());
+  AssertVecEqual(d, expected_lanes.get(), actual, filename, line);
+}
+
+// Only checks the valid mask elements (those whose index < Lanes(d)).
+template <class D>
+HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
+                                  const char* filename, int line) {
+  // lvalues prevented MSAN failure in farm_sve.
+  const Vec<D> va = VecFromMask(d, a);
+  const Vec<D> vb = VecFromMask(d, b);
+  AssertVecEqual(d, va, vb, filename, line);
+
+  const char* target_name = hwy::TargetName(HWY_TARGET);
+  AssertEqual(CountTrue(d, a), CountTrue(d, b), target_name, filename, line);
+  AssertEqual(AllTrue(d, a), AllTrue(d, b), target_name, filename, line);
+  AssertEqual(AllFalse(d, a), AllFalse(d, b), target_name, filename, line);
+
+  const size_t N = Lanes(d);
+#if HWY_TARGET == HWY_SCALAR
+  const Rebind<uint8_t, D> d8;
+#else
+  const Repartition<uint8_t, D> d8;
+#endif
+  const size_t N8 = Lanes(d8);
+  auto bits_a = AllocateAligned<uint8_t>(HWY_MAX(size_t{8}, N8));
+  auto bits_b = AllocateAligned<uint8_t>(size_t{HWY_MAX(8, N8)});
+  memset(bits_a.get(), 0, N8);
+  memset(bits_b.get(), 0, N8);
+  const size_t num_bytes_a = StoreMaskBits(d, a, bits_a.get());
+  const size_t num_bytes_b = StoreMaskBits(d, b, bits_b.get());
+  AssertEqual(num_bytes_a, num_bytes_b, target_name, filename, line);
+  size_t i = 0;
+  // First check whole bytes (if that many elements are still valid)
+  for (; i < N / 8; ++i) {
+    if (bits_a[i] != bits_b[i]) {
+      fprintf(stderr, "Mismatch in byte %d: %d != %d\n", static_cast<int>(i),
+              bits_a[i], bits_b[i]);
+      Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
+      Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
+      hwy::Abort(filename, line, "Masks not equal");
+    }
+  }
+  // Then the valid bit(s) in the last byte.
+  const size_t remainder = N % 8;
+  if (remainder != 0) {
+    const int mask = (1 << remainder) - 1;
+    const int valid_a = bits_a[i] & mask;
+    const int valid_b = bits_b[i] & mask;
+    if (valid_a != valid_b) {
+      fprintf(stderr, "Mismatch in last byte %d: %d != %d\n",
+              static_cast<int>(i), valid_a, valid_b);
+      Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
+      Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
+      hwy::Abort(filename, line, "Masks not equal");
+    }
+  }
+}
+
+// Only sets valid elements (those whose index < Lanes(d)). This helps catch
+// tests that are not masking off the (undefined) upper mask elements.
+//
+// TODO(janwas): with HWY_NOINLINE GCC zeros the upper half of AVX2 masks.
+template <class D>
+HWY_INLINE Mask<D> MaskTrue(const D d) {
+  return FirstN(d, Lanes(d));
+}
+
+template <class D>
+HWY_INLINE Mask<D> MaskFalse(const D d) {
+  const auto zero = Zero(RebindToSigned<D>());
+  return RebindMask(d, Lt(zero, zero));
+}
+
+#ifndef HWY_ASSERT_EQ
+
+#define HWY_ASSERT_EQ(expected, actual)                                     \
+  hwy::AssertEqual(expected, actual, hwy::TargetName(HWY_TARGET), __FILE__, \
+                   __LINE__)
+
+#define HWY_ASSERT_ARRAY_EQ(expected, actual, count)                          \
+  hwy::AssertArrayEqual(expected, actual, count, hwy::TargetName(HWY_TARGET), \
+                        __FILE__, __LINE__)
+
+#define HWY_ASSERT_STRING_EQ(expected, actual)                          \
+  hwy::AssertStringEqual(expected, actual, hwy::TargetName(HWY_TARGET), \
+                         __FILE__, __LINE__)
+
+#define HWY_ASSERT_VEC_EQ(d, expected, actual) \
+  AssertVecEqual(d, expected, actual, __FILE__, __LINE__)
+
+#define HWY_ASSERT_MASK_EQ(d, expected, actual) \
+  AssertMaskEqual(d, expected, actual, __FILE__, __LINE__)
+
+#endif  // HWY_ASSERT_EQ
+
+namespace detail {
+
+// Helpers for instantiating tests with combinations of lane types / counts.
+
+// Calls Test for each CappedTag<T, N> where N is in [kMinLanes, kMul * kMinArg]
+// and the resulting Lanes() is in [min_lanes, max_lanes]. The upper bound
+// is required to ensure capped vectors remain extendable. Implemented by
+// recursively halving kMul until it is zero.
+template <typename T, size_t kMul, size_t kMinArg, class Test>
+struct ForeachCappedR {
+  static void Do(size_t min_lanes, size_t max_lanes) {
+    const CappedTag<T, kMul * kMinArg> d;
+
+    // If we already don't have enough lanes, stop.
+    const size_t lanes = Lanes(d);
+    if (lanes < min_lanes) return;
+
+    if (lanes <= max_lanes) {
+      Test()(T(), d);
+    }
+    ForeachCappedR<T, kMul / 2, kMinArg, Test>::Do(min_lanes, max_lanes);
+  }
+};
+
+// Base case to stop the recursion.
+template <typename T, size_t kMinArg, class Test>
+struct ForeachCappedR<T, 0, kMinArg, Test> {
+  static void Do(size_t, size_t) {}
+};
+
+#if HWY_HAVE_SCALABLE
+
+template <typename T>
+constexpr int MinPow2() {
+  // Highway follows RVV LMUL in that the smallest fraction is 1/8th (encoded
+  // as kPow2 == -3). The fraction also must not result in zero lanes for the
+  // smallest possible vector size, which is 128 bits even on RISC-V (with the
+  // application processor profile).
+  return HWY_MAX(-3, -static_cast<int>(CeilLog2(16 / sizeof(T))));
+}
+
+// Iterates kPow2 upward through +3.
+template <typename T, int kPow2, int kAddPow2, class Test>
+struct ForeachShiftR {
+  static void Do(size_t min_lanes) {
+    const ScalableTag<T, kPow2 + kAddPow2> d;
+
+    // Precondition: [kPow2, 3] + kAddPow2 is a valid fraction of the minimum
+    // vector size, so we always have enough lanes, except ForGEVectors.
+    if (Lanes(d) >= min_lanes) {
+      Test()(T(), d);
+    } else {
+      fprintf(stderr, "%d lanes < %d: T=%d pow=%d\n",
+              static_cast<int>(Lanes(d)), static_cast<int>(min_lanes),
+              static_cast<int>(sizeof(T)), kPow2 + kAddPow2);
+      HWY_ASSERT(min_lanes != 1);
+    }
+
+    ForeachShiftR<T, kPow2 + 1, kAddPow2, Test>::Do(min_lanes);
+  }
+};
+
+// Base case to stop the recursion.
+template <typename T, int kAddPow2, class Test>
+struct ForeachShiftR<T, 4, kAddPow2, Test> {
+  static void Do(size_t) {}
+};
+#else
+// ForeachCappedR already handled all possible sizes.
+#endif  // HWY_HAVE_SCALABLE
+
+}  // namespace detail
+
+// These 'adapters' call a test for all possible N or kPow2 subject to
+// constraints such as "vectors must be extendable" or "vectors >= 128 bits".
+// They may be called directly, or via For*Types. Note that for an adapter C,
+// `C<Test>(T())` does not call the test - the correct invocation is
+// `C<Test>()(T())`, or preferably `ForAllTypes(C<Test>())`. We check at runtime
+// that operator() is called to prevent such bugs. Note that this is not
+// thread-safe, but that is fine because C are typically local variables.
+
+// Calls Test for all power of two N in [1, Lanes(d) >> kPow2]. This is for
+// ops that widen their input, e.g. Combine (not supported by HWY_SCALAR).
+template <class Test, int kPow2 = 1>
+class ForExtendableVectors {
+  mutable bool called_ = false;
+
+ public:
+  ~ForExtendableVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    called_ = true;
+    constexpr size_t kMaxCapped = HWY_LANES(T);
+    // Skip CappedTag that are already full vectors.
+    const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
+    (void)kMaxCapped;
+    (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+    // not supported
+#else
+    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(1, max_lanes);
+#if HWY_TARGET == HWY_RVV
+    // For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, -kPow2, Test>::Do(1);
+#elif HWY_HAVE_SCALABLE
+    // For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -kPow2 - 3,
+                          Test>::Do(1);
+#endif
+#endif  // HWY_SCALAR
+  }
+};
+
+// Calls Test for all power of two N in [1 << kPow2, Lanes(d)]. This is for ops
+// that narrow their input, e.g. UpperHalf.
+template <class Test, int kPow2 = 1>
+class ForShrinkableVectors {
+  mutable bool called_ = false;
+
+ public:
+  ~ForShrinkableVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    called_ = true;
+    constexpr size_t kMinLanes = size_t{1} << kPow2;
+    constexpr size_t kMaxCapped = HWY_LANES(T);
+    // For shrinking, an upper limit is unnecessary.
+    constexpr size_t max_lanes = kMaxCapped;
+
+    (void)kMinLanes;
+    (void)max_lanes;
+    (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+    // not supported
+#else
+    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(
+        kMinLanes, max_lanes);
+#if HWY_TARGET == HWY_RVV
+    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, 0, Test>::Do(
+        kMinLanes);
+#elif HWY_HAVE_SCALABLE
+    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -3, Test>::Do(
+        kMinLanes);
+#endif
+#endif  // HWY_TARGET == HWY_SCALAR
+  }
+};
+
+// Calls Test for all supported power of two vectors of at least kMinBits.
+// Examples: AES or 64x64 require 128 bits, casts may require 64 bits.
+template <size_t kMinBits, class Test>
+class ForGEVectors {
+  mutable bool called_ = false;
+
+ public:
+  ~ForGEVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    called_ = true;
+    constexpr size_t kMaxCapped = HWY_LANES(T);
+    constexpr size_t kMinLanes = kMinBits / 8 / sizeof(T);
+    // An upper limit is unnecessary.
+    constexpr size_t max_lanes = kMaxCapped;
+    (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+    (void)kMinLanes;  // not supported
+#else
+    detail::ForeachCappedR<T, HWY_LANES(T) / kMinLanes, kMinLanes, Test>::Do(
+        kMinLanes, max_lanes);
+#if HWY_TARGET == HWY_RVV
+    // Can be 0 (handled below) if kMinBits > 64.
+    constexpr size_t kRatio = 128 / kMinBits;
+    constexpr int kMinPow2 =
+        kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio));
+    // For each [kMinPow2, 3]; counter is [kMinPow2, 3].
+    detail::ForeachShiftR<T, kMinPow2, 0, Test>::Do(kMinLanes);
+#elif HWY_HAVE_SCALABLE
+    // Can be 0 (handled below) if kMinBits > 128.
+    constexpr size_t kRatio = 128 / kMinBits;
+    constexpr int kMinPow2 =
+        kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio));
+    // For each [kMinPow2, 0]; counter is [kMinPow2 + 3, 3].
+    detail::ForeachShiftR<T, kMinPow2 + 3, -3, Test>::Do(kMinLanes);
+#endif
+#endif  // HWY_TARGET == HWY_SCALAR
+  }
+};
+
+template <class Test>
+using ForGE128Vectors = ForGEVectors<128, Test>;
+
+// Calls Test for all N that can be promoted (not the same as Extendable because
+// HWY_SCALAR has one lane). Also used for ZipLower, but not ZipUpper.
+template <class Test, int kPow2 = 1>
+class ForPromoteVectors {
+  mutable bool called_ = false;
+
+ public:
+  ~ForPromoteVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    called_ = true;
+    constexpr size_t kFactor = size_t{1} << kPow2;
+    static_assert(kFactor >= 2 && kFactor * sizeof(T) <= sizeof(uint64_t), "");
+    constexpr size_t kMaxCapped = HWY_LANES(T);
+    constexpr size_t kMinLanes = kFactor;
+    // Skip CappedTag that are already full vectors.
+    const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
+    (void)kMaxCapped;
+    (void)kMinLanes;
+    (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+    detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
+#else
+    // TODO(janwas): call Extendable if kMinLanes check not required?
+    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(kMinLanes,
+                                                                  max_lanes);
+#if HWY_TARGET == HWY_RVV
+    // For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, -kPow2, Test>::Do(
+        kMinLanes);
+#elif HWY_HAVE_SCALABLE
+    // For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -kPow2 - 3,
+                          Test>::Do(kMinLanes);
+#endif
+#endif  // HWY_SCALAR
+  }
+};
+
+// Calls Test for all N than can be demoted (not the same as Shrinkable because
+// HWY_SCALAR has one lane).
+template <class Test, int kPow2 = 1>
+class ForDemoteVectors {
+  mutable bool called_ = false;
+
+ public:
+  ~ForDemoteVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    called_ = true;
+    constexpr size_t kMinLanes = size_t{1} << kPow2;
+    constexpr size_t kMaxCapped = HWY_LANES(T);
+    // For shrinking, an upper limit is unnecessary.
+    constexpr size_t max_lanes = kMaxCapped;
+
+    (void)kMinLanes;
+    (void)max_lanes;
+    (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+    detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
+#else
+    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(
+        kMinLanes, max_lanes);
+
+// TODO(janwas): call Extendable if kMinLanes check not required?
+#if HWY_TARGET == HWY_RVV
+    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, 0, Test>::Do(
+        kMinLanes);
+#elif HWY_HAVE_SCALABLE
+    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -3, Test>::Do(
+        kMinLanes);
+#endif
+#endif  // HWY_TARGET == HWY_SCALAR
+  }
+};
+
+// For LowerHalf/Quarter.
+template <class Test, int kPow2 = 1>
+class ForHalfVectors {
+  mutable bool called_ = false;
+
+ public:
+  ~ForHalfVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    called_ = true;
+#if HWY_TARGET == HWY_SCALAR
+    detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
+#else
+    constexpr size_t kMinLanes = size_t{1} << kPow2;
+    // For shrinking, an upper limit is unnecessary.
+    constexpr size_t kMaxCapped = HWY_LANES(T);
+    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(
+        kMinLanes, kMaxCapped);
+
+// TODO(janwas): call Extendable if kMinLanes check not required?
+#if HWY_TARGET == HWY_RVV
+    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, 0, Test>::Do(
+        kMinLanes);
+#elif HWY_HAVE_SCALABLE
+    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -3, Test>::Do(
+        kMinLanes);
+#endif
+#endif  // HWY_TARGET == HWY_SCALAR
+  }
+};
+
+// Calls Test for all power of two N in [1, Lanes(d)]. This is the default
+// for ops that do not narrow nor widen their input, nor require 128 bits.
+template <class Test>
+class ForPartialVectors {
+  mutable bool called_ = false;
+
+ public:
+  ~ForPartialVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T t) const {
+    called_ = true;
+#if HWY_TARGET == HWY_SCALAR
+    (void)t;
+    detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
+#else
+    ForExtendableVectors<Test, 0>()(t);
+#endif
+  }
+};
+
+// Type lists to shorten call sites:
+
+template <class Func>
+void ForSignedTypes(const Func& func) {
+  func(int8_t());
+  func(int16_t());
+  func(int32_t());
+#if HWY_HAVE_INTEGER64
+  func(int64_t());
+#endif
+}
+
+template <class Func>
+void ForUnsignedTypes(const Func& func) {
+  func(uint8_t());
+  func(uint16_t());
+  func(uint32_t());
+#if HWY_HAVE_INTEGER64
+  func(uint64_t());
+#endif
+}
+
+template <class Func>
+void ForIntegerTypes(const Func& func) {
+  ForSignedTypes(func);
+  ForUnsignedTypes(func);
+}
+
+template <class Func>
+void ForFloatTypes(const Func& func) {
+  func(float());
+#if HWY_HAVE_FLOAT64
+  func(double());
+#endif
+}
+
+template <class Func>
+void ForAllTypes(const Func& func) {
+  ForIntegerTypes(func);
+  ForFloatTypes(func);
+}
+
+template <class Func>
+void ForUI8(const Func& func) {
+  func(uint8_t());
+  func(int8_t());
+}
+
+template <class Func>
+void ForUI16(const Func& func) {
+  func(uint16_t());
+  func(int16_t());
+}
+
+template <class Func>
+void ForUIF16(const Func& func) {
+  ForUI16(func);
+#if HWY_HAVE_FLOAT16
+  func(float16_t());
+#endif
+}
+
+template <class Func>
+void ForUI32(const Func& func) {
+  func(uint32_t());
+  func(int32_t());
+}
+
+template <class Func>
+void ForUIF32(const Func& func) {
+  ForUI32(func);
+  func(float());
+}
+
+template <class Func>
+void ForUI64(const Func& func) {
+#if HWY_HAVE_INTEGER64
+  func(uint64_t());
+  func(int64_t());
+#endif
+}
+
+template <class Func>
+void ForUIF64(const Func& func) {
+  ForUI64(func);
+#if HWY_HAVE_FLOAT64
+  func(double());
+#endif
+}
+
+template <class Func>
+void ForUI3264(const Func& func) {
+  ForUI32(func);
+  ForUI64(func);
+}
+
+template <class Func>
+void ForUIF3264(const Func& func) {
+  ForUIF32(func);
+  ForUIF64(func);
+}
+
+template <class Func>
+void ForUI163264(const Func& func) {
+  ForUI16(func);
+  ForUI3264(func);
+}
+
+template <class Func>
+void ForUIF163264(const Func& func) {
+  ForUIF16(func);
+  ForUIF3264(func);
+}
+
+// For tests that involve loops, adjust the trip count so that emulated tests
+// finish quickly (but always at least 2 iterations to ensure some diversity).
+constexpr size_t AdjustedReps(size_t max_reps) {
+#if HWY_ARCH_RVV
+  return HWY_MAX(max_reps / 32, 2);
+#elif HWY_IS_DEBUG_BUILD
+  return HWY_MAX(max_reps / 8, 2);
+#elif HWY_ARCH_ARM
+  return HWY_MAX(max_reps / 4, 2);
+#else
+  return HWY_MAX(max_reps, 2);
+#endif
+}
+
+// Same as above, but the loop trip count will be 1 << max_pow2.
+constexpr size_t AdjustedLog2Reps(size_t max_pow2) {
+  // If "negative" (unsigned wraparound), use original.
+#if HWY_ARCH_RVV
+  return HWY_MIN(max_pow2 - 4, max_pow2);
+#elif HWY_IS_DEBUG_BUILD
+  return HWY_MIN(max_pow2 - 1, max_pow2);
+#elif HWY_ARCH_ARM
+  return HWY_MIN(max_pow2 - 1, max_pow2);
+#else
+  return max_pow2;
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  // per-target include guard
diff --git a/third_party/highway/hwy/tests/test_util.cc b/third_party/highway/hwy/tests/test_util.cc
new file mode 100644
index 0000000000..a0796b15f9
--- /dev/null
+++ b/third_party/highway/hwy/tests/test_util.cc
@@ -0,0 +1,117 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/tests/test_util.h"
+
+#include <stddef.h>
+#include <stdio.h>
+
+#include <cmath>
+
+#include "hwy/base.h"
+#include "hwy/print.h"
+
+namespace hwy {
+
+HWY_TEST_DLLEXPORT bool BytesEqual(const void* p1, const void* p2,
+                                   const size_t size, size_t* pos) {
+  const uint8_t* bytes1 = reinterpret_cast<const uint8_t*>(p1);
+  const uint8_t* bytes2 = reinterpret_cast<const uint8_t*>(p2);
+  for (size_t i = 0; i < size; ++i) {
+    if (bytes1[i] != bytes2[i]) {
+      if (pos != nullptr) {
+        *pos = i;
+      }
+      return false;
+    }
+  }
+  return true;
+}
+
+void AssertStringEqual(const char* expected, const char* actual,
+                       const char* target_name, const char* filename,
+                       int line) {
+  while (*expected == *actual++) {
+    if (*expected++ == '\0') return;
+  }
+
+  Abort(filename, line, "%s string mismatch: expected '%s', got '%s'.\n",
+        target_name, expected, actual);
+}
+
+namespace detail {
+
+HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr,
+                                const void* actual_ptr) {
+  if (!info.is_float) {
+    return BytesEqual(expected_ptr, actual_ptr, info.sizeof_t);
+  }
+
+  if (info.sizeof_t == 4) {
+    float expected, actual;
+    CopyBytes<4>(expected_ptr, &expected);
+    CopyBytes<4>(actual_ptr, &actual);
+    return ComputeUlpDelta(expected, actual) <= 1;
+  } else if (info.sizeof_t == 8) {
+    double expected, actual;
+    CopyBytes<8>(expected_ptr, &expected);
+    CopyBytes<8>(actual_ptr, &actual);
+    return ComputeUlpDelta(expected, actual) <= 1;
+  } else {
+    HWY_ABORT("Unexpected float size %d\n", static_cast<int>(info.sizeof_t));
+    return false;
+  }
+}
+
+HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort(
+    const TypeInfo& info, const void* expected_ptr, const void* actual_ptr,
+    const char* target_name, const char* filename, int line, size_t lane,
+    size_t num_lanes) {
+  char type_name[100];
+  TypeName(info, 1, type_name);
+  char expected_str[100];
+  ToString(info, expected_ptr, expected_str);
+  char actual_str[100];
+  ToString(info, actual_ptr, actual_str);
+  Abort(filename, line,
+        "%s, %sx%d lane %d mismatch: expected '%s', got '%s'.\n", target_name,
+        type_name, static_cast<int>(num_lanes), static_cast<int>(lane),
+        expected_str, actual_str);
+}
+
+HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info,
+                                         const void* expected_void,
+                                         const void* actual_void, size_t N,
+                                         const char* target_name,
+                                         const char* filename, int line) {
+  const uint8_t* expected_array =
+      reinterpret_cast<const uint8_t*>(expected_void);
+  const uint8_t* actual_array = reinterpret_cast<const uint8_t*>(actual_void);
+  for (size_t i = 0; i < N; ++i) {
+    const void* expected_ptr = expected_array + i * info.sizeof_t;
+    const void* actual_ptr = actual_array + i * info.sizeof_t;
+    if (!IsEqual(info, expected_ptr, actual_ptr)) {
+      fprintf(stderr, "\n\n");
+      PrintArray(info, "expect", expected_array, N, i);
+      PrintArray(info, "actual", actual_array, N, i);
+
+      PrintMismatchAndAbort(info, expected_ptr, actual_ptr, target_name,
+                            filename, line, i, N);
+    }
+  }
+}
+
+}  // namespace detail
+}  // namespace hwy
diff --git a/third_party/highway/hwy/tests/test_util.h b/third_party/highway/hwy/tests/test_util.h
new file mode 100644
index 0000000000..558d1bcfba
--- /dev/null
+++ b/third_party/highway/hwy/tests/test_util.h
@@ -0,0 +1,173 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HWY_TESTS_TEST_UTIL_H_
+#define HWY_TESTS_TEST_UTIL_H_
+
+// Target-independent helper functions for use by *_test.cc.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <cmath>  // std::isnan
+#include <string>
+
+#include "hwy/aligned_allocator.h"
+#include "hwy/base.h"
+#include "hwy/highway.h"
+#include "hwy/highway_export.h"
+#include "hwy/print.h"
+
+namespace hwy {
+
+// The maximum vector size used in tests when defining test data. DEPRECATED.
+constexpr size_t kTestMaxVectorSize = 64;
+
+// 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937,
+// which triggers a compiler bug.
+class RandomState {
+ public:
+  explicit RandomState(const uint64_t seed = 0x123456789ull) {
+    s0_ = SplitMix64(seed + 0x9E3779B97F4A7C15ull);
+    s1_ = SplitMix64(s0_);
+  }
+
+  HWY_INLINE uint64_t operator()() {
+    uint64_t s1 = s0_;
+    const uint64_t s0 = s1_;
+    const uint64_t bits = s1 + s0;
+    s0_ = s0;
+    s1 ^= s1 << 23;
+    s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
+    s1_ = s1;
+    return bits;
+  }
+
+ private:
+  static uint64_t SplitMix64(uint64_t z) {
+    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
+    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
+    return z ^ (z >> 31);
+  }
+
+  uint64_t s0_;
+  uint64_t s1_;
+};
+
+static HWY_INLINE uint32_t Random32(RandomState* rng) {
+  return static_cast<uint32_t>((*rng)());
+}
+
+static HWY_INLINE uint64_t Random64(RandomState* rng) { return (*rng)(); }
+
+// Prevents the compiler from eliding the computations that led to "output".
+// Works by indicating to the compiler that "output" is being read and modified.
+// The +r constraint avoids unnecessary writes to memory, but only works for
+// built-in types.
+template <class T>
+inline void PreventElision(T&& output) {
+#if HWY_COMPILER_MSVC
+  (void)output;
+#else   // HWY_COMPILER_MSVC
+  asm volatile("" : "+r"(output) : : "memory");
+#endif  // HWY_COMPILER_MSVC
+}
+
+HWY_TEST_DLLEXPORT bool BytesEqual(const void* p1, const void* p2,
+                                   const size_t size, size_t* pos = nullptr);
+
+void AssertStringEqual(const char* expected, const char* actual,
+                       const char* target_name, const char* filename, int line);
+
+namespace detail {
+
+template <typename T, typename TU = MakeUnsigned<T>>
+TU ComputeUlpDelta(const T expected, const T actual) {
+  // Handle -0 == 0 and infinities.
+  if (expected == actual) return 0;
+
+  // Consider "equal" if both are NaN, so we can verify an expected NaN.
+  // Needs a special case because there are many possible NaN representations.
+  if (std::isnan(expected) && std::isnan(actual)) return 0;
+
+  // Compute the difference in units of last place. We do not need to check for
+  // differing signs; they will result in large differences, which is fine.
+  TU ux, uy;
+  CopySameSize(&expected, &ux);
+  CopySameSize(&actual, &uy);
+
+  // Avoid unsigned->signed cast: 2's complement is only guaranteed by C++20.
+  const TU ulp = HWY_MAX(ux, uy) - HWY_MIN(ux, uy);
+  return ulp;
+}
+
+HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr,
+                                const void* actual_ptr);
+
+HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort(
+    const TypeInfo& info, const void* expected_ptr, const void* actual_ptr,
+    const char* target_name, const char* filename, int line, size_t lane = 0,
+    size_t num_lanes = 1);
+
+HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info,
+                                         const void* expected_void,
+                                         const void* actual_void, size_t N,
+                                         const char* target_name,
+                                         const char* filename, int line);
+
+}  // namespace detail
+
+// Returns a name for the vector/part/scalar. The type prefix is u/i/f for
+// unsigned/signed/floating point, followed by the number of bits per lane;
+// then 'x' followed by the number of lanes. Example: u8x16. This is useful for
+// understanding which instantiation of a generic test failed.
+template <typename T>
+std::string TypeName(T /*unused*/, size_t N) {
+  char string100[100];
+  detail::TypeName(detail::MakeTypeInfo<T>(), N, string100);
+  return string100;
+}
+
+// Compare non-vector, non-string T.
+template <typename T>
+HWY_INLINE bool IsEqual(const T expected, const T actual) {
+  const auto info = detail::MakeTypeInfo<T>();
+  return detail::IsEqual(info, &expected, &actual);
+}
+
+template <typename T>
+HWY_INLINE void AssertEqual(const T expected, const T actual,
+                            const char* target_name, const char* filename,
+                            int line, size_t lane = 0) {
+  const auto info = detail::MakeTypeInfo<T>();
+  if (!detail::IsEqual(info, &expected, &actual)) {
+    detail::PrintMismatchAndAbort(info, &expected, &actual, target_name,
+                                  filename, line, lane);
+  }
+}
+
+template <typename T>
+HWY_INLINE void AssertArrayEqual(const T* expected, const T* actual,
+                                 size_t count, const char* target_name,
+                                 const char* filename, int line) {
+  const auto info = hwy::detail::MakeTypeInfo<T>();
+  detail::AssertArrayEqual(info, expected, actual, count, target_name, filename,
+                           line);
+}
+
+}  // namespace hwy
+
+#endif  // HWY_TESTS_TEST_UTIL_H_
diff --git a/third_party/highway/hwy/tests/test_util_test.cc b/third_party/highway/hwy/tests/test_util_test.cc
new file mode 100644
index 0000000000..1911467c34
--- /dev/null
+++ b/third_party/highway/hwy/tests/test_util_test.cc
@@ -0,0 +1,107 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/test_util_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestName {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+    char num[10];
+    std::string expected = IsFloat<T>() ? "f" : (IsSigned<T>() ? "i" : "u");
+    snprintf(num, sizeof(num), "%u" , static_cast<unsigned>(sizeof(T) * 8));
+    expected += num;
+
+    const size_t N = Lanes(d);
+    if (N != 1) {
+      expected += 'x';
+      snprintf(num, sizeof(num), "%u", static_cast<unsigned>(N));
+      expected += num;
+    }
+    const std::string actual = TypeName(t, N);
+    if (expected != actual) {
+      HWY_ABORT("%s mismatch: expected '%s', got '%s'.\n",
+                hwy::TargetName(HWY_TARGET), expected.c_str(), actual.c_str());
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllName() { ForAllTypes(ForPartialVectors<TestName>()); }
+
+struct TestEqualInteger {
+  template <class T>
+  HWY_NOINLINE void operator()(T /*t*/) const {
+    HWY_ASSERT_EQ(T(0), T(0));
+    HWY_ASSERT_EQ(T(1), T(1));
+    HWY_ASSERT_EQ(T(-1), T(-1));
+    HWY_ASSERT_EQ(LimitsMin<T>(), LimitsMin<T>());
+
+    HWY_ASSERT(!IsEqual(T(0), T(1)));
+    HWY_ASSERT(!IsEqual(T(1), T(0)));
+    HWY_ASSERT(!IsEqual(T(1), T(-1)));
+    HWY_ASSERT(!IsEqual(T(-1), T(1)));
+    HWY_ASSERT(!IsEqual(LimitsMin<T>(), LimitsMax<T>()));
+    HWY_ASSERT(!IsEqual(LimitsMax<T>(), LimitsMin<T>()));
+  }
+};
+
+struct TestEqualFloat {
+  template <class T>
+  HWY_NOINLINE void operator()(T /*t*/) const {
+    HWY_ASSERT(IsEqual(T(0), T(0)));
+    HWY_ASSERT(IsEqual(T(1), T(1)));
+    HWY_ASSERT(IsEqual(T(-1), T(-1)));
+    HWY_ASSERT(IsEqual(MantissaEnd<T>(), MantissaEnd<T>()));
+
+    HWY_ASSERT(!IsEqual(T(0), T(1)));
+    HWY_ASSERT(!IsEqual(T(1), T(0)));
+    HWY_ASSERT(!IsEqual(T(1), T(-1)));
+    HWY_ASSERT(!IsEqual(T(-1), T(1)));
+    HWY_ASSERT(!IsEqual(LowestValue<T>(), HighestValue<T>()));
+    HWY_ASSERT(!IsEqual(HighestValue<T>(), LowestValue<T>()));
+  }
+};
+
+HWY_NOINLINE void TestAllEqual() {
+  ForIntegerTypes(TestEqualInteger());
+  ForFloatTypes(TestEqualFloat());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(TestUtilTest);
+HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllName);
+HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllEqual);
+}  // namespace hwy
+
+#endif