Diffstat (limited to 'third_party/highway/hwy/tests')
-rw-r--r--  third_party/highway/hwy/tests/arithmetic_test.cc  499
-rw-r--r--  third_party/highway/hwy/tests/blockwise_shift_test.cc  270
-rw-r--r--  third_party/highway/hwy/tests/blockwise_test.cc  454
-rw-r--r--  third_party/highway/hwy/tests/combine_test.cc  275
-rw-r--r--  third_party/highway/hwy/tests/compare_test.cc  509
-rw-r--r--  third_party/highway/hwy/tests/compress_test.cc  833
-rw-r--r--  third_party/highway/hwy/tests/convert_test.cc  643
-rw-r--r--  third_party/highway/hwy/tests/crypto_test.cc  553
-rw-r--r--  third_party/highway/hwy/tests/demote_test.cc  328
-rw-r--r--  third_party/highway/hwy/tests/float_test.cc  350
-rw-r--r--  third_party/highway/hwy/tests/hwy_gtest.h  157
-rw-r--r--  third_party/highway/hwy/tests/if_test.cc  175
-rw-r--r--  third_party/highway/hwy/tests/interleaved_test.cc  256
-rw-r--r--  third_party/highway/hwy/tests/list_targets.cc  71
-rw-r--r--  third_party/highway/hwy/tests/logical_test.cc  246
-rw-r--r--  third_party/highway/hwy/tests/mask_mem_test.cc  197
-rw-r--r--  third_party/highway/hwy/tests/mask_test.cc  295
-rw-r--r--  third_party/highway/hwy/tests/memory_test.cc  343
-rw-r--r--  third_party/highway/hwy/tests/mul_test.cc  526
-rw-r--r--  third_party/highway/hwy/tests/reduction_test.cc  261
-rw-r--r--  third_party/highway/hwy/tests/reverse_test.cc  186
-rw-r--r--  third_party/highway/hwy/tests/shift_test.cc  428
-rw-r--r--  third_party/highway/hwy/tests/swizzle_test.cc  272
-rw-r--r--  third_party/highway/hwy/tests/test_util-inl.h  665
-rw-r--r--  third_party/highway/hwy/tests/test_util.cc  117
-rw-r--r--  third_party/highway/hwy/tests/test_util.h  173
-rw-r--r--  third_party/highway/hwy/tests/test_util_test.cc  107
27 files changed, 9189 insertions, 0 deletions
diff --git a/third_party/highway/hwy/tests/arithmetic_test.cc b/third_party/highway/hwy/tests/arithmetic_test.cc
new file mode 100644
index 0000000000..fa533228a0
--- /dev/null
+++ b/third_party/highway/hwy/tests/arithmetic_test.cc
@@ -0,0 +1,499 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/arithmetic_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestPlusMinus {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v2 = Iota(d, T(2));
+ const auto v3 = Iota(d, T(3));
+ const auto v4 = Iota(d, T(4));
+
+ const size_t N = Lanes(d);
+ auto lanes = AllocateAligned<T>(N);
+ for (size_t i = 0; i < N; ++i) {
+ lanes[i] = static_cast<T>((2 + i) + (3 + i));
+ }
+ HWY_ASSERT_VEC_EQ(d, lanes.get(), Add(v2, v3));
+ HWY_ASSERT_VEC_EQ(d, Set(d, 2), Sub(v4, v2));
+
+ for (size_t i = 0; i < N; ++i) {
+ lanes[i] = static_cast<T>((2 + i) + (4 + i));
+ }
+ auto sum = v2;
+ sum = Add(sum, v4); // sum == 6,8..
+ HWY_ASSERT_VEC_EQ(d, Load(d, lanes.get()), sum);
+
+ sum = Sub(sum, v4);
+ HWY_ASSERT_VEC_EQ(d, v2, sum);
+ }
+};
+
+struct TestPlusMinusOverflow {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v1 = Iota(d, T(1));
+ const auto vMax = Iota(d, LimitsMax<T>());
+ const auto vMin = Iota(d, LimitsMin<T>());
+
+ // Check that no undefined behavior is triggered. The asserts are trivially
+ // true; they only prevent the compiler from dropping the calculations.
+ HWY_ASSERT_VEC_EQ(d, Add(v1, vMax), Add(vMax, v1));
+ HWY_ASSERT_VEC_EQ(d, Add(vMax, vMax), Add(vMax, vMax));
+ HWY_ASSERT_VEC_EQ(d, Sub(vMin, v1), Sub(vMin, v1));
+ HWY_ASSERT_VEC_EQ(d, Sub(vMin, vMax), Sub(vMin, vMax));
+ }
+};
+
+HWY_NOINLINE void TestAllPlusMinus() {
+ ForAllTypes(ForPartialVectors<TestPlusMinus>());
+ ForIntegerTypes(ForPartialVectors<TestPlusMinusOverflow>());
+}
+
+struct TestUnsignedSaturatingArithmetic {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ const auto vi = Iota(d, 1);
+ const auto vm = Set(d, LimitsMax<T>());
+
+ HWY_ASSERT_VEC_EQ(d, Add(v0, v0), SaturatedAdd(v0, v0));
+ HWY_ASSERT_VEC_EQ(d, Add(v0, vi), SaturatedAdd(v0, vi));
+ HWY_ASSERT_VEC_EQ(d, Add(v0, vm), SaturatedAdd(v0, vm));
+ HWY_ASSERT_VEC_EQ(d, vm, SaturatedAdd(vi, vm));
+ HWY_ASSERT_VEC_EQ(d, vm, SaturatedAdd(vm, vm));
+
+ HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(v0, v0));
+ HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(v0, vi));
+ HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(vi, vi));
+ HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(vi, vm));
+ HWY_ASSERT_VEC_EQ(d, Sub(vm, vi), SaturatedSub(vm, vi));
+ }
+};
+
+struct TestSignedSaturatingArithmetic {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ const auto vpm = Set(d, LimitsMax<T>());
+ // Ensure all lanes are positive, even if Iota wraps around
+ const auto vi = Or(And(Iota(d, 0), vpm), Set(d, 1));
+ const auto vn = Sub(v0, vi);
+ const auto vnm = Set(d, LimitsMin<T>());
+ HWY_ASSERT_MASK_EQ(d, MaskTrue(d), Gt(vi, v0));
+ HWY_ASSERT_MASK_EQ(d, MaskTrue(d), Lt(vn, v0));
+
+ HWY_ASSERT_VEC_EQ(d, v0, SaturatedAdd(v0, v0));
+ HWY_ASSERT_VEC_EQ(d, vi, SaturatedAdd(v0, vi));
+ HWY_ASSERT_VEC_EQ(d, vpm, SaturatedAdd(v0, vpm));
+ HWY_ASSERT_VEC_EQ(d, vpm, SaturatedAdd(vi, vpm));
+ HWY_ASSERT_VEC_EQ(d, vpm, SaturatedAdd(vpm, vpm));
+
+ HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(v0, v0));
+ HWY_ASSERT_VEC_EQ(d, Sub(v0, vi), SaturatedSub(v0, vi));
+ HWY_ASSERT_VEC_EQ(d, vn, SaturatedSub(vn, v0));
+ HWY_ASSERT_VEC_EQ(d, vnm, SaturatedSub(vnm, vi));
+ HWY_ASSERT_VEC_EQ(d, vnm, SaturatedSub(vnm, vpm));
+ }
+};
+
+struct TestSaturatingArithmeticOverflow {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v1 = Iota(d, T(1));
+ const auto vMax = Iota(d, LimitsMax<T>());
+ const auto vMin = Iota(d, LimitsMin<T>());
+
+ // Check that no undefined behavior is triggered. The asserts are trivially
+ // true; they only prevent the compiler from dropping the calculations.
+ HWY_ASSERT_VEC_EQ(d, SaturatedAdd(v1, vMax), SaturatedAdd(vMax, v1));
+ HWY_ASSERT_VEC_EQ(d, SaturatedAdd(vMax, vMax), SaturatedAdd(vMax, vMax));
+ HWY_ASSERT_VEC_EQ(d, SaturatedAdd(vMin, vMax), SaturatedAdd(vMin, vMax));
+ HWY_ASSERT_VEC_EQ(d, SaturatedAdd(vMin, vMin), SaturatedAdd(vMin, vMin));
+ HWY_ASSERT_VEC_EQ(d, SaturatedSub(vMin, v1), SaturatedSub(vMin, v1));
+ HWY_ASSERT_VEC_EQ(d, SaturatedSub(vMin, vMax), SaturatedSub(vMin, vMax));
+ HWY_ASSERT_VEC_EQ(d, SaturatedSub(vMax, vMin), SaturatedSub(vMax, vMin));
+ HWY_ASSERT_VEC_EQ(d, SaturatedSub(vMin, vMin), SaturatedSub(vMin, vMin));
+ }
+};
+
+HWY_NOINLINE void TestAllSaturatingArithmetic() {
+ const ForPartialVectors<TestUnsignedSaturatingArithmetic> test_unsigned;
+ test_unsigned(uint8_t());
+ test_unsigned(uint16_t());
+
+ const ForPartialVectors<TestSignedSaturatingArithmetic> test_signed;
+ test_signed(int8_t());
+ test_signed(int16_t());
+
+ const ForPartialVectors<TestSaturatingArithmeticOverflow> test_overflow;
+ test_overflow(int8_t());
+ test_overflow(uint8_t());
+ test_overflow(int16_t());
+ test_overflow(uint16_t());
+}
+
+struct TestAverage {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ const auto v1 = Set(d, T(1));
+ const auto v2 = Set(d, T(2));
+
+ HWY_ASSERT_VEC_EQ(d, v0, AverageRound(v0, v0));
+ HWY_ASSERT_VEC_EQ(d, v1, AverageRound(v0, v1));
+ HWY_ASSERT_VEC_EQ(d, v1, AverageRound(v1, v1));
+ HWY_ASSERT_VEC_EQ(d, v2, AverageRound(v1, v2));
+ HWY_ASSERT_VEC_EQ(d, v2, AverageRound(v2, v2));
+ }
+};
+
+HWY_NOINLINE void TestAllAverage() {
+ const ForPartialVectors<TestAverage> test;
+ test(uint8_t());
+ test(uint16_t());
+}
+
+struct TestAbs {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ const auto vp1 = Set(d, T(1));
+ const auto vn1 = Set(d, T(-1));
+ const auto vpm = Set(d, LimitsMax<T>());
+ const auto vnm = Set(d, LimitsMin<T>());
+
+ HWY_ASSERT_VEC_EQ(d, v0, Abs(v0));
+ HWY_ASSERT_VEC_EQ(d, vp1, Abs(vp1));
+ HWY_ASSERT_VEC_EQ(d, vp1, Abs(vn1));
+ HWY_ASSERT_VEC_EQ(d, vpm, Abs(vpm));
+ HWY_ASSERT_VEC_EQ(d, vnm, Abs(vnm));
+ }
+};
+
+struct TestFloatAbs {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ const auto vp1 = Set(d, T(1));
+ const auto vn1 = Set(d, T(-1));
+ const auto vp2 = Set(d, T(0.01));
+ const auto vn2 = Set(d, T(-0.01));
+
+ HWY_ASSERT_VEC_EQ(d, v0, Abs(v0));
+ HWY_ASSERT_VEC_EQ(d, vp1, Abs(vp1));
+ HWY_ASSERT_VEC_EQ(d, vp1, Abs(vn1));
+ HWY_ASSERT_VEC_EQ(d, vp2, Abs(vp2));
+ HWY_ASSERT_VEC_EQ(d, vp2, Abs(vn2));
+ }
+};
+
+HWY_NOINLINE void TestAllAbs() {
+ ForSignedTypes(ForPartialVectors<TestAbs>());
+ ForFloatTypes(ForPartialVectors<TestFloatAbs>());
+}
+
+struct TestNeg {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ const auto vn = Set(d, T(-3));
+ const auto vp = Set(d, T(3));
+ HWY_ASSERT_VEC_EQ(d, v0, Neg(v0));
+ HWY_ASSERT_VEC_EQ(d, vp, Neg(vn));
+ HWY_ASSERT_VEC_EQ(d, vn, Neg(vp));
+ }
+};
+
+struct TestNegOverflow {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto vn = Set(d, LimitsMin<T>());
+ const auto vp = Set(d, LimitsMax<T>());
+ HWY_ASSERT_VEC_EQ(d, Neg(vn), Neg(vn));
+ HWY_ASSERT_VEC_EQ(d, Neg(vp), Neg(vp));
+ }
+};
+
+HWY_NOINLINE void TestAllNeg() {
+ ForSignedTypes(ForPartialVectors<TestNeg>());
+ ForFloatTypes(ForPartialVectors<TestNeg>());
+ ForSignedTypes(ForPartialVectors<TestNegOverflow>());
+}
+
+struct TestUnsignedMinMax {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ // Leave headroom such that v1 < v2 even after wraparound.
+ const auto mod = And(Iota(d, 0), Set(d, LimitsMax<T>() >> 1));
+ const auto v1 = Add(mod, Set(d, 1));
+ const auto v2 = Add(mod, Set(d, 2));
+ HWY_ASSERT_VEC_EQ(d, v1, Min(v1, v2));
+ HWY_ASSERT_VEC_EQ(d, v2, Max(v1, v2));
+ HWY_ASSERT_VEC_EQ(d, v0, Min(v1, v0));
+ HWY_ASSERT_VEC_EQ(d, v1, Max(v1, v0));
+
+ const auto vmin = Set(d, LimitsMin<T>());
+ const auto vmax = Set(d, LimitsMax<T>());
+
+ HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, vmax));
+ HWY_ASSERT_VEC_EQ(d, vmin, Min(vmax, vmin));
+
+ HWY_ASSERT_VEC_EQ(d, vmax, Max(vmin, vmax));
+ HWY_ASSERT_VEC_EQ(d, vmax, Max(vmax, vmin));
+ }
+};
+
+struct TestSignedMinMax {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ // Leave headroom such that v1 < v2 even after wraparound.
+ const auto mod = And(Iota(d, 0), Set(d, LimitsMax<T>() >> 1));
+ const auto v1 = Add(mod, Set(d, 1));
+ const auto v2 = Add(mod, Set(d, 2));
+ const auto v_neg = Sub(Zero(d), v1);
+ HWY_ASSERT_VEC_EQ(d, v1, Min(v1, v2));
+ HWY_ASSERT_VEC_EQ(d, v2, Max(v1, v2));
+ HWY_ASSERT_VEC_EQ(d, v_neg, Min(v1, v_neg));
+ HWY_ASSERT_VEC_EQ(d, v1, Max(v1, v_neg));
+
+ const auto v0 = Zero(d);
+ const auto vmin = Set(d, LimitsMin<T>());
+ const auto vmax = Set(d, LimitsMax<T>());
+ HWY_ASSERT_VEC_EQ(d, vmin, Min(v0, vmin));
+ HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, v0));
+ HWY_ASSERT_VEC_EQ(d, v0, Max(v0, vmin));
+ HWY_ASSERT_VEC_EQ(d, v0, Max(vmin, v0));
+
+ HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, vmax));
+ HWY_ASSERT_VEC_EQ(d, vmin, Min(vmax, vmin));
+
+ HWY_ASSERT_VEC_EQ(d, vmax, Max(vmin, vmax));
+ HWY_ASSERT_VEC_EQ(d, vmax, Max(vmax, vmin));
+ }
+};
+
+struct TestFloatMinMax {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v1 = Iota(d, 1);
+ const auto v2 = Iota(d, 2);
+ const auto v_neg = Iota(d, -T(Lanes(d)));
+ HWY_ASSERT_VEC_EQ(d, v1, Min(v1, v2));
+ HWY_ASSERT_VEC_EQ(d, v2, Max(v1, v2));
+ HWY_ASSERT_VEC_EQ(d, v_neg, Min(v1, v_neg));
+ HWY_ASSERT_VEC_EQ(d, v1, Max(v1, v_neg));
+
+ const auto v0 = Zero(d);
+ const auto vmin = Set(d, T(-1E30));
+ const auto vmax = Set(d, T(1E30));
+ HWY_ASSERT_VEC_EQ(d, vmin, Min(v0, vmin));
+ HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, v0));
+ HWY_ASSERT_VEC_EQ(d, v0, Max(v0, vmin));
+ HWY_ASSERT_VEC_EQ(d, v0, Max(vmin, v0));
+
+ HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, vmax));
+ HWY_ASSERT_VEC_EQ(d, vmin, Min(vmax, vmin));
+
+ HWY_ASSERT_VEC_EQ(d, vmax, Max(vmin, vmax));
+ HWY_ASSERT_VEC_EQ(d, vmax, Max(vmax, vmin));
+ }
+};
+
+HWY_NOINLINE void TestAllMinMax() {
+ ForUnsignedTypes(ForPartialVectors<TestUnsignedMinMax>());
+ ForSignedTypes(ForPartialVectors<TestSignedMinMax>());
+ ForFloatTypes(ForPartialVectors<TestFloatMinMax>());
+}
+
+template <class D>
+static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
+ alignas(16) uint64_t in[2];
+ in[0] = lo;
+ in[1] = hi;
+ return LoadDup128(d, in);
+}
+
+struct TestMinMax128 {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using V = Vec<D>;
+ const size_t N = Lanes(d);
+ auto a_lanes = AllocateAligned<T>(N);
+ auto b_lanes = AllocateAligned<T>(N);
+ auto min_lanes = AllocateAligned<T>(N);
+ auto max_lanes = AllocateAligned<T>(N);
+ RandomState rng;
+
+ const V v00 = Zero(d);
+ const V v01 = Make128(d, 0, 1);
+ const V v10 = Make128(d, 1, 0);
+ const V v11 = Add(v01, v10);
+
+ // Same arg
+ HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v00));
+ HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v01));
+ HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v10));
+ HWY_ASSERT_VEC_EQ(d, v11, Min128(d, v11, v11));
+ HWY_ASSERT_VEC_EQ(d, v00, Max128(d, v00, v00));
+ HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v01));
+ HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v10));
+ HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v11));
+
+ // First arg less
+ HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v01));
+ HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v10));
+ HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v11));
+ HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v00, v01));
+ HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v01, v10));
+ HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v10, v11));
+
+ // Second arg less
+ HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v01, v00));
+ HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v10, v01));
+ HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v11, v10));
+ HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v00));
+ HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v01));
+ HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v10));
+
+ // Also check 128-bit blocks are independent
+ for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ a_lanes[i] = Random64(&rng);
+ b_lanes[i] = Random64(&rng);
+ }
+ const V a = Load(d, a_lanes.get());
+ const V b = Load(d, b_lanes.get());
+ for (size_t i = 0; i < N; i += 2) {
+ const bool lt = a_lanes[i + 1] == b_lanes[i + 1]
+ ? (a_lanes[i] < b_lanes[i])
+ : (a_lanes[i + 1] < b_lanes[i + 1]);
+ min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0];
+ min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1];
+ max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0];
+ max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1];
+ }
+ HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128(d, a, b));
+ HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128(d, a, b));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllMinMax128() {
+ ForGEVectors<128, TestMinMax128>()(uint64_t());
+}
+
+struct TestMinMax128Upper {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using V = Vec<D>;
+ const size_t N = Lanes(d);
+ auto a_lanes = AllocateAligned<T>(N);
+ auto b_lanes = AllocateAligned<T>(N);
+ auto min_lanes = AllocateAligned<T>(N);
+ auto max_lanes = AllocateAligned<T>(N);
+ RandomState rng;
+
+ const V v00 = Zero(d);
+ const V v01 = Make128(d, 0, 1);
+ const V v10 = Make128(d, 1, 0);
+ const V v11 = Add(v01, v10);
+
+ // Same arg
+ HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v00, v00));
+ HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v01));
+ HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v10, v10));
+ HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v11, v11));
+ HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v00, v00));
+ HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v01, v01));
+ HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v10));
+ HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v11, v11));
+
+ // Equivalent but not equal (chooses second arg)
+ HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v00, v01));
+ HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v10, v11));
+ HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v01, v00));
+ HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v11, v10));
+ HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v01, v00));
+ HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v11, v10));
+ HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v00, v01));
+ HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v10, v11));
+
+ // First arg less
+ HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v10));
+ HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v01, v10));
+
+ // Second arg less
+ HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v10, v01));
+ HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v01));
+
+ // Also check 128-bit blocks are independent
+ for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ a_lanes[i] = Random64(&rng);
+ b_lanes[i] = Random64(&rng);
+ }
+ const V a = Load(d, a_lanes.get());
+ const V b = Load(d, b_lanes.get());
+ for (size_t i = 0; i < N; i += 2) {
+ const bool lt = a_lanes[i + 1] < b_lanes[i + 1];
+ min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0];
+ min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1];
+ max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0];
+ max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1];
+ }
+ HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128Upper(d, a, b));
+ HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128Upper(d, a, b));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllMinMax128Upper() {
+ ForGEVectors<128, TestMinMax128Upper>()(uint64_t());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyArithmeticTest);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128Upper);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/blockwise_shift_test.cc b/third_party/highway/hwy/tests/blockwise_shift_test.cc
new file mode 100644
index 0000000000..4e5250841b
--- /dev/null
+++ b/third_party/highway/hwy/tests/blockwise_shift_test.cc
@@ -0,0 +1,270 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h> // memcpy
+
+#include <algorithm> // std::fill
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/blockwise_shift_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestShiftBytes {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ // Scalar does not define Shift*Bytes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+ const Repartition<uint8_t, D> du8;
+ const size_t N8 = Lanes(du8);
+
+ // Zero remains zero
+ const auto v0 = Zero(d);
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(v0));
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(d, v0));
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(d, v0));
+
+ // Zero after shifting out the high/low byte
+ auto bytes = AllocateAligned<uint8_t>(N8);
+ std::fill(bytes.get(), bytes.get() + N8, 0);
+ bytes[N8 - 1] = 0x7F;
+ const auto vhi = BitCast(d, Load(du8, bytes.get()));
+ bytes[N8 - 1] = 0;
+ bytes[0] = 0x7F;
+ const auto vlo = BitCast(d, Load(du8, bytes.get()));
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(vhi));
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(d, vhi));
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(d, vlo));
+
+ // Check expected result with Iota
+ const size_t N = Lanes(d);
+ auto in = AllocateAligned<T>(N);
+ const uint8_t* in_bytes = reinterpret_cast<const uint8_t*>(in.get());
+ const auto v = BitCast(d, Iota(du8, 1));
+ Store(v, d, in.get());
+
+ auto expected = AllocateAligned<T>(N);
+ uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
+
+ const size_t block_size = HWY_MIN(N8, 16);
+ for (size_t block = 0; block < N8; block += block_size) {
+ expected_bytes[block] = 0;
+ memcpy(expected_bytes + block + 1, in_bytes + block, block_size - 1);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(v));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(d, v));
+
+ for (size_t block = 0; block < N8; block += block_size) {
+ memcpy(expected_bytes + block, in_bytes + block + 1, block_size - 1);
+ expected_bytes[block + block_size - 1] = 0;
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightBytes<1>(d, v));
+#else
+ (void)d;
+#endif // #if HWY_TARGET != HWY_SCALAR
+ }
+};
+
+HWY_NOINLINE void TestAllShiftBytes() {
+ ForIntegerTypes(ForPartialVectors<TestShiftBytes>());
+}
+
+struct TestShiftLeftLanes {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ // Scalar does not define Shift*Lanes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+ const auto v = Iota(d, T(1));
+ const size_t N = Lanes(d);
+ if (N == 1) return;
+ auto expected = AllocateAligned<T>(N);
+
+ HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(v));
+ HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(d, v));
+
+ constexpr size_t kLanesPerBlock = 16 / sizeof(T);
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = (i % kLanesPerBlock) == 0 ? T(0) : T(i);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(v));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(d, v));
+#else
+ (void)d;
+#endif // #if HWY_TARGET != HWY_SCALAR
+ }
+};
+
+struct TestShiftRightLanes {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ // Scalar does not define Shift*Lanes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+ const auto v = Iota(d, T(1));
+ const size_t N = Lanes(d);
+ if (N == 1) return;
+ auto expected = AllocateAligned<T>(N);
+
+ HWY_ASSERT_VEC_EQ(d, v, ShiftRightLanes<0>(d, v));
+
+ constexpr size_t kLanesPerBlock = 16 / sizeof(T);
+
+ for (size_t i = 0; i < N; ++i) {
+ const size_t mod = i % kLanesPerBlock;
+ expected[i] = mod == (kLanesPerBlock - 1) || i >= N - 1 ? T(0) : T(2 + i);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightLanes<1>(d, v));
+#else
+ (void)d;
+#endif // #if HWY_TARGET != HWY_SCALAR
+ }
+};
+
+HWY_NOINLINE void TestAllShiftLeftLanes() {
+ ForAllTypes(ForPartialVectors<TestShiftLeftLanes>());
+}
+
+HWY_NOINLINE void TestAllShiftRightLanes() {
+ ForAllTypes(ForPartialVectors<TestShiftRightLanes>());
+}
+
+// Scalar does not define CombineShiftRightBytes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+
+template <int kBytes>
+struct TestCombineShiftRightBytes {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T, D d) {
+ constexpr size_t kBlockSize = 16;
+ static_assert(kBytes < kBlockSize, "Shift count is per block");
+ const Repartition<uint8_t, D> d8;
+ const size_t N8 = Lanes(d8);
+ if (N8 < 16) return;
+ auto hi_bytes = AllocateAligned<uint8_t>(N8);
+ auto lo_bytes = AllocateAligned<uint8_t>(N8);
+ auto expected_bytes = AllocateAligned<uint8_t>(N8);
+ uint8_t combined[2 * kBlockSize];
+
+ // Random inputs in each lane
+ RandomState rng;
+ for (size_t rep = 0; rep < AdjustedReps(100); ++rep) {
+ for (size_t i = 0; i < N8; ++i) {
+ hi_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
+ lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
+ }
+ for (size_t i = 0; i < N8; i += kBlockSize) {
+ // Arguments are not the same size.
+ CopyBytes<kBlockSize>(&lo_bytes[i], combined);
+ CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
+ CopyBytes<kBlockSize>(combined + kBytes, &expected_bytes[i]);
+ }
+
+ const auto hi = BitCast(d, Load(d8, hi_bytes.get()));
+ const auto lo = BitCast(d, Load(d8, lo_bytes.get()));
+ const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
+ HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightBytes<kBytes>(d, hi, lo));
+ }
+ }
+};
+
+template <int kLanes>
+struct TestCombineShiftRightLanes {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T, D d) {
+ const Repartition<uint8_t, D> d8;
+ const size_t N8 = Lanes(d8);
+ if (N8 < 16) return;
+
+ auto hi_bytes = AllocateAligned<uint8_t>(N8);
+ auto lo_bytes = AllocateAligned<uint8_t>(N8);
+ auto expected_bytes = AllocateAligned<uint8_t>(N8);
+ constexpr size_t kBlockSize = 16;
+ uint8_t combined[2 * kBlockSize];
+
+ // Random inputs in each lane
+ RandomState rng;
+ for (size_t rep = 0; rep < AdjustedReps(100); ++rep) {
+ for (size_t i = 0; i < N8; ++i) {
+ hi_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
+ lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
+ }
+ for (size_t i = 0; i < N8; i += kBlockSize) {
+ // Arguments are not the same size.
+ CopyBytes<kBlockSize>(&lo_bytes[i], combined);
+ CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
+ CopyBytes<kBlockSize>(combined + kLanes * sizeof(T),
+ &expected_bytes[i]);
+ }
+
+ const auto hi = BitCast(d, Load(d8, hi_bytes.get()));
+ const auto lo = BitCast(d, Load(d8, lo_bytes.get()));
+ const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
+ HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightLanes<kLanes>(d, hi, lo));
+ }
+ }
+};
+
+#endif // #if HWY_TARGET != HWY_SCALAR
+
+struct TestCombineShiftRight {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T t, D d) {
+// Scalar does not define CombineShiftRightBytes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+ constexpr int kMaxBytes =
+ HWY_MIN(16, static_cast<int>(MaxLanes(d) * sizeof(T)));
+ constexpr int kMaxLanes = kMaxBytes / static_cast<int>(sizeof(T));
+ TestCombineShiftRightBytes<kMaxBytes - 1>()(t, d);
+ TestCombineShiftRightBytes<HWY_MAX(kMaxBytes / 2, 1)>()(t, d);
+ TestCombineShiftRightBytes<1>()(t, d);
+
+ TestCombineShiftRightLanes<kMaxLanes - 1>()(t, d);
+ TestCombineShiftRightLanes<HWY_MAX(kMaxLanes / 2, -1)>()(t, d);
+ TestCombineShiftRightLanes<1>()(t, d);
+#else
+ (void)t;
+ (void)d;
+#endif
+ }
+};
+
+HWY_NOINLINE void TestAllCombineShiftRight() {
+ // Need at least 2 lanes.
+ ForAllTypes(ForShrinkableVectors<TestCombineShiftRight>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyBlockwiseShiftTest);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllShiftBytes);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllShiftLeftLanes);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllShiftRightLanes);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllCombineShiftRight);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/blockwise_test.cc b/third_party/highway/hwy/tests/blockwise_test.cc
new file mode 100644
index 0000000000..e5ac9ab362
--- /dev/null
+++ b/third_party/highway/hwy/tests/blockwise_test.cc
@@ -0,0 +1,454 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm> // std::fill
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/blockwise_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <typename D, int kLane>
+struct TestBroadcastR {
+ HWY_NOINLINE void operator()() const {
+ using T = typename D::T;
+ const D d;
+ const size_t N = Lanes(d);
+ if (kLane >= N) return;
+ auto in_lanes = AllocateAligned<T>(N);
+ std::fill(in_lanes.get(), in_lanes.get() + N, T(0));
+ const size_t blockN = HWY_MIN(N * sizeof(T), 16) / sizeof(T);
+ // Need to set within each 128-bit block
+ for (size_t block = 0; block < N; block += blockN) {
+ in_lanes[block + kLane] = static_cast<T>(block + 1);
+ }
+ const auto in = Load(d, in_lanes.get());
+ auto expected = AllocateAligned<T>(N);
+ for (size_t block = 0; block < N; block += blockN) {
+ for (size_t i = 0; i < blockN; ++i) {
+ expected[block + i] = T(block + 1);
+ }
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Broadcast<kLane>(in));
+
+ TestBroadcastR<D, kLane - 1>()();
+ }
+};
+
+template <class D>
+struct TestBroadcastR<D, -1> {
+ void operator()() const {}
+};
+
+struct TestBroadcast {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ TestBroadcastR<D, HWY_MIN(MaxLanes(d), 16 / sizeof(T)) - 1>()();
+ }
+};
+
+HWY_NOINLINE void TestAllBroadcast() {
+ const ForPartialVectors<TestBroadcast> test;
+ // No u/i8.
+ test(uint16_t());
+ test(int16_t());
+ ForUIF3264(test);
+}
+
+template <bool kFull>
+struct ChooseTableSize {
+ template <typename T, typename DIdx>
+ using type = DIdx;
+};
+template <>
+struct ChooseTableSize<true> {
+ template <typename T, typename DIdx>
+ using type = ScalableTag<T>;
+};
+
+template <bool kFull>
+struct TestTableLookupBytes {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+ RandomState rng;
+
+ const typename ChooseTableSize<kFull>::template type<T, D> d_tbl;
+ const Repartition<uint8_t, decltype(d_tbl)> d_tbl8;
+ const size_t NT8 = Lanes(d_tbl8);
+
+ const Repartition<uint8_t, D> d8;
+ const size_t N8 = Lanes(d8);
+
+ // Random input bytes
+ auto in_bytes = AllocateAligned<uint8_t>(NT8);
+ for (size_t i = 0; i < NT8; ++i) {
+ in_bytes[i] = Random32(&rng) & 0xFF;
+ }
+ const auto in = BitCast(d_tbl, Load(d_tbl8, in_bytes.get()));
+
+ // Enough test data; for larger vectors, upper lanes will be zero.
+ const uint8_t index_bytes_source[64] = {
+ // Same index as source, multiple outputs from same input,
+ // unused input (9), ascending/descending and nonconsecutive neighbors.
+ 0, 2, 1, 2, 15, 12, 13, 14, 6, 7, 8, 5, 4, 3, 10, 11,
+ 11, 10, 3, 4, 5, 8, 7, 6, 14, 13, 12, 15, 2, 1, 2, 0,
+ 4, 3, 2, 2, 5, 6, 7, 7, 15, 15, 15, 15, 15, 15, 0, 1};
+ auto index_bytes = AllocateAligned<uint8_t>(N8);
+ const size_t max_index = HWY_MIN(NT8, 16) - 1;
+ for (size_t i = 0; i < N8; ++i) {
+ index_bytes[i] = (i < 64) ? index_bytes_source[i] : 0;
+ // Avoid asan error for partial vectors.
+ index_bytes[i] = static_cast<uint8_t>(HWY_MIN(index_bytes[i], max_index));
+ }
+ const auto indices = Load(d, reinterpret_cast<const T*>(index_bytes.get()));
+
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
+ uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
+
+ for (size_t block = 0; block < N8; block += 16) {
+ for (size_t i = 0; i < 16 && (block + i) < N8; ++i) {
+ const uint8_t index = index_bytes[block + i];
+ HWY_ASSERT(index <= max_index);
+ // Note that block + index may exceed NT8 on RVV, which is fine because
+ // the operation uses the larger of the table and index vector size.
+ HWY_ASSERT(block + index < HWY_MAX(N8, NT8));
+ // For large vectors, the lane index may wrap around due to block,
+ // also wrap around after 8-bit overflow.
+ expected_bytes[block + i] =
+ in_bytes[(block + index) % HWY_MIN(NT8, 256)];
+ }
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytes(in, indices));
+
+ // Individually test zeroing each byte position.
+ for (size_t i = 0; i < N8; ++i) {
+ const uint8_t prev_expected = expected_bytes[i];
+ const uint8_t prev_index = index_bytes[i];
+ expected_bytes[i] = 0;
+
+ const int idx = 0x80 + (static_cast<int>(Random32(&rng) & 7) << 4);
+ HWY_ASSERT(0x80 <= idx && idx < 256);
+ index_bytes[i] = static_cast<uint8_t>(idx);
+
+ const auto indices =
+ Load(d, reinterpret_cast<const T*>(index_bytes.get()));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytesOr0(in, indices));
+ expected_bytes[i] = prev_expected;
+ index_bytes[i] = prev_index;
+ }
+#else
+ (void)d;
+#endif
+ }
+};
+
+HWY_NOINLINE void TestAllTableLookupBytesSame() {
+ // Partial index, same-sized table.
+ ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<false>>());
+}
+
+HWY_NOINLINE void TestAllTableLookupBytesMixed() {
+ // Partial index, full-size table.
+ ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<true>>());
+}
+
+struct TestInterleaveLower {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using TU = MakeUnsigned<T>;
+ const size_t N = Lanes(d);
+ auto even_lanes = AllocateAligned<T>(N);
+ auto odd_lanes = AllocateAligned<T>(N);
+ auto expected = AllocateAligned<T>(N);
+ for (size_t i = 0; i < N; ++i) {
+ even_lanes[i] = static_cast<T>(2 * i + 0);
+ odd_lanes[i] = static_cast<T>(2 * i + 1);
+ }
+ const auto even = Load(d, even_lanes.get());
+ const auto odd = Load(d, odd_lanes.get());
+
+ const size_t blockN = HWY_MIN(16 / sizeof(T), N);
+ for (size_t i = 0; i < Lanes(d); ++i) {
+ const size_t block = i / blockN;
+ const size_t index = (i % blockN) + block * 2 * blockN;
+ expected[i] = static_cast<T>(index & LimitsMax<TU>());
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(even, odd));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(d, even, odd));
+ }
+};
+
+struct TestInterleaveUpper {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ if (N == 1) return;
+ auto even_lanes = AllocateAligned<T>(N);
+ auto odd_lanes = AllocateAligned<T>(N);
+ auto expected = AllocateAligned<T>(N);
+ for (size_t i = 0; i < N; ++i) {
+ even_lanes[i] = static_cast<T>(2 * i + 0);
+ odd_lanes[i] = static_cast<T>(2 * i + 1);
+ }
+ const auto even = Load(d, even_lanes.get());
+ const auto odd = Load(d, odd_lanes.get());
+
+ const size_t blockN = HWY_MIN(16 / sizeof(T), N);
+ for (size_t i = 0; i < Lanes(d); ++i) {
+ const size_t block = i / blockN;
+ expected[i] = T((i % blockN) + block * 2 * blockN + blockN);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveUpper(d, even, odd));
+ }
+};
+
+HWY_NOINLINE void TestAllInterleave() {
+ // Not DemoteVectors because this cannot be supported by HWY_SCALAR.
+ ForAllTypes(ForShrinkableVectors<TestInterleaveLower>());
+ ForAllTypes(ForShrinkableVectors<TestInterleaveUpper>());
+}
+
+struct TestZipLower {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using WideT = MakeWide<T>;
+ static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
+ static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
+ const size_t N = Lanes(d);
+ auto even_lanes = AllocateAligned<T>(N);
+ auto odd_lanes = AllocateAligned<T>(N);
+ // At least 2 lanes for HWY_SCALAR
+ auto zip_lanes = AllocateAligned<T>(HWY_MAX(N, 2));
+ const T kMaxT = LimitsMax<T>();
+ for (size_t i = 0; i < N; ++i) {
+ even_lanes[i] = static_cast<T>((2 * i + 0) & kMaxT);
+ odd_lanes[i] = static_cast<T>((2 * i + 1) & kMaxT);
+ }
+ const auto even = Load(d, even_lanes.get());
+ const auto odd = Load(d, odd_lanes.get());
+
+ const Repartition<WideT, D> dw;
+#if HWY_TARGET == HWY_SCALAR
+ // Safely handle big-endian
+ const auto expected = Set(dw, static_cast<WideT>(1ULL << (sizeof(T) * 8)));
+#else
+ const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
+ for (size_t i = 0; i < N; i += 2) {
+ const size_t base = (i / blockN) * blockN;
+ const size_t mod = i % blockN;
+ zip_lanes[i + 0] = even_lanes[mod / 2 + base];
+ zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
+ }
+ const auto expected =
+ Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get()));
+#endif // HWY_TARGET == HWY_SCALAR
+ HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(even, odd));
+ HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(dw, even, odd));
+ }
+};
+
+HWY_NOINLINE void TestAllZipLower() {
+ const ForDemoteVectors<TestZipLower> lower_unsigned;
+ lower_unsigned(uint8_t());
+ lower_unsigned(uint16_t());
+#if HWY_HAVE_INTEGER64
+ lower_unsigned(uint32_t()); // generates u64
+#endif
+
+ const ForDemoteVectors<TestZipLower> lower_signed;
+ lower_signed(int8_t());
+ lower_signed(int16_t());
+#if HWY_HAVE_INTEGER64
+ lower_signed(int32_t()); // generates i64
+#endif
+
+ // No float - concatenating f32 does not result in a f64
+}
+
+// Remove this test (so it does not show as having run) if the only target is
+// HWY_SCALAR, which does not support this op.
+#if HWY_TARGETS != HWY_SCALAR
+
+struct TestZipUpper {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET == HWY_SCALAR
+ (void)d;
+#else
+ using WideT = MakeWide<T>;
+ static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
+ static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
+ const size_t N = Lanes(d);
+ if (N < 16 / sizeof(T)) return;
+ auto even_lanes = AllocateAligned<T>(N);
+ auto odd_lanes = AllocateAligned<T>(N);
+ auto zip_lanes = AllocateAligned<T>(N);
+ const T kMaxT = LimitsMax<T>();
+ for (size_t i = 0; i < N; ++i) {
+ even_lanes[i] = static_cast<T>((2 * i + 0) & kMaxT);
+ odd_lanes[i] = static_cast<T>((2 * i + 1) & kMaxT);
+ }
+ const auto even = Load(d, even_lanes.get());
+ const auto odd = Load(d, odd_lanes.get());
+
+ const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
+
+ for (size_t i = 0; i < N; i += 2) {
+ const size_t base = (i / blockN) * blockN + blockN / 2;
+ const size_t mod = i % blockN;
+ zip_lanes[i + 0] = even_lanes[mod / 2 + base];
+ zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
+ }
+ const Repartition<WideT, D> dw;
+ const auto expected =
+ Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get()));
+ HWY_ASSERT_VEC_EQ(dw, expected, ZipUpper(dw, even, odd));
+#endif // HWY_TARGET == HWY_SCALAR
+ }
+};
+
+HWY_NOINLINE void TestAllZipUpper() {
+ const ForShrinkableVectors<TestZipUpper> upper_unsigned;
+ upper_unsigned(uint8_t());
+ upper_unsigned(uint16_t());
+#if HWY_HAVE_INTEGER64
+ upper_unsigned(uint32_t()); // generates u64
+#endif
+
+ const ForShrinkableVectors<TestZipUpper> upper_signed;
+ upper_signed(int8_t());
+ upper_signed(int16_t());
+#if HWY_HAVE_INTEGER64
+ upper_signed(int32_t()); // generates i64
+#endif
+
+ // No float - concatenating f32 does not result in a f64
+}
+
+#endif // HWY_TARGETS != HWY_SCALAR
+
+class TestSpecialShuffle32 {
+ public:
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v = Iota(d, 0);
+ VerifyLanes32(d, Shuffle2301(v), 2, 3, 0, 1, __FILE__, __LINE__);
+ VerifyLanes32(d, Shuffle1032(v), 1, 0, 3, 2, __FILE__, __LINE__);
+ VerifyLanes32(d, Shuffle0321(v), 0, 3, 2, 1, __FILE__, __LINE__);
+ VerifyLanes32(d, Shuffle2103(v), 2, 1, 0, 3, __FILE__, __LINE__);
+ VerifyLanes32(d, Shuffle0123(v), 0, 1, 2, 3, __FILE__, __LINE__);
+ }
+
+ private:
+ // HWY_INLINE works around a Clang SVE compiler bug where all but the first
+ // 128 bits (the NEON register) of actual are zero.
+ template <class D, class V>
+ HWY_INLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3,
+ const size_t i2, const size_t i1,
+ const size_t i0, const char* filename,
+ const int line) {
+ using T = TFromD<D>;
+ constexpr size_t kBlockN = 16 / sizeof(T);
+ const size_t N = Lanes(d);
+ if (N < 4) return;
+ auto expected = AllocateAligned<T>(N);
+ for (size_t block = 0; block < N; block += kBlockN) {
+ expected[block + 3] = static_cast<T>(block + i3);
+ expected[block + 2] = static_cast<T>(block + i2);
+ expected[block + 1] = static_cast<T>(block + i1);
+ expected[block + 0] = static_cast<T>(block + i0);
+ }
+ AssertVecEqual(d, expected.get(), actual, filename, line);
+ }
+};
+
+class TestSpecialShuffle64 {
+ public:
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v = Iota(d, 0);
+ VerifyLanes64(d, Shuffle01(v), 0, 1, __FILE__, __LINE__);
+ }
+
+ private:
+ // HWY_INLINE works around a Clang SVE compiler bug where all but the first
+ // 128 bits (the NEON register) of actual are zero.
+ template <class D, class V>
+ HWY_INLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1,
+ const size_t i0, const char* filename,
+ const int line) {
+ using T = TFromD<D>;
+ constexpr size_t kBlockN = 16 / sizeof(T);
+ const size_t N = Lanes(d);
+ if (N < 2) return;
+ auto expected = AllocateAligned<T>(N);
+ for (size_t block = 0; block < N; block += kBlockN) {
+ expected[block + 1] = static_cast<T>(block + i1);
+ expected[block + 0] = static_cast<T>(block + i0);
+ }
+ AssertVecEqual(d, expected.get(), actual, filename, line);
+ }
+};
+
+HWY_NOINLINE void TestAllSpecialShuffles() {
+ const ForGEVectors<128, TestSpecialShuffle32> test32;
+ test32(uint32_t());
+ test32(int32_t());
+ test32(float());
+
+#if HWY_HAVE_INTEGER64
+ const ForGEVectors<128, TestSpecialShuffle64> test64;
+ test64(uint64_t());
+ test64(int64_t());
+#endif
+
+#if HWY_HAVE_FLOAT64
+ const ForGEVectors<128, TestSpecialShuffle64> test_d;
+ test_d(double());
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyBlockwiseTest);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllBroadcast);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesSame);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesMixed);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllInterleave);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipLower);
+#if HWY_TARGETS != HWY_SCALAR
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipUpper);
+#endif
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllSpecialShuffles);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/combine_test.cc b/third_party/highway/hwy/tests/combine_test.cc
new file mode 100644
index 0000000000..e2f4cbeb00
--- /dev/null
+++ b/third_party/highway/hwy/tests/combine_test.cc
@@ -0,0 +1,275 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h> // memcpy
+
+#include <algorithm> // std::fill
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/combine_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestLowerHalf {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const Half<D> d2;
+
+ const size_t N = Lanes(d);
+ auto lanes = AllocateAligned<T>(N);
+ auto lanes2 = AllocateAligned<T>(N);
+ std::fill(lanes.get(), lanes.get() + N, T(0));
+ std::fill(lanes2.get(), lanes2.get() + N, T(0));
+ const auto v = Iota(d, 1);
+ Store(LowerHalf(d2, v), d2, lanes.get());
+ Store(LowerHalf(v), d2, lanes2.get()); // optionally without D
+ size_t i = 0;
+ for (; i < Lanes(d2); ++i) {
+ HWY_ASSERT_EQ(T(1 + i), lanes[i]);
+ HWY_ASSERT_EQ(T(1 + i), lanes2[i]);
+ }
+ // Other half remains unchanged
+ for (; i < N; ++i) {
+ HWY_ASSERT_EQ(T(0), lanes[i]);
+ HWY_ASSERT_EQ(T(0), lanes2[i]);
+ }
+ }
+};
+
+struct TestLowerQuarter {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const Half<D> d2;
+ const Half<decltype(d2)> d4;
+
+ const size_t N = Lanes(d);
+ auto lanes = AllocateAligned<T>(N);
+ auto lanes2 = AllocateAligned<T>(N);
+ std::fill(lanes.get(), lanes.get() + N, T(0));
+ std::fill(lanes2.get(), lanes2.get() + N, T(0));
+ const auto v = Iota(d, 1);
+ const auto lo = LowerHalf(d4, LowerHalf(d2, v));
+ const auto lo2 = LowerHalf(LowerHalf(v)); // optionally without D
+ Store(lo, d4, lanes.get());
+ Store(lo2, d4, lanes2.get());
+ size_t i = 0;
+ for (; i < Lanes(d4); ++i) {
+ HWY_ASSERT_EQ(T(i + 1), lanes[i]);
+ HWY_ASSERT_EQ(T(i + 1), lanes2[i]);
+ }
+ // Upper 3/4 remain unchanged
+ for (; i < N; ++i) {
+ HWY_ASSERT_EQ(T(0), lanes[i]);
+ HWY_ASSERT_EQ(T(0), lanes2[i]);
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllLowerHalf() {
+ ForAllTypes(ForHalfVectors<TestLowerHalf>());
+
+ // The minimum vector size is 128 bits, so there's no guarantee we can have
+ // quarters of 64-bit lanes, hence test 'all' other types.
+ ForHalfVectors<TestLowerQuarter, 2> test_quarter;
+ ForUI8(test_quarter);
+ ForUI16(test_quarter); // exclude float16_t - cannot compare
+ ForUIF32(test_quarter);
+}
+
+struct TestUpperHalf {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ // Scalar does not define UpperHalf.
+#if HWY_TARGET != HWY_SCALAR
+ const Half<D> d2;
+ const size_t N2 = Lanes(d2);
+ HWY_ASSERT(N2 * 2 == Lanes(d));
+ auto expected = AllocateAligned<T>(N2);
+ size_t i = 0;
+ for (; i < N2; ++i) {
+ expected[i] = static_cast<T>(N2 + 1 + i);
+ }
+ HWY_ASSERT_VEC_EQ(d2, expected.get(), UpperHalf(d2, Iota(d, 1)));
+#else
+ (void)d;
+#endif
+ }
+};
+
+HWY_NOINLINE void TestAllUpperHalf() {
+ ForAllTypes(ForHalfVectors<TestUpperHalf>());
+}
+
+struct TestZeroExtendVector {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const Twice<D> d2;
+
+ const auto v = Iota(d, 1);
+ const size_t N = Lanes(d);
+ const size_t N2 = Lanes(d2);
+ // If equal, then N was already MaxLanes(d) and it's not clear what
+ // Combine or ZeroExtendVector should return.
+ if (N2 == N) return;
+ HWY_ASSERT(N2 == 2 * N);
+ auto lanes = AllocateAligned<T>(N2);
+ Store(v, d, &lanes[0]);
+ Store(v, d, &lanes[N]);
+
+ const auto ext = ZeroExtendVector(d2, v);
+ Store(ext, d2, lanes.get());
+
+ // Lower half is unchanged
+ HWY_ASSERT_VEC_EQ(d, v, Load(d, &lanes[0]));
+ // Upper half is zero
+ HWY_ASSERT_VEC_EQ(d, Zero(d), Load(d, &lanes[N]));
+ }
+};
+
+HWY_NOINLINE void TestAllZeroExtendVector() {
+ ForAllTypes(ForExtendableVectors<TestZeroExtendVector>());
+}
+
+struct TestCombine {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const Twice<D> d2;
+ const size_t N2 = Lanes(d2);
+ auto lanes = AllocateAligned<T>(N2);
+
+ const auto lo = Iota(d, 1);
+ const auto hi = Iota(d, static_cast<T>(N2 / 2 + 1));
+ const auto combined = Combine(d2, hi, lo);
+ Store(combined, d2, lanes.get());
+
+ const auto expected = Iota(d2, 1);
+ HWY_ASSERT_VEC_EQ(d2, expected, combined);
+ }
+};
+
+HWY_NOINLINE void TestAllCombine() {
+ ForAllTypes(ForExtendableVectors<TestCombine>());
+}
+
+struct TestConcat {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ if (N == 1) return;
+ const size_t half_bytes = N * sizeof(T) / 2;
+
+ auto hi = AllocateAligned<T>(N);
+ auto lo = AllocateAligned<T>(N);
+ auto expected = AllocateAligned<T>(N);
+ RandomState rng;
+ for (size_t rep = 0; rep < 10; ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ hi[i] = static_cast<T>(Random64(&rng) & 0xFF);
+ lo[i] = static_cast<T>(Random64(&rng) & 0xFF);
+ }
+
+ {
+ memcpy(&expected[N / 2], &hi[N / 2], half_bytes);
+ memcpy(&expected[0], &lo[0], half_bytes);
+ const auto vhi = Load(d, hi.get());
+ const auto vlo = Load(d, lo.get());
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperLower(d, vhi, vlo));
+ }
+
+ {
+ memcpy(&expected[N / 2], &hi[N / 2], half_bytes);
+ memcpy(&expected[0], &lo[N / 2], half_bytes);
+ const auto vhi = Load(d, hi.get());
+ const auto vlo = Load(d, lo.get());
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperUpper(d, vhi, vlo));
+ }
+
+ {
+ memcpy(&expected[N / 2], &hi[0], half_bytes);
+ memcpy(&expected[0], &lo[N / 2], half_bytes);
+ const auto vhi = Load(d, hi.get());
+ const auto vlo = Load(d, lo.get());
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatLowerUpper(d, vhi, vlo));
+ }
+
+ {
+ memcpy(&expected[N / 2], &hi[0], half_bytes);
+ memcpy(&expected[0], &lo[0], half_bytes);
+ const auto vhi = Load(d, hi.get());
+ const auto vlo = Load(d, lo.get());
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatLowerLower(d, vhi, vlo));
+ }
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllConcat() {
+ ForAllTypes(ForShrinkableVectors<TestConcat>());
+}
+
+struct TestConcatOddEven {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+ const size_t N = Lanes(d);
+ const auto hi = Iota(d, static_cast<T>(N));
+ const auto lo = Iota(d, 0);
+ const auto even = Add(Iota(d, 0), Iota(d, 0));
+ const auto odd = Add(even, Set(d, 1));
+ HWY_ASSERT_VEC_EQ(d, odd, ConcatOdd(d, hi, lo));
+ HWY_ASSERT_VEC_EQ(d, even, ConcatEven(d, hi, lo));
+
+ // This test catches inadvertent saturation.
+ const auto min = Set(d, LowestValue<T>());
+ const auto max = Set(d, HighestValue<T>());
+ HWY_ASSERT_VEC_EQ(d, max, ConcatOdd(d, max, max));
+ HWY_ASSERT_VEC_EQ(d, max, ConcatEven(d, max, max));
+ HWY_ASSERT_VEC_EQ(d, min, ConcatOdd(d, min, min));
+ HWY_ASSERT_VEC_EQ(d, min, ConcatEven(d, min, min));
+#else
+ (void)d;
+#endif
+ }
+};
+
+HWY_NOINLINE void TestAllConcatOddEven() {
+ ForAllTypes(ForShrinkableVectors<TestConcatOddEven>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyCombineTest);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllLowerHalf);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllUpperHalf);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllZeroExtendVector);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombine);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllConcat);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllConcatOddEven);
+} // namespace hwy
+
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/tests/compare_test.cc b/third_party/highway/hwy/tests/compare_test.cc
new file mode 100644
index 0000000000..a96e29fc62
--- /dev/null
+++ b/third_party/highway/hwy/tests/compare_test.cc
@@ -0,0 +1,509 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h> // memset
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/compare_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// All types.
+struct TestEquality {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v2 = Iota(d, 2);
+ const auto v2b = Iota(d, 2);
+ const auto v3 = Iota(d, 3);
+
+ const auto mask_false = MaskFalse(d);
+ const auto mask_true = MaskTrue(d);
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq(v2, v3));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq(v3, v2));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2b));
+
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne(v2, v3));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne(v3, v2));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne(v2, v2));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne(v2, v2b));
+ }
+};
+
+HWY_NOINLINE void TestAllEquality() {
+ ForAllTypes(ForPartialVectors<TestEquality>());
+}
+
+// a > b should be true, verify that for Gt/Lt and with swapped args.
+template <class D>
+void EnsureGreater(D d, TFromD<D> a, TFromD<D> b, const char* file, int line) {
+ const auto mask_false = MaskFalse(d);
+ const auto mask_true = MaskTrue(d);
+
+ const auto va = Set(d, a);
+ const auto vb = Set(d, b);
+ AssertMaskEqual(d, mask_true, Gt(va, vb), file, line);
+ AssertMaskEqual(d, mask_false, Lt(va, vb), file, line);
+
+ // Swapped order
+ AssertMaskEqual(d, mask_false, Gt(vb, va), file, line);
+ AssertMaskEqual(d, mask_true, Lt(vb, va), file, line);
+
+ // Also ensure irreflexive
+ AssertMaskEqual(d, mask_false, Gt(va, va), file, line);
+ AssertMaskEqual(d, mask_false, Gt(vb, vb), file, line);
+ AssertMaskEqual(d, mask_false, Lt(va, va), file, line);
+ AssertMaskEqual(d, mask_false, Lt(vb, vb), file, line);
+}
+
+#define HWY_ENSURE_GREATER(d, a, b) EnsureGreater(d, a, b, __FILE__, __LINE__)
+
+struct TestStrictUnsigned {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const T max = LimitsMax<T>();
+ const auto v0 = Zero(d);
+ const auto v2 = And(Iota(d, T(2)), Set(d, 255)); // 0..255
+
+ const auto mask_false = MaskFalse(d);
+
+ // Individual values of interest
+ HWY_ENSURE_GREATER(d, 2, 1);
+ HWY_ENSURE_GREATER(d, 1, 0);
+ HWY_ENSURE_GREATER(d, 128, 127);
+ HWY_ENSURE_GREATER(d, max, max / 2);
+ HWY_ENSURE_GREATER(d, max, 1);
+ HWY_ENSURE_GREATER(d, max, 0);
+
+ // Also use Iota to ensure lanes are independent
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v0));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v2));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
+ }
+};
+
+HWY_NOINLINE void TestAllStrictUnsigned() {
+ ForUnsignedTypes(ForPartialVectors<TestStrictUnsigned>());
+}
+
+struct TestStrictInt {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const T min = LimitsMin<T>();
+ const T max = LimitsMax<T>();
+ const auto v0 = Zero(d);
+ const auto v2 = And(Iota(d, T(2)), Set(d, 127)); // 0..127
+ const auto vn = Sub(Neg(v2), Set(d, 1)); // -1..-128
+
+ const auto mask_false = MaskFalse(d);
+ const auto mask_true = MaskTrue(d);
+
+ // Individual values of interest
+ HWY_ENSURE_GREATER(d, 2, 1);
+ HWY_ENSURE_GREATER(d, 1, 0);
+ HWY_ENSURE_GREATER(d, 0, -1);
+ HWY_ENSURE_GREATER(d, -1, -2);
+ HWY_ENSURE_GREATER(d, max, max / 2);
+ HWY_ENSURE_GREATER(d, max, 1);
+ HWY_ENSURE_GREATER(d, max, 0);
+ HWY_ENSURE_GREATER(d, max, -1);
+ HWY_ENSURE_GREATER(d, max, min);
+ HWY_ENSURE_GREATER(d, 0, min);
+ HWY_ENSURE_GREATER(d, min / 2, min);
+
+ // Also use Iota to ensure lanes are independent
+ HWY_ASSERT_MASK_EQ(d, mask_true, Gt(v2, vn));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt(vn, v2));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, vn));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, v2));
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt(vn, vn));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, vn));
+ }
+};
+
+// Regression test for an SSSE3 bug (#795): operands with the same upper
+// 32 bits but a differing MSB in the lower 32 bits.
+struct TestStrictInt64 {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto m0 = MaskFalse(d);
+ const auto m1 = MaskTrue(d);
+ HWY_ASSERT_MASK_EQ(d, m0, Lt(Set(d, 0x380000000LL), Set(d, 0x300000001LL)));
+ HWY_ASSERT_MASK_EQ(d, m1, Lt(Set(d, 0xF00000000LL), Set(d, 0xF80000000LL)));
+ HWY_ASSERT_MASK_EQ(d, m1, Lt(Set(d, 0xF00000000LL), Set(d, 0xF80000001LL)));
+ }
+};
+
+HWY_NOINLINE void TestAllStrictInt() {
+ ForSignedTypes(ForPartialVectors<TestStrictInt>());
+ ForPartialVectors<TestStrictInt64>()(int64_t());
+}
+
+struct TestStrictFloat {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const T huge_neg = T(-1E35);
+ const T huge_pos = T(1E36);
+ const auto v0 = Zero(d);
+ const auto v2 = Iota(d, T(2));
+ const auto vn = Neg(v2);
+
+ const auto mask_false = MaskFalse(d);
+ const auto mask_true = MaskTrue(d);
+
+ // Individual values of interest
+ HWY_ENSURE_GREATER(d, 2, 1);
+ HWY_ENSURE_GREATER(d, 1, 0);
+ HWY_ENSURE_GREATER(d, 0, -1);
+ HWY_ENSURE_GREATER(d, -1, -2);
+ HWY_ENSURE_GREATER(d, huge_pos, 1);
+ HWY_ENSURE_GREATER(d, huge_pos, 0);
+ HWY_ENSURE_GREATER(d, huge_pos, -1);
+ HWY_ENSURE_GREATER(d, huge_pos, huge_neg);
+ HWY_ENSURE_GREATER(d, 0, huge_neg);
+
+ // Also use Iota to ensure lanes are independent
+ HWY_ASSERT_MASK_EQ(d, mask_true, Gt(v2, vn));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt(vn, v2));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, vn));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, v2));
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt(vn, vn));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, vn));
+ }
+};
+
+HWY_NOINLINE void TestAllStrictFloat() {
+ ForFloatTypes(ForPartialVectors<TestStrictFloat>());
+}
+
+struct TestWeakFloat {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v2 = Iota(d, T(2));
+ const auto vn = Iota(d, -T(Lanes(d)));
+
+ const auto mask_false = MaskFalse(d);
+ const auto mask_true = MaskTrue(d);
+
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ge(v2, v2));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Le(vn, vn));
+
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ge(v2, vn));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Le(vn, v2));
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Le(v2, vn));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ge(vn, v2));
+ }
+};
+
+HWY_NOINLINE void TestAllWeakFloat() {
+ ForFloatTypes(ForPartialVectors<TestWeakFloat>());
+}
+
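+// Returns a vector whose every 128-bit block holds the value (hi, lo): the
+// lower u64 lane is lo and the upper lane is hi, replicated by LoadDup128.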
+template <class D>
+static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
+ alignas(16) uint64_t in[2];
+ in[0] = lo;
+ in[1] = hi;
+ return LoadDup128(d, in);
+}
+
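+// Lt128 compares each 128-bit block as a single unsigned integer (the upper
+// u64 lane is the more significant half); the result mask is identical in
+// both lanes of a block.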
+struct TestLt128 {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using V = Vec<D>;
+ const V v00 = Zero(d);
+ const V v01 = Make128(d, 0, 1);
+ const V v10 = Make128(d, 1, 0);
+ const V v11 = Add(v01, v10);
+
+ const auto mask_false = MaskFalse(d);
+ const auto mask_true = MaskTrue(d);
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v00, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v10));
+
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v11));
+
+ // Reversed order
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v11, v01));
+
+ // Also check 128-bit blocks are independent
+ const V iota = Iota(d, 1);
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v01)));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v10)));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v01), iota));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v10), iota));
+
+ // Max value
+ const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v11));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v10, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v11, vm));
+ }
+};
+
+HWY_NOINLINE void TestAllLt128() { ForGEVectors<128, TestLt128>()(uint64_t()); }
+
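+// Lt128Upper compares only the upper u64 lane of each block, so e.g.
+// v00 < v01 is false below because their upper lanes are equal.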
+struct TestLt128Upper {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using V = Vec<D>;
+ const V v00 = Zero(d);
+ const V v01 = Make128(d, 0, 1);
+ const V v10 = Make128(d, 1, 0);
+ const V v11 = Add(v01, v10);
+
+ const auto mask_false = MaskFalse(d);
+ const auto mask_true = MaskTrue(d);
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v00, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v01, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v10, v10));
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v00, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, v11));
+
+ // Reversed order
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v01, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v10, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v11, v01));
+
+ // Also check 128-bit blocks are independent
+ const V iota = Iota(d, 1);
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, iota, Add(iota, v01)));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, iota, Add(iota, v10)));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, Add(iota, v01), iota));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, Add(iota, v10), iota));
+
+ // Max value
+ const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v11));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v00, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v10, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v11, vm));
+ }
+};
+
+HWY_NOINLINE void TestAllLt128Upper() {
+ ForGEVectors<128, TestLt128Upper>()(uint64_t());
+}
+
+struct TestEq128 { // Also Ne128
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using V = Vec<D>;
+ const V v00 = Zero(d);
+ const V v01 = Make128(d, 0, 1);
+ const V v10 = Make128(d, 1, 0);
+ const V v11 = Add(v01, v10);
+
+ const auto mask_false = MaskFalse(d);
+ const auto mask_true = MaskTrue(d);
+
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v00, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v01, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v10, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v00, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v01, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v10, v10));
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v00, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v11));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v00, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v11));
+
+ // Reversed order
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v10, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v11, v01));
+
+ // Also check 128-bit blocks are independent
+ const V iota = Iota(d, 1);
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, iota, Add(iota, v01)));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, iota, Add(iota, v10)));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v01), iota));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v10), iota));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, iota, Add(iota, v01)));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, iota, Add(iota, v10)));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, Add(iota, v01), iota));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, Add(iota, v10), iota));
+
+ // Max value
+ const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, vm, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, vm, vm));
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v11));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v00, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, vm));
+
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v11));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v00, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v10, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v11, vm));
+ }
+};
+
+HWY_NOINLINE void TestAllEq128() { ForGEVectors<128, TestEq128>()(uint64_t()); }
+
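+// Eq128Upper/Ne128Upper likewise consider only the upper u64 lane of each
+// block, hence v00 == v01 below (both upper lanes are zero).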
+struct TestEq128Upper { // Also Ne128Upper
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using V = Vec<D>;
+ const V v00 = Zero(d);
+ const V v01 = Make128(d, 0, 1);
+ const V v10 = Make128(d, 1, 0);
+ const V v11 = Add(v01, v10);
+
+ const auto mask_false = MaskFalse(d);
+ const auto mask_true = MaskTrue(d);
+
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v10, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v00, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v01, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v10, v10));
+
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v00, v01));
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v11));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, v11));
+
+ // Reversed order
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v01, v00));
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v10, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v11, v01));
+
+ // Also check 128-bit blocks are independent
+ const V iota = Iota(d, 1);
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, iota, Add(iota, v01)));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, iota, Add(iota, v01)));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, iota, Add(iota, v10)));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, iota, Add(iota, v10)));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, Add(iota, v01), iota));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, Add(iota, v01), iota));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, Add(iota, v10), iota));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, Add(iota, v10), iota));
+
+ // Max value
+ const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, vm, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, vm, vm));
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v11));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v00, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, vm));
+
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v11));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v00, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v10, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v11, vm));
+ }
+};
+
+HWY_NOINLINE void TestAllEq128Upper() {
+ ForGEVectors<128, TestEq128Upper>()(uint64_t());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyCompareTest);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEquality);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictUnsigned);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128Upper);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEq128);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEq128Upper);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/compress_test.cc b/third_party/highway/hwy/tests/compress_test.cc
new file mode 100644
index 0000000000..ae008b4dc4
--- /dev/null
+++ b/third_party/highway/hwy/tests/compress_test.cc
@@ -0,0 +1,833 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h> // memset
+
+#include <array> // IWYU pragma: keep
+
+#include "hwy/base.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/compress_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Regenerate tables used in the implementation, instead of testing.
+#define HWY_PRINT_TABLES 0
+
+#if !HWY_PRINT_TABLES || HWY_IDE
+
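+// Verifies the lane count and the first num_to_check output lanes of a
+// Compress* op against the scalar reference; prints diagnostics on mismatch.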
+template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>>
+void CheckStored(D d, DI di, const char* op, size_t expected_pos,
+ size_t actual_pos, size_t num_to_check,
+ const AlignedFreeUniquePtr<T[]>& in,
+ const AlignedFreeUniquePtr<TI[]>& mask_lanes,
+ const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
+ int line) {
+ if (expected_pos != actual_pos) {
+ hwy::Abort(__FILE__, line,
+ "%s: size mismatch for %s: expected %d, actual %d\n", op,
+ TypeName(T(), Lanes(d)).c_str(), static_cast<int>(expected_pos),
+ static_cast<int>(actual_pos));
+ }
+ // Modified from AssertVecEqual - we may not be checking all lanes.
+ for (size_t i = 0; i < num_to_check; ++i) {
+ if (!IsEqual(expected[i], actual_u[i])) {
+ const size_t N = Lanes(d);
+ fprintf(stderr, "%s: mismatch at i=%d of %d, line %d:\n\n", op,
+ static_cast<int>(i), static_cast<int>(num_to_check), line);
+ Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
+ Print(d, "in", Load(d, in.get()), 0, N);
+ Print(d, "expect", Load(d, expected.get()), 0, num_to_check);
+ Print(d, "actual", Load(d, actual_u), 0, num_to_check);
+ HWY_ASSERT(false);
+ }
+ }
+}
+
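+// Checks Compress, CompressNot, CompressStore, CompressBlendedStore,
+// CompressBits and CompressBitsStore against a scalar reference partition of
+// random inputs and masks.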
+struct TestCompress {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ RandomState rng;
+
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ using TU = MakeUnsigned<T>;
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(d);
+
+ for (int frac : {0, 2, 3}) {
+ // Output misalignment, exercised by CompressStore and the other store ops.
+ const size_t misalign = static_cast<size_t>(frac) * N / 4;
+
+ auto in_lanes = AllocateAligned<T>(N);
+ auto mask_lanes = AllocateAligned<TI>(N);
+ auto garbage = AllocateAligned<TU>(N);
+ auto expected = AllocateAligned<T>(N);
+ auto actual_a = AllocateAligned<T>(misalign + N);
+ T* actual_u = actual_a.get() + misalign;
+
+ const size_t bits_size = RoundUpTo((N + 7) / 8, 8);
+ auto bits = AllocateAligned<uint8_t>(bits_size);
+ memset(bits.get(), 0, bits_size); // for MSAN
+
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ size_t expected_pos = 0;
+ for (size_t i = 0; i < N; ++i) {
+ const uint64_t r = Random32(&rng);
+ in_lanes[i] = T(); // cannot initialize float16_t directly.
+ CopyBytes<sizeof(T)>(&r, &in_lanes[i]); // not same size
+ mask_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
+ if (mask_lanes[i] > 0) {
+ expected[expected_pos++] = in_lanes[i];
+ }
+ garbage[i] = static_cast<TU>(Random64(&rng));
+ }
+ size_t num_to_check;
+ if (CompressIsPartition<T>::value) {
+ // For non-native Compress, also check that mask=false lanes were
+ // moved to the back of the vector (highest indices).
+ size_t extra = expected_pos;
+ for (size_t i = 0; i < N; ++i) {
+ if (mask_lanes[i] == 0) {
+ expected[extra++] = in_lanes[i];
+ }
+ }
+ HWY_ASSERT(extra == N);
+ num_to_check = N;
+ } else {
+ // For native Compress, only the mask=true lanes are defined.
+ num_to_check = expected_pos;
+ }
+
+ const auto in = Load(d, in_lanes.get());
+ const auto mask =
+ RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di)));
+ StoreMaskBits(d, mask, bits.get());
+
+ // Compress
+ memset(actual_u, 0, N * sizeof(T));
+ StoreU(Compress(in, mask), d, actual_u);
+ CheckStored(d, di, "Compress", expected_pos, expected_pos, num_to_check,
+ in_lanes, mask_lanes, expected, actual_u, __LINE__);
+
+ // CompressNot
+ memset(actual_u, 0, N * sizeof(T));
+ StoreU(CompressNot(in, Not(mask)), d, actual_u);
+ CheckStored(d, di, "CompressNot", expected_pos, expected_pos,
+ num_to_check, in_lanes, mask_lanes, expected, actual_u,
+ __LINE__);
+
+ // CompressStore
+ memset(actual_u, 0, N * sizeof(T));
+ const size_t size1 = CompressStore(in, mask, d, actual_u);
+ // expected_pos instead of num_to_check because this op is not
+ // affected by CompressIsPartition.
+ CheckStored(d, di, "CompressStore", expected_pos, size1, expected_pos,
+ in_lanes, mask_lanes, expected, actual_u, __LINE__);
+
+ // CompressBlendedStore
+ memcpy(actual_u, garbage.get(), N * sizeof(T));
+ const size_t size2 = CompressBlendedStore(in, mask, d, actual_u);
+ // expected_pos instead of num_to_check because this op only writes
+ // the mask=true lanes.
+ CheckStored(d, di, "CompressBlendedStore", expected_pos, size2,
+ expected_pos, in_lanes, mask_lanes, expected, actual_u,
+ __LINE__);
+ // Subsequent lanes are untouched.
+ for (size_t i = size2; i < N; ++i) {
+#if HWY_COMPILER_MSVC && HWY_TARGET == HWY_AVX2
+ // TODO(eustas): re-enable when compiler is fixed
+#else
+ HWY_ASSERT_EQ(garbage[i], reinterpret_cast<TU*>(actual_u)[i]);
+#endif
+ }
+
+ // CompressBits
+ memset(actual_u, 0, N * sizeof(T));
+ StoreU(CompressBits(in, bits.get()), d, actual_u);
+ CheckStored(d, di, "CompressBits", expected_pos, expected_pos,
+ num_to_check, in_lanes, mask_lanes, expected, actual_u,
+ __LINE__);
+
+ // CompressBitsStore
+ memset(actual_u, 0, N * sizeof(T));
+ const size_t size3 = CompressBitsStore(in, bits.get(), d, actual_u);
+ // expected_pos instead of num_to_check because this op is not
+ // affected by CompressIsPartition.
+ CheckStored(d, di, "CompressBitsStore", expected_pos, size3,
+ expected_pos, in_lanes, mask_lanes, expected, actual_u,
+ __LINE__);
+ } // rep
+ } // frac
+ } // operator()
+};
+
+HWY_NOINLINE void TestAllCompress() {
+ ForAllTypes(ForPartialVectors<TestCompress>());
+}
+
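+// CompressBlocksNot is CompressNot specialized for masks that are constant
+// within each 128-bit block (pair of u64 lanes), hence the paired
+// initialization below.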
+struct TestCompressBlocks {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET == HWY_SCALAR
+ (void)d;
+#else
+ static_assert(sizeof(T) == 8 && !IsSigned<T>(), "Should be u64");
+ RandomState rng;
+
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(d);
+
+ auto in_lanes = AllocateAligned<T>(N);
+ auto mask_lanes = AllocateAligned<TI>(N);
+ auto expected = AllocateAligned<T>(N);
+ auto actual = AllocateAligned<T>(N);
+
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ size_t expected_pos = 0;
+ for (size_t i = 0; i < N; i += 2) {
+ const uint64_t bits = Random32(&rng);
+ in_lanes[i + 1] = in_lanes[i] = T(); // overwritten via CopyBytes below.
+ CopyBytes<sizeof(T)>(&bits, &in_lanes[i]); // not same size
+ CopyBytes<sizeof(T)>(&bits, &in_lanes[i + 1]); // not same size
+ mask_lanes[i + 1] = mask_lanes[i] = TI{(Random32(&rng) & 8) ? 1 : 0};
+ if (mask_lanes[i] > 0) {
+ expected[expected_pos++] = in_lanes[i];
+ expected[expected_pos++] = in_lanes[i + 1];
+ }
+ }
+ size_t num_to_check;
+ if (CompressIsPartition<T>::value) {
+ // For non-native Compress, also check that mask=false lanes were
+ // moved to the back of the vector (highest indices).
+ size_t extra = expected_pos;
+ for (size_t i = 0; i < N; ++i) {
+ if (mask_lanes[i] == 0) {
+ expected[extra++] = in_lanes[i];
+ }
+ }
+ HWY_ASSERT(extra == N);
+ num_to_check = N;
+ } else {
+ // For native Compress, only the mask=true lanes are defined.
+ num_to_check = expected_pos;
+ }
+
+ const auto in = Load(d, in_lanes.get());
+ const auto mask = RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di)));
+
+ // CompressBlocksNot
+ memset(actual.get(), 0, N * sizeof(T));
+ StoreU(CompressBlocksNot(in, Not(mask)), d, actual.get());
+ CheckStored(d, di, "CompressBlocksNot", expected_pos, expected_pos,
+ num_to_check, in_lanes, mask_lanes, expected, actual.get(),
+ __LINE__);
+ } // rep
+#endif // HWY_TARGET == HWY_SCALAR
+ } // operator()
+};
+
+HWY_NOINLINE void TestAllCompressBlocks() {
+ ForGE128Vectors<TestCompressBlocks>()(uint64_t());
+}
+
+#endif // !HWY_PRINT_TABLES
+
+#if HWY_PRINT_TABLES || HWY_IDE
+namespace detail { // for code folding
+
+void PrintCompress8x8Tables() {
+ printf("======================================= 8x8\n");
+ constexpr size_t N = 8;
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<uint8_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", indices[i]);
+ }
+ printf(code & 1 ? "//\n" : "/**/");
+ }
+ printf("\n");
+}
+
+void PrintCompress16x8Tables() {
+ printf("======================================= 16x8\n");
+ constexpr size_t N = 8; // 128-bit SIMD
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<uint8_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Doubled (for converting lane to byte indices)
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", 2 * indices[i]);
+ }
+ printf(code & 1 ? "//\n" : "/**/");
+ }
+ printf("\n");
+}
+
+void PrintCompressNot16x8Tables() {
+ printf("======================================= Not 16x8\n");
+ constexpr size_t N = 8; // 128-bit SIMD
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint8_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Doubled (for converting lane to byte indices)
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", 2 * indices[i]);
+ }
+ printf(not_code & 1 ? "//\n" : "/**/");
+ }
+ printf("\n");
+}
+
+// Compressed to nibbles, unpacked via variable right shift. Also includes
+// FirstN bits in the nibble MSB.
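+// Example: code=0b00000101 -> indices {0,2, 1,3,4,5,6,7}; count=2, so the
+// first two entries get the FirstN bit (|= 8), giving nibbles 8,A,1,3,4,5,6,7
+// and packed = 0x765431A8.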
+void PrintCompress32x8Tables() {
+ printf("======================================= 32/64x8\n");
+ constexpr size_t N = 8; // AVX2 or 64-bit AVX3
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ const size_t count = PopCount(code);
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ if (i < count) {
+ indices[i] |= N;
+ HWY_ASSERT(indices[i] < 0x10);
+ }
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
+
+void PrintCompressNot32x8Tables() {
+ printf("======================================= Not 32/64x8\n");
+ constexpr size_t N = 8; // AVX2 or 64-bit AVX3
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ const size_t count = PopCount(code);
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ if (i < count) {
+ indices[i] |= N;
+ HWY_ASSERT(indices[i] < 0x10);
+ }
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
+
+// Compressed to nibbles (for AVX3 64x4)
+void PrintCompress64x4NibbleTables() {
+ printf("======================================= 64x4Nibble\n");
+ constexpr size_t N = 4; // AVX2
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x4NibbleTables() {
+ printf("======================================= Not 64x4Nibble\n");
+ constexpr size_t N = 4; // AVX2
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x2NibbleTables() {
+ printf("======================================= Not 64x2Nibble\n");
+ constexpr size_t N = 2; // 128-bit
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
+
+void PrintCompress64x4Tables() {
+ printf("======================================= 64x4 uncompressed\n");
+ constexpr size_t N = 4; // SVE_256
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<size_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Store uncompressed indices because SVE TBL returns 0 if an index is out
+ // of bounds. On AVX3 we simply variable-shift because permute indices are
+ // interpreted modulo N. Compression is not worth the extra shift+AND
+ // because the table is anyway only 512 bytes.
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", static_cast<int>(indices[i]));
+ }
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x4Tables() {
+ printf("======================================= Not 64x4 uncompressed\n");
+ constexpr size_t N = 4; // SVE_256
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<size_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Store uncompressed indices because SVE TBL returns 0 if an index is out
+ // of bounds. On AVX3 we simply variable-shift because permute indices are
+ // interpreted modulo N. Compression is not worth the extra shift+AND
+ // because the table is anyway only 512 bytes.
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", static_cast<int>(indices[i]));
+ }
+ }
+ printf("\n");
+}
+
+// Same as above, but prints pairs of u32 indices (for AVX2). Also includes
+// FirstN bits in the nibble MSB.
+void PrintCompress64x4PairTables() {
+ printf("======================================= 64x4 u32 index\n");
+ constexpr size_t N = 4; // AVX2
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ const size_t count = PopCount(code);
+ std::array<size_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Store uncompressed indices because SVE TBL returns 0 if an index is out
+ // of bounds. On AVX3 we simply variable-shift because permute indices are
+ // interpreted modulo N. Compression is not worth the extra shift+AND
+ // because the table is anyway only 512 bytes.
+ for (size_t i = 0; i < N; ++i) {
+ const int first_n_bit = i < count ? 8 : 0;
+ const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
+ HWY_ASSERT(low < 0x10);
+ printf("%d, %d, ", low, low + 1);
+ }
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x4PairTables() {
+ printf("======================================= Not 64x4 u32 index\n");
+ constexpr size_t N = 4; // AVX2
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ const size_t count = PopCount(code);
+ std::array<size_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Store uncompressed indices because SVE TBL returns 0 if an index is out
+ // of bounds. On AVX3 we simply variable-shift because permute indices are
+ // interpreted modulo N. Compression is not worth the extra shift+AND
+ // because the table is anyway only 512 bytes.
+ for (size_t i = 0; i < N; ++i) {
+ const int first_n_bit = i < count ? 8 : 0;
+ const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
+ HWY_ASSERT(low < 0x10);
+ printf("%d, %d, ", low, low + 1);
+ }
+ }
+ printf("\n");
+}
+
+// 4-tuple of byte indices
+void PrintCompress32x4Tables() {
+ printf("======================================= 32x4\n");
+ using T = uint32_t;
+ constexpr size_t N = 4; // SSE4
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+ }
+ }
+ }
+ printf("\n");
+}
+
+void PrintCompressNot32x4Tables() {
+ printf("======================================= Not 32x4\n");
+ using T = uint32_t;
+ constexpr size_t N = 4; // SSE4
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+ }
+ }
+ }
+ printf("\n");
+}
+
+// 8-tuple of byte indices
+void PrintCompress64x2Tables() {
+ printf("======================================= 64x2\n");
+ using T = uint64_t;
+ constexpr size_t N = 2; // SSE4
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+ }
+ }
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x2Tables() {
+ printf("======================================= Not 64x2\n");
+ using T = uint64_t;
+ constexpr size_t N = 2; // SSE4
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+ }
+ }
+ }
+ printf("\n");
+}
+
+} // namespace detail
+
+HWY_NOINLINE void PrintTables() {
+ // Only print once.
+#if HWY_TARGET == HWY_STATIC_TARGET
+ detail::PrintCompress32x8Tables();
+ detail::PrintCompressNot32x8Tables();
+ detail::PrintCompress64x4NibbleTables();
+ detail::PrintCompressNot64x4NibbleTables();
+ detail::PrintCompressNot64x2NibbleTables();
+ detail::PrintCompress64x4Tables();
+ detail::PrintCompressNot64x4Tables();
+ detail::PrintCompress32x4Tables();
+ detail::PrintCompressNot32x4Tables();
+ detail::PrintCompress64x2Tables();
+ detail::PrintCompressNot64x2Tables();
+ detail::PrintCompress64x4PairTables();
+ detail::PrintCompressNot64x4PairTables();
+ detail::PrintCompress16x8Tables();
+ detail::PrintCompress8x8Tables();
+ detail::PrintCompressNot16x8Tables();
+#endif
+}
+
+#endif // HWY_PRINT_TABLES
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyCompressTest);
+#if HWY_PRINT_TABLES
+// Only print instead of running tests; this will be visible in the log.
+HWY_EXPORT_AND_TEST_P(HwyCompressTest, PrintTables);
+#else
+HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompress);
+HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompressBlocks);
+#endif
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/convert_test.cc b/third_party/highway/hwy/tests/convert_test.cc
new file mode 100644
index 0000000000..a7aea5fe9e
--- /dev/null
+++ b/third_party/highway/hwy/tests/convert_test.cc
@@ -0,0 +1,643 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <cmath> // std::isfinite
+
+#include "hwy/base.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/convert_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Cast and ensure bytes are the same. Called directly from TestAllBitCast or
+// via TestBitCastFrom.
+template <typename ToT>
+struct TestBitCast {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const Repartition<ToT, D> dto;
+ const size_t N = Lanes(d);
+ const size_t Nto = Lanes(dto);
+ if (N == 0 || Nto == 0) return;
+ HWY_ASSERT_EQ(N * sizeof(T), Nto * sizeof(ToT));
+ const auto vf = Iota(d, 1);
+ const auto vt = BitCast(dto, vf);
+ // Must return the same bits
+ auto from_lanes = AllocateAligned<T>(Lanes(d));
+ auto to_lanes = AllocateAligned<ToT>(Lanes(dto));
+ Store(vf, d, from_lanes.get());
+ Store(vt, dto, to_lanes.get());
+ HWY_ASSERT(
+ BytesEqual(from_lanes.get(), to_lanes.get(), Lanes(d) * sizeof(T)));
+ }
+};
+
+// From D to all types.
+struct TestBitCastFrom {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T t, D d) {
+ TestBitCast<uint8_t>()(t, d);
+ TestBitCast<uint16_t>()(t, d);
+ TestBitCast<uint32_t>()(t, d);
+#if HWY_HAVE_INTEGER64
+ TestBitCast<uint64_t>()(t, d);
+#endif
+ TestBitCast<int8_t>()(t, d);
+ TestBitCast<int16_t>()(t, d);
+ TestBitCast<int32_t>()(t, d);
+#if HWY_HAVE_INTEGER64
+ TestBitCast<int64_t>()(t, d);
+#endif
+ TestBitCast<float>()(t, d);
+#if HWY_HAVE_FLOAT64
+ TestBitCast<double>()(t, d);
+#endif
+ }
+};
+
+HWY_NOINLINE void TestAllBitCast() {
+ // For HWY_SCALAR and partial vectors, we can only cast to same-sized types:
+ // the former can't partition its single lane, and the latter can be smaller
+ // than a destination type.
+ const ForPartialVectors<TestBitCast<uint8_t>> to_u8;
+ to_u8(uint8_t());
+ to_u8(int8_t());
+
+ const ForPartialVectors<TestBitCast<int8_t>> to_i8;
+ to_i8(uint8_t());
+ to_i8(int8_t());
+
+ const ForPartialVectors<TestBitCast<uint16_t>> to_u16;
+ to_u16(uint16_t());
+ to_u16(int16_t());
+
+ const ForPartialVectors<TestBitCast<int16_t>> to_i16;
+ to_i16(uint16_t());
+ to_i16(int16_t());
+
+ const ForPartialVectors<TestBitCast<uint32_t>> to_u32;
+ to_u32(uint32_t());
+ to_u32(int32_t());
+ to_u32(float());
+
+ const ForPartialVectors<TestBitCast<int32_t>> to_i32;
+ to_i32(uint32_t());
+ to_i32(int32_t());
+ to_i32(float());
+
+#if HWY_HAVE_INTEGER64
+ const ForPartialVectors<TestBitCast<uint64_t>> to_u64;
+ to_u64(uint64_t());
+ to_u64(int64_t());
+#if HWY_HAVE_FLOAT64
+ to_u64(double());
+#endif
+
+ const ForPartialVectors<TestBitCast<int64_t>> to_i64;
+ to_i64(uint64_t());
+ to_i64(int64_t());
+#if HWY_HAVE_FLOAT64
+ to_i64(double());
+#endif
+#endif // HWY_HAVE_INTEGER64
+
+ const ForPartialVectors<TestBitCast<float>> to_float;
+ to_float(uint32_t());
+ to_float(int32_t());
+ to_float(float());
+
+#if HWY_HAVE_FLOAT64
+ const ForPartialVectors<TestBitCast<double>> to_double;
+ to_double(double());
+#if HWY_HAVE_INTEGER64
+ to_double(uint64_t());
+ to_double(int64_t());
+#endif // HWY_HAVE_INTEGER64
+#endif // HWY_HAVE_FLOAT64
+
+#if HWY_TARGET != HWY_SCALAR
+ // For non-scalar vectors, we can cast all types to all.
+ ForAllTypes(ForGEVectors<64, TestBitCastFrom>());
+#endif
+}
+
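+// Checks PromoteTo by widening random bit patterns and comparing against the
+// scalar (implicit) conversion.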
+template <typename ToT>
+struct TestPromoteTo {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
+ static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower");
+ const Rebind<ToT, D> to_d;
+
+ const size_t N = Lanes(from_d);
+ auto from = AllocateAligned<T>(N);
+ auto expected = AllocateAligned<ToT>(N);
+
+ RandomState rng;
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ const uint64_t bits = rng();
+ CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size
+ expected[i] = from[i];
+ }
+
+ HWY_ASSERT_VEC_EQ(to_d, expected.get(),
+ PromoteTo(to_d, Load(from_d, from.get())));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllPromoteTo() {
+ const ForPromoteVectors<TestPromoteTo<uint16_t>, 1> to_u16div2;
+ to_u16div2(uint8_t());
+
+ const ForPromoteVectors<TestPromoteTo<uint32_t>, 2> to_u32div4;
+ to_u32div4(uint8_t());
+
+ const ForPromoteVectors<TestPromoteTo<uint32_t>, 1> to_u32div2;
+ to_u32div2(uint16_t());
+
+ const ForPromoteVectors<TestPromoteTo<int16_t>, 1> to_i16div2;
+ to_i16div2(uint8_t());
+ to_i16div2(int8_t());
+
+ const ForPromoteVectors<TestPromoteTo<int32_t>, 1> to_i32div2;
+ to_i32div2(uint16_t());
+ to_i32div2(int16_t());
+
+ const ForPromoteVectors<TestPromoteTo<int32_t>, 2> to_i32div4;
+ to_i32div4(uint8_t());
+ to_i32div4(int8_t());
+
+ // Must test f16/bf16 separately because we can only load/store/convert them.
+
+#if HWY_HAVE_INTEGER64
+ const ForPromoteVectors<TestPromoteTo<uint64_t>, 1> to_u64div2;
+ to_u64div2(uint32_t());
+
+ const ForPromoteVectors<TestPromoteTo<int64_t>, 1> to_i64div2;
+ to_i64div2(int32_t());
+#endif
+
+#if HWY_HAVE_FLOAT64
+ const ForPromoteVectors<TestPromoteTo<double>, 1> to_f64div2;
+ to_f64div2(int32_t());
+ to_f64div2(float());
+#endif
+}
+
+template <typename T, HWY_IF_FLOAT(T)>
+bool IsFinite(T t) {
+ return std::isfinite(t);
+}
+// Wrapper avoids calling std::isfinite for integer types (ambiguous).
+template <typename T, HWY_IF_NOT_FLOAT(T)>
+bool IsFinite(T /*unused*/) {
+ return true;
+}
+
+template <class D>
+AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
+ const float test_cases[] = {
+ // +/- 1
+ 1.0f, -1.0f,
+ // +/- 0
+ 0.0f, -0.0f,
+ // near 0
+ 0.25f, -0.25f,
+ // +/- integer
+ 4.0f, -32.0f,
+ // positive near limit
+ 65472.0f, 65504.0f,
+ // negative near limit
+ -65472.0f, -65504.0f,
+ // positive +/- delta
+ 2.00390625f, 3.99609375f,
+ // negative +/- delta
+ -2.00390625f, -3.99609375f,
+ // No infinity/NaN - conversion of these is implementation-defined on ARM.
+ };
+ constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
+ const size_t N = Lanes(d);
+ HWY_ASSERT(N != 0);
+ padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors
+ auto in = AllocateAligned<float>(padded);
+ auto expected = AllocateAligned<float>(padded);
+ size_t i = 0;
+ for (; i < kNumTestCases; ++i) {
+ in[i] = test_cases[i];
+ }
+ for (; i < padded; ++i) {
+ in[i] = 0.0f;
+ }
+ return in;
+}
+
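+// Demotes f32 to f16 and promotes back; every test value is exactly
+// representable in f16, so the round trip must be lossless.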
+struct TestF16 {
+ template <typename TF32, class DF32>
+ HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
+#if HWY_HAVE_FLOAT16
+ size_t padded;
+ const size_t N = Lanes(d32); // same count for f16
+ HWY_ASSERT(N != 0);
+ auto in = F16TestCases(d32, padded);
+ using TF16 = float16_t;
+ const Rebind<TF16, DF32> d16;
+ auto temp16 = AllocateAligned<TF16>(N);
+
+ for (size_t i = 0; i < padded; i += N) {
+ const auto loaded = Load(d32, &in[i]);
+ Store(DemoteTo(d16, loaded), d16, temp16.get());
+ HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, Load(d16, temp16.get())));
+ }
+#else
+ (void)d32;
+#endif
+ }
+};
+
+HWY_NOINLINE void TestAllF16() { ForDemoteVectors<TestF16>()(float()); }
+
+template <class D>
+AlignedFreeUniquePtr<float[]> BF16TestCases(D d, size_t& padded) {
+ const float test_cases[] = {
+ // +/- 1
+ 1.0f, -1.0f,
+ // +/- 0
+ 0.0f, -0.0f,
+ // near 0
+ 0.25f, -0.25f,
+ // +/- integer
+ 4.0f, -32.0f,
+ // positive near limit
+ 3.389531389251535E38f, 1.99384199368e+38f,
+ // negative near limit
+ -3.389531389251535E38f, -1.99384199368e+38f,
+ // positive +/- delta
+ 2.015625f, 3.984375f,
+ // negative +/- delta
+ -2.015625f, -3.984375f,
+ };
+ constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
+ const size_t N = Lanes(d);
+ HWY_ASSERT(N != 0);
+ padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors
+ auto in = AllocateAligned<float>(padded);
+ auto expected = AllocateAligned<float>(padded);
+ size_t i = 0;
+ for (; i < kNumTestCases; ++i) {
+ in[i] = test_cases[i];
+ }
+ for (; i < padded; ++i) {
+ in[i] = 0.0f;
+ }
+ return in;
+}
+
+struct TestBF16 {
+ template <typename TF32, class DF32>
+ HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
+#if !defined(HWY_EMULATE_SVE)
+ size_t padded;
+ auto in = BF16TestCases(d32, padded);
+ using TBF16 = bfloat16_t;
+#if HWY_TARGET == HWY_SCALAR
+ const Rebind<TBF16, DF32> dbf16; // avoid 4/2 = 2 lanes
+#else
+ const Repartition<TBF16, DF32> dbf16;
+#endif
+ const Half<decltype(dbf16)> dbf16_half;
+ const size_t N = Lanes(d32);
+ HWY_ASSERT(Lanes(dbf16_half) <= N);
+ auto temp16 = AllocateAligned<TBF16>(N);
+
+ for (size_t i = 0; i < padded; i += N) {
+ const auto loaded = Load(d32, &in[i]);
+ const auto v16 = DemoteTo(dbf16_half, loaded);
+ Store(v16, dbf16_half, temp16.get());
+ const auto v16_loaded = Load(dbf16_half, temp16.get());
+ HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, v16_loaded));
+ }
+#else
+ (void)d32;
+#endif
+ }
+};
+
+HWY_NOINLINE void TestAllBF16() { ForShrinkableVectors<TestBF16>()(float()); }
+
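+// U8FromU32 narrows each u32 lane to u8; inputs are masked with 0xFF so all
+// values are already in range.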
+struct TestConvertU8 {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, const D du32) {
+ const Rebind<uint8_t, D> du8;
+ const auto wrap = Set(du32, 0xFF);
+ HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0), U8FromU32(And(Iota(du32, 0), wrap)));
+ HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0x7F),
+ U8FromU32(And(Iota(du32, 0x7F), wrap)));
+ }
+};
+
+HWY_NOINLINE void TestAllConvertU8() {
+ ForDemoteVectors<TestConvertU8, 2>()(uint32_t());
+}
+
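+// Whether TruncateTo<To> can be exercised here: To must be narrower, and the
+// Pow2 term presumably rules out fractional vectors too small to hold To
+// lanes on scalable targets.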
+template <typename From, typename To, class D>
+constexpr bool IsSupportedTruncation() {
+ return (sizeof(To) < sizeof(From)) &&
+ (Pow2(Rebind<To, D>()) + 3 >= static_cast<int>(CeilLog2(sizeof(To))));
+}
+
+struct TestTruncateTo {
+ template <typename From, typename To, class D,
+ hwy::EnableIf<!IsSupportedTruncation<From, To, D>()>* = nullptr>
+ HWY_NOINLINE void testTo(From, To, const D) {
+ // do nothing
+ }
+
+ template <typename From, typename To, class D,
+ hwy::EnableIf<IsSupportedTruncation<From, To, D>()>* = nullptr>
+ HWY_NOINLINE void testTo(From, To, const D d) {
+ constexpr uint32_t base = 0xFA578D00;
+ const Rebind<To, D> dTo;
+ const auto src = Iota(d, static_cast<From>(base));
+ const auto expected = Iota(dTo, static_cast<To>(base));
+ const VFromD<decltype(dTo)> actual = TruncateTo(dTo, src);
+ HWY_ASSERT_VEC_EQ(dTo, expected, actual);
+ }
+
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T from, const D d) {
+ testTo<T, uint8_t, D>(from, uint8_t(), d);
+ testTo<T, uint16_t, D>(from, uint16_t(), d);
+ testTo<T, uint32_t, D>(from, uint32_t(), d);
+ }
+};
+
+HWY_NOINLINE void TestAllTruncate() {
+ ForUnsignedTypes(ForPartialVectors<TestTruncateTo>());
+}
+
+// Kept separate from TestIntFromFloat to work around a suspected ARM compiler
+// bug: when merged, outputs erroneously match a previous Iota(-(N+1)) input.
+struct TestIntFromFloatHuge {
+ template <typename TF, class DF>
+ HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
+ // The ARMv7 manual says that float->int saturates, i.e. chooses the
+ // nearest representable value. This works correctly on armhf with GCC, but
+ // not with clang. For reasons unknown, MSVC also runs into an out-of-memory
+ // error here.
+#if HWY_COMPILER_CLANG || HWY_COMPILER_MSVC
+ (void)df;
+#else
+ using TI = MakeSigned<TF>;
+ const Rebind<TI, DF> di;
+
+ // Workaround for incorrect 32-bit GCC codegen for SSSE3 - Print-ing
+ // the expected lvalue also seems to prevent the issue.
+ const size_t N = Lanes(df);
+ auto expected = AllocateAligned<TI>(N);
+
+ // Huge positive
+ Store(Set(di, LimitsMax<TI>()), di, expected.get());
+ HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(1E20))));
+
+ // Huge negative
+ Store(Set(di, LimitsMin<TI>()), di, expected.get());
+ HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(-1E20))));
+#endif
+ }
+};
+
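+// ConvertTo(int, float) truncates toward zero and saturates out-of-range
+// finite inputs to the integer limits; checked with integer values,
+// near-integer offsets, powers of two and random bits.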
+class TestIntFromFloat {
+ template <typename TF, class DF>
+ static HWY_NOINLINE void TestPowers(TF /*unused*/, const DF df) {
+ using TI = MakeSigned<TF>;
+ const Rebind<TI, DF> di;
+ constexpr size_t kBits = sizeof(TF) * 8;
+
+ // Powers of two, plus offsets to set some mantissa bits.
+ const int64_t ofs_table[3] = {0LL, 3LL << (kBits / 2), 1LL << (kBits - 15)};
+ for (int sign = 0; sign < 2; ++sign) {
+ for (size_t shift = 0; shift < kBits - 1; ++shift) {
+ for (int64_t ofs : ofs_table) {
+ const int64_t mag = (int64_t{1} << shift) + ofs;
+ const int64_t val = sign ? mag : -mag;
+ HWY_ASSERT_VEC_EQ(di, Set(di, static_cast<TI>(val)),
+ ConvertTo(di, Set(df, static_cast<TF>(val))));
+ }
+ }
+ }
+ }
+
+ template <typename TF, class DF>
+ static HWY_NOINLINE void TestRandom(TF /*unused*/, const DF df) {
+ using TI = MakeSigned<TF>;
+ const Rebind<TI, DF> di;
+ const size_t N = Lanes(df);
+
+ // TF does not have enough precision to represent TI.
+ const double min = static_cast<double>(LimitsMin<TI>());
+ const double max = static_cast<double>(LimitsMax<TI>());
+
+ // Also check random values.
+ auto from = AllocateAligned<TF>(N);
+ auto expected = AllocateAligned<TI>(N);
+ RandomState rng;
+ for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ do {
+ const uint64_t bits = rng();
+ CopyBytes<sizeof(TF)>(&bits, &from[i]); // not same size
+ } while (!std::isfinite(from[i]));
+ if (from[i] >= max) {
+ expected[i] = LimitsMax<TI>();
+ } else if (from[i] <= min) {
+ expected[i] = LimitsMin<TI>();
+ } else {
+ expected[i] = static_cast<TI>(from[i]);
+ }
+ }
+
+ HWY_ASSERT_VEC_EQ(di, expected.get(),
+ ConvertTo(di, Load(df, from.get())));
+ }
+ }
+
+ public:
+ template <typename TF, class DF>
+ HWY_NOINLINE void operator()(TF tf, const DF df) {
+ using TI = MakeSigned<TF>;
+ const Rebind<TI, DF> di;
+ const size_t N = Lanes(df);
+
+ // Integer positive
+ HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), ConvertTo(di, Iota(df, TF(4.0))));
+
+ // Integer negative
+ HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), ConvertTo(di, Iota(df, -TF(N))));
+
+ // Above positive
+ HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), ConvertTo(di, Iota(df, TF(2.001))));
+
+ // Below positive
+ HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), ConvertTo(di, Iota(df, TF(3.9999))));
+
+ const TF eps = static_cast<TF>(0.0001);
+ // Above negative
+ HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
+ ConvertTo(di, Iota(df, -TF(N + 1) + eps)));
+
+ // Below negative
+ HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
+ ConvertTo(di, Iota(df, -TF(N + 1) - eps)));
+
+ TestPowers(tf, df);
+ TestRandom(tf, df);
+ }
+};
+
+HWY_NOINLINE void TestAllIntFromFloat() {
+ ForFloatTypes(ForPartialVectors<TestIntFromFloatHuge>());
+ ForFloatTypes(ForPartialVectors<TestIntFromFloat>());
+}
+
+struct TestFloatFromInt {
+ template <typename TF, class DF>
+ HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
+ using TI = MakeSigned<TF>;
+ const RebindToSigned<DF> di;
+ const size_t N = Lanes(df);
+
+ // Integer positive
+ HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), ConvertTo(df, Iota(di, TI(4))));
+
+ // Integer negative
+ HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), ConvertTo(df, Iota(di, -TI(N))));
+
+ // Max positive
+ HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TI>())),
+ ConvertTo(df, Set(di, LimitsMax<TI>())));
+
+ // Min negative
+ HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
+ ConvertTo(df, Set(di, LimitsMin<TI>())));
+ }
+};
+
+HWY_NOINLINE void TestAllFloatFromInt() {
+ ForFloatTypes(ForPartialVectors<TestFloatFromInt>());
+}
+
+struct TestFloatFromUint {
+ template <typename TF, class DF>
+ HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
+ using TU = MakeUnsigned<TF>;
+ const RebindToUnsigned<DF> du;
+
+ // Integer positive
+ HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), ConvertTo(df, Iota(du, TU(4))));
+ HWY_ASSERT_VEC_EQ(df, Iota(df, TF(65535.0)),
+ ConvertTo(df, Iota(du, 65535))); // 2^16-1
+ if (sizeof(TF) > 4) {
+ HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4294967295.0)),
+ ConvertTo(df, Iota(du, 4294967295ULL))); // 2^32-1
+ }
+
+ // Max positive
+ HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TU>())),
+ ConvertTo(df, Set(du, LimitsMax<TU>())));
+
+ // Zero
+ HWY_ASSERT_VEC_EQ(df, Zero(df), ConvertTo(df, Zero(du)));
+ }
+};
+
+HWY_NOINLINE void TestAllFloatFromUint() {
+ ForFloatTypes(ForPartialVectors<TestFloatFromUint>());
+}
+
+struct TestI32F64 {
+ template <typename TF, class DF>
+ HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
+ using TI = int32_t;
+ const Rebind<TI, DF> di;
+ const size_t N = Lanes(df);
+
+ // Integer positive
+ HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));
+
+ // Integer negative
+ HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), PromoteTo(df, Iota(di, -TI(N))));
+
+ // Above positive
+ HWY_ASSERT_VEC_EQ(df, Iota(df, TF(2.0)), PromoteTo(df, Iota(di, TI(2))));
+
+ // Below positive
+ HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));
+
+ // Above negative
+ HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-4.0)), PromoteTo(df, Iota(di, TI(-4))));
+
+ // Below negative
+ HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-2.0)), PromoteTo(df, Iota(di, TI(-2))));
+
+ // Max positive int
+ HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TI>())),
+ PromoteTo(df, Set(di, LimitsMax<TI>())));
+
+ // Min negative int
+ HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
+ PromoteTo(df, Set(di, LimitsMin<TI>())));
+ }
+};
+
+HWY_NOINLINE void TestAllI32F64() {
+#if HWY_HAVE_FLOAT64
+ ForDemoteVectors<TestI32F64>()(double());
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyConvertTest);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBF16);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllTruncate);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromUint);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/crypto_test.cc b/third_party/highway/hwy/tests/crypto_test.cc
new file mode 100644
index 0000000000..b7dfb198a3
--- /dev/null
+++ b/third_party/highway/hwy/tests/crypto_test.cc
@@ -0,0 +1,553 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h> // memcpy
+
+#include "hwy/aligned_allocator.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/crypto_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+#define HWY_PRINT_CLMUL_GOLDEN 0
+
+#if HWY_TARGET != HWY_SCALAR
+
+class TestAES {
+ template <typename T, class D>
+ HWY_NOINLINE void TestSBox(T /*unused*/, D d) {
+ // The generic implementation of the S-box is difficult to verify by
+ // inspection, so we add a white-box test that verifies it using enumeration
+ // (outputs for 0..255 vs. https://en.wikipedia.org/wiki/Rijndael_S-box).
+ const uint8_t sbox[256] = {
+ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b,
+ 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
+ 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26,
+ 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+ 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
+ 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
+ 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed,
+ 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+ 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f,
+ 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
+ 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec,
+ 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+ 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14,
+ 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
+ 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
+ 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+ 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f,
+ 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
+ 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11,
+ 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f,
+ 0xb0, 0x54, 0xbb, 0x16};
+
+ // Ensure it's safe to load an entire vector by padding.
+ const size_t N = Lanes(d);
+ const size_t padded = RoundUpTo(256, N);
+ auto expected = AllocateAligned<T>(padded);
+ // Must wrap around to match the input (Iota).
+ for (size_t pos = 0; pos < padded;) {
+ const size_t remaining = HWY_MIN(padded - pos, size_t(256));
+ memcpy(expected.get() + pos, sbox, remaining);
+ pos += remaining;
+ }
+
+ for (size_t i = 0; i < 256; i += N) {
+ const auto in = Iota(d, static_cast<T>(i));
+ HWY_ASSERT_VEC_EQ(d, expected.get() + i, detail::SubBytes(in));
+ }
+ }
+
+ public:
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T t, D d) {
+ // Test vector (after first KeyAddition) from
+ // https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/AES_Core128.pdf
+ alignas(16) constexpr uint8_t test_lanes[16] = {
+ 0x40, 0xBF, 0xAB, 0xF4, 0x06, 0xEE, 0x4D, 0x30,
+ 0x42, 0xCA, 0x6B, 0x99, 0x7A, 0x5C, 0x58, 0x16};
+ const auto test = LoadDup128(d, test_lanes);
+
+ // = ShiftRow result
+ alignas(16) constexpr uint8_t expected_sr_lanes[16] = {
+ 0x09, 0x28, 0x7F, 0x47, 0x6F, 0x74, 0x6A, 0xBF,
+ 0x2C, 0x4A, 0x62, 0x04, 0xDA, 0x08, 0xE3, 0xEE};
+ const auto expected_sr = LoadDup128(d, expected_sr_lanes);
+
+ // = MixColumn result
+ alignas(16) constexpr uint8_t expected_mc_lanes[16] = {
+ 0x52, 0x9F, 0x16, 0xC2, 0x97, 0x86, 0x15, 0xCA,
+ 0xE0, 0x1A, 0xAE, 0x54, 0xBA, 0x1A, 0x26, 0x59};
+ const auto expected_mc = LoadDup128(d, expected_mc_lanes);
+
+ // = KeyAddition result
+ alignas(16) constexpr uint8_t expected_lanes[16] = {
+ 0xF2, 0x65, 0xE8, 0xD5, 0x1F, 0xD2, 0x39, 0x7B,
+ 0xC3, 0xB9, 0x97, 0x6D, 0x90, 0x76, 0x50, 0x5C};
+ const auto expected = LoadDup128(d, expected_lanes);
+
+ alignas(16) uint8_t key_lanes[16];
+ for (size_t i = 0; i < 16; ++i) {
+ key_lanes[i] = expected_mc_lanes[i] ^ expected_lanes[i];
+ }
+ const auto round_key = LoadDup128(d, key_lanes);
+
+ HWY_ASSERT_VEC_EQ(d, expected_mc, AESRound(test, Zero(d)));
+ HWY_ASSERT_VEC_EQ(d, expected, AESRound(test, round_key));
+ HWY_ASSERT_VEC_EQ(d, expected_sr, AESLastRound(test, Zero(d)));
+ HWY_ASSERT_VEC_EQ(d, Xor(expected_sr, round_key),
+ AESLastRound(test, round_key));
+
+ TestSBox(t, d);
+ }
+};
+HWY_NOINLINE void TestAllAES() { ForGEVectors<128, TestAES>()(uint8_t()); }
+
+#else
+HWY_NOINLINE void TestAllAES() {}
+#endif // HWY_TARGET != HWY_SCALAR
+
+struct TestCLMul {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Requires 64-bit lanes and a 128-bit result.
+#if HWY_TARGET != HWY_SCALAR && HWY_HAVE_INTEGER64
+ const size_t N = Lanes(d);
+ if (N == 1) return;
+
+ auto in1 = AllocateAligned<T>(N);
+ auto in2 = AllocateAligned<T>(N);
+
+ constexpr size_t kCLMulNum = 512;
+ // Depends on rng!
+ static constexpr uint64_t kCLMulLower[kCLMulNum] = {
+ 0x24511d4ce34d6350ULL, 0x4ca582edde1236bbULL, 0x537e58f72dac25a8ULL,
+ 0x4e942d5e130b9225ULL, 0x75a906c519257a68ULL, 0x1df9f85126d96c5eULL,
+ 0x464e7c13f4ad286aULL, 0x138535ee35dabc40ULL, 0xb2f7477b892664ecULL,
+ 0x01557b077167c25dULL, 0xf32682490ee49624ULL, 0x0025bac603b9e140ULL,
+ 0xcaa86aca3e3daf40ULL, 0x1fbcfe4af73eb6c4ULL, 0x8ee8064dd0aae5dcULL,
+ 0x1248cb547858c213ULL, 0x37a55ee5b10fb34cULL, 0x6eb5c97b958f86e2ULL,
+ 0x4b1ab3eb655ea7cdULL, 0x1d66645a85627520ULL, 0xf8728e96daa36748ULL,
+ 0x38621043e6ff5e3bULL, 0xd1d28b5da5ffefb4ULL, 0x0a5cd65931546df7ULL,
+ 0x2a0639be3d844150ULL, 0x0e2d0f18c8d6f045ULL, 0xfacc770b963326c1ULL,
+ 0x19611b31ca2ef141ULL, 0xabea29510dd87518ULL, 0x18a7dc4b205f2768ULL,
+ 0x9d3975ea5612dc86ULL, 0x06319c139e374773ULL, 0x6641710400b4c390ULL,
+ 0x356c29b6001c3670ULL, 0xe9e04d851e040a00ULL, 0x21febe561222d79aULL,
+ 0xc071eaae6e148090ULL, 0x0eed351a0af94f5bULL, 0x04324eedb3c03688ULL,
+ 0x39e89b136e0d6ccdULL, 0x07d0fd2777a31600ULL, 0x44b8573827209822ULL,
+ 0x6d690229ea177d78ULL, 0x1b9749d960ba9f18ULL, 0x190945271c0fbb94ULL,
+ 0x189aea0e07d2c88eULL, 0xf18eab6b65a6beb2ULL, 0x57744b21c13d0d84ULL,
+ 0xf63050a613e95c2eULL, 0x12cd20d25f97102fULL, 0x5a5df0678dbcba60ULL,
+ 0x0b08fb80948bfafcULL, 0x44cf1cbe7c6fc3c8ULL, 0x166a470ef25da288ULL,
+ 0x2c498a609204e48cULL, 0x261b0a22585697ecULL, 0x737750574af7dde4ULL,
+ 0x4079959c60b01e0cULL, 0x06ed8aac13f782d6ULL, 0x019d454ba9b5ef20ULL,
+ 0xea1edbf96d49e858ULL, 0x17c2f3ebde9ac469ULL, 0x5cf72706e3d6f5e4ULL,
+ 0x16e856aa3c841516ULL, 0x256f7e3cef83368eULL, 0x47e17c8eb2774e77ULL,
+ 0x9b48ac150a804821ULL, 0x584523f61ccfdf22ULL, 0xedcb6a2a75d9e7f2ULL,
+ 0x1fe3d1838e537aa7ULL, 0x778872e9f64549caULL, 0x2f1cea6f0d3faf92ULL,
+ 0x0e8c4b6a9343f326ULL, 0x01902d1ba3048954ULL, 0xc5c1fd5269e91dc0ULL,
+ 0x0ef8a4707817eb9cULL, 0x1f696f09a5354ca4ULL, 0x369cd9de808b818cULL,
+ 0xf6917d1dd43fd784ULL, 0x7f4b76bf40dc166fULL, 0x4ce67698724ace12ULL,
+ 0x02c3bf60e6e9cd92ULL, 0xb8229e45b21458e8ULL, 0x415efd41e91adf49ULL,
+ 0x5edfcd516bb921cdULL, 0x5ff2c29429fd187eULL, 0x0af666b17103b3e0ULL,
+ 0x1f5e4ff8f54c9a5bULL, 0x429253d8a5544ba6ULL, 0x19de2fdf9f4d9dcaULL,
+ 0x29bf3d37ddc19a40ULL, 0x04d4513a879552baULL, 0x5cc7476cf71ee155ULL,
+ 0x40011f8c238784a5ULL, 0x1a3ae50b0fd2ee2bULL, 0x7db22f432ba462baULL,
+ 0x417290b0bee2284aULL, 0x055a6bd5bb853db2ULL, 0xaa667daeed8c2a34ULL,
+ 0x0d6b316bda7f3577ULL, 0x72d35598468e3d5dULL, 0x375b594804bfd33aULL,
+ 0x16ed3a319b540ae8ULL, 0x093bace4b4695afdULL, 0xc7118754ec2737ceULL,
+ 0x0fff361f0505c81aULL, 0x996e9e7291321af0ULL, 0x496b1d9b0b89ba8cULL,
+ 0x65a98b2e9181da9cULL, 0x70759c8dd45575dfULL, 0x3446fe727f5e2cbbULL,
+ 0x1121ae609d195e74ULL, 0x5ff5d68ce8a21018ULL, 0x0e27eca3825b60d6ULL,
+ 0x82f628bceca3d1daULL, 0x2756a0914e344047ULL, 0xa460406c1c708d50ULL,
+ 0x63ce32a0c083e491ULL, 0xc883e5a685c480e0ULL, 0x602c951891e600f9ULL,
+ 0x02ecb2e3911ca5f8ULL, 0x0d8675f4bb70781aULL, 0x43545cc3c78ea496ULL,
+ 0x04164b01d6b011c2ULL, 0x3acbb323dcab2c9bULL, 0x31c5ba4e22793082ULL,
+ 0x5a6484af5f7c2d10ULL, 0x1a929b16194e8078ULL, 0x7a6a75d03b313924ULL,
+ 0x0553c73a35b1d525ULL, 0xf18628c51142be34ULL, 0x1b51cf80d7efd8f5ULL,
+ 0x52e0ca4df63ee258ULL, 0x0e977099160650c9ULL, 0x6be1524e92024f70ULL,
+ 0x0ee2152625438b9dULL, 0xfa32af436f6d8eb4ULL, 0x5ecf49c2154287e5ULL,
+ 0x6b72f4ae3590569dULL, 0x086c5ee6e87bfb68ULL, 0x737a4f0dc04b6187ULL,
+ 0x08c3439280edea41ULL, 0x9547944f01636c5cULL, 0x6acfbfc2571cd71fULL,
+ 0x85d7842972449637ULL, 0x252ea5e5a7fad86aULL, 0x4e41468f99ba1632ULL,
+ 0x095e0c3ae63b25a2ULL, 0xb005ce88fd1c9425ULL, 0x748e668abbe09f03ULL,
+ 0xb2cfdf466b187d18ULL, 0x60b11e633d8fe845ULL, 0x07144c4d246db604ULL,
+ 0x139bcaac55e96125ULL, 0x118679b5a6176327ULL, 0x1cebe90fa4d9f83fULL,
+ 0x22244f52f0d312acULL, 0x669d4e17c9bfb713ULL, 0x96390e0b834bb0d0ULL,
+ 0x01f7f0e82ba08071ULL, 0x2dffeee31ca6d284ULL, 0x1f4738745ef039feULL,
+ 0x4ce0dd2b603b6420ULL, 0x0035fc905910a4d5ULL, 0x07df2b533df6fb04ULL,
+ 0x1cee2735c9b910ddULL, 0x2bc4af565f7809eaULL, 0x2f876c1f5cb1076cULL,
+ 0x33e079524099d056ULL, 0x169e0405d2f9efbaULL, 0x018643ab548a358cULL,
+ 0x1bb6fc4331cffe92ULL, 0x05111d3a04e92faaULL, 0x23c27ecf0d638b73ULL,
+ 0x1b79071dc1685d68ULL, 0x0662d20aba8e1e0cULL, 0xe7f6440277144c6fULL,
+ 0x4ca38b64c22196c0ULL, 0x43c05f6d1936fbeeULL, 0x0654199d4d1faf0fULL,
+ 0xf2014054e71c2d04ULL, 0x0a103e47e96b4c84ULL, 0x7986e691dd35b040ULL,
+ 0x4e1ebb53c306a341ULL, 0x2775bb3d75d65ba6ULL, 0x0562ab0adeff0f15ULL,
+ 0x3c2746ad5eba3eacULL, 0x1facdb5765680c60ULL, 0xb802a60027d81d00ULL,
+ 0x1191d0f6366ae3a9ULL, 0x81a97b5ae0ea5d14ULL, 0x06bee05b6178a770ULL,
+ 0xc7baeb2fe1d6aeb3ULL, 0x594cb5b867d04fdfULL, 0xf515a80138a4e350ULL,
+ 0x646417ad8073cf38ULL, 0x4a229a43373fb8d4ULL, 0x10fa6eafff1ca453ULL,
+ 0x9f060700895cc731ULL, 0x00521133d11d11f4ULL, 0xb940a2bb912a7a5cULL,
+ 0x3fab180670ad2a3cULL, 0x45a5f0e5b6fdb95dULL, 0x27c1baad6f946b15ULL,
+ 0x336c6bdbe527cf58ULL, 0x3b83aa602a5baea3ULL, 0xdf749153f9bcc376ULL,
+ 0x1a05513a6c0b4a90ULL, 0xb81e0b570a075c47ULL, 0x471fabb40bdc27ceULL,
+ 0x9dec9472f6853f60ULL, 0x361f71b88114193bULL, 0x3b550a8c4feeff00ULL,
+ 0x0f6cde5a68bc9bc0ULL, 0x3f50121a925703e0ULL, 0x6967ff66d6d343a9ULL,
+ 0xff6b5bd2ce7bc3ccULL, 0x05474cea08bf6cd8ULL, 0xf76eabbfaf108eb0ULL,
+ 0x067529be4fc6d981ULL, 0x4d766b137cf8a988ULL, 0x2f09c7395c5cfbbdULL,
+ 0x388793712da06228ULL, 0x02c9ff342c8f339aULL, 0x152c734139a860a3ULL,
+ 0x35776eb2b270c04dULL, 0x0f8d8b41f11c4608ULL, 0x0c2071665be6b288ULL,
+ 0xc034e212b3f71d88ULL, 0x071d961ef3276f99ULL, 0xf98598ee75b60773ULL,
+ 0x062062c58c6724e4ULL, 0xd156438e2125572cULL, 0x38552d59a7f0f7c8ULL,
+ 0x1a402178206e413cULL, 0x1f1f996c68293b26ULL, 0x8bce3cafe1730f7eULL,
+ 0x2d0480a0828f6bf5ULL, 0x6c99cffa171f92f6ULL, 0x0087f842bb0ac681ULL,
+ 0x11d7ed06e1e7fd3eULL, 0x07cb1186f2385dc6ULL, 0x5d7763ebff1e170fULL,
+ 0x2dacc870231ac292ULL, 0x8486317a9ffb390cULL, 0x1c3a6dd20c959ac6ULL,
+ 0x90dc96e3992e06b8ULL, 0x70d60bfa33e72b67ULL, 0x70c9bddd0985ee63ULL,
+ 0x012c9767b3673093ULL, 0xfcd3bc5580f6a88aULL, 0x0ac80017ef6308c3ULL,
+ 0xdb67d709ef4bba09ULL, 0x4c63e324f0e247ccULL, 0xa15481d3fe219d60ULL,
+ 0x094c4279cdccb501ULL, 0x965a28c72575cb82ULL, 0x022869db25e391ebULL,
+ 0x37f528c146023910ULL, 0x0c1290636917deceULL, 0x9aee25e96251ca9cULL,
+ 0x728ac5ba853b69c2ULL, 0x9f272c93c4be20c8ULL, 0x06c1aa6319d28124ULL,
+ 0x4324496b1ca8a4f7ULL, 0x0096ecfe7dfc0189ULL, 0x9e06131b19ae0020ULL,
+ 0x15278b15902f4597ULL, 0x2a9fece8c13842d8ULL, 0x1d4e6781f0e1355eULL,
+ 0x6855b712d3dbf7c0ULL, 0x06a07fad99be6f46ULL, 0x3ed9d7957e4d1d7cULL,
+ 0x0c326f7cbc248bb2ULL, 0xe6363ad2c537cf51ULL, 0x0e12eb1c40723f13ULL,
+ 0xf5c6ac850afba803ULL, 0x0322a79d615fa9f0ULL, 0x6116696ed97bd5f8ULL,
+ 0x0d438080fbbdc9f1ULL, 0x2e4dc42c38f1e243ULL, 0x64948e9104f3a5bfULL,
+ 0x9fd622371bdb5f00ULL, 0x0f12bf082b2a1b6eULL, 0x4b1f8d867d78031cULL,
+ 0x134392ea9f5ef832ULL, 0xf3d70472321bc23eULL, 0x05fcbe5e9eea268eULL,
+ 0x136dede7175a22cfULL, 0x1308f8baac2cbcccULL, 0xd691026f0915eb64ULL,
+ 0x0e49a668345c3a38ULL, 0x24ddbbe8bc96f331ULL, 0x4d2ec9479b640578ULL,
+ 0x450f0697327b359cULL, 0x32b45360f4488ee0ULL, 0x4f6d9ecec46a105aULL,
+ 0x5500c63401ae8e80ULL, 0x47dea495cf6f98baULL, 0x13dc9a2dfca80babULL,
+ 0xe6f8a93f7b24ca92ULL, 0x073f57a6d900a87fULL, 0x9ddb935fd3aa695aULL,
+ 0x101e98d24b39e8aaULL, 0x6b8d0eb95a507ddcULL, 0x45a908b3903d209bULL,
+ 0x6c96a3e119e617d4ULL, 0x2442787543d3be48ULL, 0xd3bc055c7544b364ULL,
+ 0x7693bb042ca8653eULL, 0xb95e3a4ea5d0101eULL, 0x116f0d459bb94a73ULL,
+ 0x841244b72cdc5e90ULL, 0x1271acced6cb34d3ULL, 0x07d289106524d638ULL,
+ 0x537c9cf49c01b5bbULL, 0x8a8e16706bb7a5daULL, 0x12e50a9c499dc3a9ULL,
+ 0x1cade520db2ba830ULL, 0x1add52f000d7db70ULL, 0x12cf15db2ce78e30ULL,
+ 0x0657eaf606bfc866ULL, 0x4026816d3b05b1d0ULL, 0x1ba0ebdf90128e4aULL,
+ 0xdfd649375996dd6eULL, 0x0f416e906c23d9aeULL, 0x384273cad0582a24ULL,
+ 0x2ff27b0378a46189ULL, 0xc4ecd18a2d7a7616ULL, 0x35cef0b5cd51d640ULL,
+ 0x7d582363643f48b7ULL, 0x0984ad746ad0ab7cULL, 0x2990a999835f9688ULL,
+ 0x2d4df66a97b19e05ULL, 0x592c79720af99aa2ULL, 0x052863c230602cd3ULL,
+ 0x5f5e2b15edcf2840ULL, 0x01dff1b694b978b0ULL, 0x14345a48b622025eULL,
+ 0x028fab3b6407f715ULL, 0x3455d188e6feca50ULL, 0x1d0d40288fb1b5fdULL,
+ 0x4685c5c2b6a1e5aeULL, 0x3a2077b1e5fe5adeULL, 0x1bc55d611445a0d8ULL,
+ 0x05480ae95f3f83feULL, 0xbbb59cfcf7e17fb6ULL, 0x13f7f10970bbb990ULL,
+ 0x6d00ac169425a352ULL, 0x7da0db397ef2d5d3ULL, 0x5b512a247f8d2479ULL,
+ 0x637eaa6a977c3c32ULL, 0x3720f0ae37cba89cULL, 0x443df6e6aa7f525bULL,
+ 0x28664c287dcef321ULL, 0x03c267c00cf35e49ULL, 0x690185572d4021deULL,
+ 0x2707ff2596e321c2ULL, 0xd865f5af7722c380ULL, 0x1ea285658e33aafbULL,
+ 0xc257c5e88755bef4ULL, 0x066f67275cfcc31eULL, 0xb09931945cc0fed0ULL,
+ 0x58c1dc38d6e3a03fULL, 0xf99489678fc94ee8ULL, 0x75045bb99be5758aULL,
+ 0x6c163bc34b40feefULL, 0x0420063ce7bdd3b4ULL, 0xf86ef10582bf2e28ULL,
+ 0x162c3449ca14858cULL, 0x94106aa61dfe3280ULL, 0x4073ae7a4e7e4941ULL,
+ 0x32b13fd179c250b4ULL, 0x0178fbb216a7e744ULL, 0xf840ae2f1cf92669ULL,
+ 0x18fc709acc80243dULL, 0x20ac2ebd69f4d558ULL, 0x6e580ad9c73ad46aULL,
+ 0x76d2b535b541c19dULL, 0x6c7a3fb9dd0ce0afULL, 0xc3481689b9754f28ULL,
+ 0x156e813b6557abdbULL, 0x6ee372e31276eb10ULL, 0x19cf37c038c8d381ULL,
+ 0x00d4d906c9ae3072ULL, 0x09f03cbb6dfbfd40ULL, 0x461ba31c4125f3cfULL,
+ 0x25b29fc63ad9f05bULL, 0x6808c95c2dddede9ULL, 0x0564224337066d9bULL,
+ 0xc87eb5f4a4d966f2ULL, 0x66fc66e1701f5847ULL, 0xc553a3559f74da28ULL,
+ 0x1dfd841be574df43ULL, 0x3ee2f100c3ebc082ULL, 0x1a2c4f9517b56e89ULL,
+ 0x502f65c4b535c8ffULL, 0x1da5663ab6f96ec0ULL, 0xba1f80b73988152cULL,
+ 0x364ff12182ac8dc1ULL, 0xe3457a3c4871db31ULL, 0x6ae9cadf92fd7e84ULL,
+ 0x9621ba3d6ca15186ULL, 0x00ff5af878c144ceULL, 0x918464dc130101a4ULL,
+ 0x036511e6b187efa6ULL, 0x06667d66550ff260ULL, 0x7fd18913f9b51bc1ULL,
+ 0x3740e6b27af77aa8ULL, 0x1f546c2fd358ff8aULL, 0x42f1424e3115c891ULL,
+ 0x03767db4e3a1bb33ULL, 0xa171a1c564345060ULL, 0x0afcf632fd7b1324ULL,
+ 0xb59508d933ffb7d0ULL, 0x57d766c42071be83ULL, 0x659f0447546114a2ULL,
+ 0x4070364481c460aeULL, 0xa2b9752280644d52ULL, 0x04ab884bea5771bdULL,
+ 0x87cd135602a232b4ULL, 0x15e54cd9a8155313ULL, 0x1e8005efaa3e1047ULL,
+ 0x696b93f4ab15d39fULL, 0x0855a8e540de863aULL, 0x0bb11799e79f9426ULL,
+ 0xeffa61e5c1b579baULL, 0x1e060a1d11808219ULL, 0x10e219205667c599ULL,
+ 0x2f7b206091c49498ULL, 0xb48854c820064860ULL, 0x21c4aaa3bfbe4a38ULL,
+ 0x8f4a032a3fa67e9cULL, 0x3146b3823401e2acULL, 0x3afee26f19d88400ULL,
+ 0x167087c485791d38ULL, 0xb67a1ed945b0fb4bULL, 0x02436eb17e27f1c0ULL,
+ 0xe05afce2ce2d2790ULL, 0x49c536fc6224cfebULL, 0x178865b3b862b856ULL,
+ 0x1ce530de26acde5bULL, 0x87312c0b30a06f38ULL, 0x03e653b578558d76ULL,
+ 0x4d3663c21d8b3accULL, 0x038003c23626914aULL, 0xd9d5a2c052a09451ULL,
+ 0x39b5acfe08a49384ULL, 0x40f349956d5800e4ULL, 0x0968b6950b1bd8feULL,
+ 0xd60b2ca030f3779cULL, 0x7c8bc11a23ce18edULL, 0xcc23374e27630bc2ULL,
+ 0x2e38fc2a8bb33210ULL, 0xe421357814ee5c44ULL, 0x315fb65ea71ec671ULL,
+ 0xfb1b0223f70ed290ULL, 0x30556c9f983eaf07ULL, 0x8dd438c3d0cd625aULL,
+ 0x05a8fd0c7ffde71bULL, 0x764d1313b5aeec7aULL, 0x2036af5de9622f47ULL,
+ 0x508a5bfadda292feULL, 0x3f77f04ba2830e90ULL, 0x9047cd9c66ca66d2ULL,
+ 0x1168b5318a54eb21ULL, 0xc93462d221da2e15ULL, 0x4c2c7cc54abc066eULL,
+ 0x767a56fec478240eULL, 0x095de72546595bd3ULL, 0xc9da535865158558ULL,
+ 0x1baccf36f33e73fbULL, 0xf3d7dbe64df77f18ULL, 0x1f8ebbb7be4850b8ULL,
+ 0x043c5ed77bce25a1ULL, 0x07d401041b2a178aULL, 0x9181ebb8bd8d5618ULL,
+ 0x078b935dc3e4034aULL, 0x7b59c08954214300ULL, 0x03570dc2a4f84421ULL,
+ 0xdd8715b82f6b4078ULL, 0x2bb49c8bb544163bULL, 0xc9eb125564d59686ULL,
+ 0x5fdc7a38f80b810aULL, 0x3a4a6d8fff686544ULL, 0x28360e2418627d3aULL,
+ 0x60874244c95ed992ULL, 0x2115cc1dd9c34ed3ULL, 0xfaa3ef61f55e9efcULL,
+ 0x27ac9b1ef1adc7e6ULL, 0x95ea00478fec3f54ULL, 0x5aea808b2d99ab43ULL,
+ 0xc8f79e51fe43a580ULL, 0x5dbccd714236ce25ULL, 0x783fa76ed0753458ULL,
+ 0x48cb290f19d84655ULL, 0xc86a832f7696099aULL, 0x52f30c6fec0e71d3ULL,
+ 0x77d4e91e8cdeb886ULL, 0x7169a703c6a79ccdULL, 0x98208145b9596f74ULL,
+ 0x0945695c761c0796ULL, 0x0be897830d17bae0ULL, 0x033ad3924caeeeb4ULL,
+ 0xedecb6cfa2d303a8ULL, 0x3f86b074818642e7ULL, 0xeefa7c878a8b03f4ULL,
+ 0x093c101b80922551ULL, 0xfb3b4e6c26ac0034ULL, 0x162bf87999b94f5eULL,
+ 0xeaedae76e975b17cULL, 0x1852aa090effe18eULL};
+
+ static constexpr uint64_t kCLMulUpper[kCLMulNum] = {
+ 0xbb41199b1d587c69ULL, 0x514d94d55894ee29ULL, 0xebc6cd4d2efd5d16ULL,
+ 0x042044ad2de477fdULL, 0xb865c8b0fcdf4b15ULL, 0x0724d7e551cc40f3ULL,
+ 0xb15a16f39edb0bccULL, 0x37d64419ede7a171ULL, 0x2aa01bb80c753401ULL,
+ 0x06ff3f8a95fdaf4dULL, 0x79898cc0838546deULL, 0x776acbd1b237c60aULL,
+ 0x4c1753be4f4e0064ULL, 0x0ba9243601206ed3ULL, 0xd567c3b1bf3ec557ULL,
+ 0x043fac7bcff61fb3ULL, 0x49356232b159fb2fULL, 0x3910c82038102d4dULL,
+ 0x30592fef753eb300ULL, 0x7b2660e0c92a9e9aULL, 0x8246c9248d671ef0ULL,
+ 0x5a0dcd95147af5faULL, 0x43fde953909cc0eaULL, 0x06147b972cb96e1bULL,
+ 0xd84193a6b2411d80ULL, 0x00cd7711b950196fULL, 0x1088f9f4ade7fa64ULL,
+ 0x05a13096ec113cfbULL, 0x958d816d53b00edcULL, 0x3846154a7cdba9cbULL,
+ 0x8af516db6b27d1e6ULL, 0x1a1d462ab8a33b13ULL, 0x4040b0ac1b2c754cULL,
+ 0x05127fe9af2fe1d6ULL, 0x9f96e79374321fa6ULL, 0x06ff64a4d9c326f3ULL,
+ 0x28709566e158ac15ULL, 0x301701d7111ca51cULL, 0x31e0445d1b9d9544ULL,
+ 0x0a95aff69bf1d03eULL, 0x7c298c8414ecb879ULL, 0x00801499b4143195ULL,
+ 0x91521a00dd676a5cULL, 0x2777526a14c2f723ULL, 0xfa26aac6a6357dddULL,
+ 0x1d265889b0187a4bULL, 0xcd6e70fa8ed283e4ULL, 0x18a815aa50ea92caULL,
+ 0xc01e082694a263c6ULL, 0x4b40163ba53daf25ULL, 0xbc658caff6501673ULL,
+ 0x3ba35359586b9652ULL, 0x74f96acc97a4936cULL, 0x3989dfdb0cf1d2cfULL,
+ 0x358a01eaa50dda32ULL, 0x01109a5ed8f0802bULL, 0x55b84922e63c2958ULL,
+ 0x55b14843d87551d5ULL, 0x1db8ec61b1b578d8ULL, 0x79a2d49ef8c3658fULL,
+ 0xa304516816b3fbe0ULL, 0x163ecc09cc7b82f9ULL, 0xab91e8d22aabef00ULL,
+ 0x0ed6b09262de8354ULL, 0xcfd47d34cf73f6f2ULL, 0x7dbd1db2390bc6c3ULL,
+ 0x5ae789d3875e7b00ULL, 0x1d60fd0e70fe8fa4ULL, 0x690bc15d5ae4f6f5ULL,
+ 0x121ef5565104fb44ULL, 0x6e98e89297353b54ULL, 0x42554949249d62edULL,
+ 0xd6d6d16b12df78d2ULL, 0x320b33549b74975dULL, 0xd2a0618763d22e00ULL,
+ 0x0808deb93cba2017ULL, 0x01bd3b2302a2cc70ULL, 0x0b7b8dd4d71c8dd6ULL,
+ 0x34d60a3382a0756cULL, 0x40984584c8219629ULL, 0xf1152cba10093a66ULL,
+ 0x068001c6b2159ccbULL, 0x3d70f13c6cda0800ULL, 0x0e6b6746a322b956ULL,
+ 0x83a494319d8c770bULL, 0x0faecf64a8553e9aULL, 0xa34919222c39b1bcULL,
+ 0x0c63850d89e71c6fULL, 0x585f0bee92e53dc8ULL, 0x10f222b13b4fa5deULL,
+ 0x61573114f94252f2ULL, 0x09d59c311fba6c27ULL, 0x014effa7da49ed4eULL,
+ 0x4a400a1bc1c31d26ULL, 0xc9091c047b484972ULL, 0x3989f341ec2230ccULL,
+ 0xdcb03a98b3aee41eULL, 0x4a54a676a33a95e1ULL, 0xe499b7753951ef7cULL,
+ 0x2f43b1d1061d8b48ULL, 0xc3313bdc68ceb146ULL, 0x5159f6bc0e99227fULL,
+ 0x98128e6d9c05efcaULL, 0x15ea32b27f77815bULL, 0xe882c054e2654eecULL,
+ 0x003d2cdb8faee8c6ULL, 0xb416dd333a9fe1dfULL, 0x73f6746aefcfc98bULL,
+ 0x93dc114c10a38d70ULL, 0x05055941657845eaULL, 0x2ed7351347349334ULL,
+ 0x26fb1ee2c69ae690ULL, 0xa4575d10dc5b28e0ULL, 0x3395b11295e485ebULL,
+ 0xe840f198a224551cULL, 0x78e6e5a431d941d4ULL, 0xa1fee3ceab27f391ULL,
+ 0x07d35b3c5698d0dcULL, 0x983c67fca9174a29ULL, 0x2bb6bbae72b5144aULL,
+ 0xa7730b8d13ce58efULL, 0x51b5272883de1998ULL, 0xb334e128bb55e260ULL,
+ 0x1cacf5fbbe1b9974ULL, 0x71a9df4bb743de60ULL, 0x5176fe545c2d0d7aULL,
+ 0xbe592ecf1a16d672ULL, 0x27aa8a30c3efe460ULL, 0x4c78a32f47991e06ULL,
+ 0x383459294312f26aULL, 0x97ba789127f1490cULL, 0x51c9aa8a3abd1ef1ULL,
+ 0xcc7355188121e50fULL, 0x0ecb3a178ae334c1ULL, 0x84879a5e574b7160ULL,
+ 0x0765298f6389e8f3ULL, 0x5c6750435539bb22ULL, 0x11a05cf056c937b5ULL,
+ 0xb5dc2172dbfb7662ULL, 0x3ffc17915d9f40e8ULL, 0xbc7904daf3b431b0ULL,
+ 0x71f2088490930a7cULL, 0xa89505fd9efb53c4ULL, 0x02e194afd61c5671ULL,
+ 0x99a97f4abf35fcecULL, 0x26830aad30fae96fULL, 0x4b2abc16b25cf0b0ULL,
+ 0x07ec6fffa1cafbdbULL, 0xf38188fde97a280cULL, 0x121335701afff64dULL,
+ 0xea5ef38b4e672a64ULL, 0x477edbcae3eabf03ULL, 0xa32813cc0e0d244dULL,
+ 0x13346d2af4972eefULL, 0xcbc18357af1cfa9aULL, 0x561b630316e73fa6ULL,
+ 0xe9dfb53249249305ULL, 0x5d2b9dd1479312eeULL, 0x3458008119b56d04ULL,
+ 0x50e6790b49801385ULL, 0x5bb9febe2349492bULL, 0x0c2813954299098fULL,
+ 0xf747b0c890a071d5ULL, 0x417e8f82cc028d77ULL, 0xa134fee611d804f8ULL,
+ 0x24c99ee9a0408761ULL, 0x3ebb224e727137f3ULL, 0x0686022073ceb846ULL,
+ 0xa05e901fb82ad7daULL, 0x0ece7dc43ab470fcULL, 0x2d334ecc58f7d6a3ULL,
+ 0x23166fadacc54e40ULL, 0x9c3a4472f839556eULL, 0x071717ab5267a4adULL,
+ 0xb6600ac351ba3ea0ULL, 0x30ec748313bb63d4ULL, 0xb5374e39287b23ccULL,
+ 0x074d75e784238aebULL, 0x77315879243914a4ULL, 0x3bbb1971490865f1ULL,
+ 0xa355c21f4fbe02d3ULL, 0x0027f4bb38c8f402ULL, 0xeef8708e652bc5f0ULL,
+ 0x7b9aa56cf9440050ULL, 0x113ac03c16cfc924ULL, 0x395db36d3e4bef9fULL,
+ 0x5d826fabcaa597aeULL, 0x2a77d3c58786d7e0ULL, 0x85996859a3ba19d4ULL,
+ 0x01e7e3c904c2d97fULL, 0x34f90b9b98d51fd0ULL, 0x243aa97fd2e99bb7ULL,
+ 0x40a0cebc4f65c1e8ULL, 0x46d3922ed4a5503eULL, 0x446e7ecaf1f9c0a4ULL,
+ 0x49dc11558bc2e6aeULL, 0xe7a9f20881793af8ULL, 0x5771cc4bc98103f1ULL,
+ 0x2446ea6e718fce90ULL, 0x25d14aca7f7da198ULL, 0x4347af186f9af964ULL,
+ 0x10cb44fc9146363aULL, 0x8a35587afce476b4ULL, 0x575144662fee3d3aULL,
+ 0x69f41177a6bc7a05ULL, 0x02ff8c38d6b3c898ULL, 0x57c73589a226ca40ULL,
+ 0x732f6b5baae66683ULL, 0x00c008bbedd4bb34ULL, 0x7412ff09524d6cadULL,
+ 0xb8fd0b5ad8c145a8ULL, 0x74bd9f94b6cdc7dfULL, 0x68233b317ca6c19cULL,
+ 0x314b9c2c08b15c54ULL, 0x5bd1ad72072ebd08ULL, 0x6610e6a6c07030e4ULL,
+ 0xa4fc38e885ead7ceULL, 0x36975d1ca439e034ULL, 0xa358f0fe358ffb1aULL,
+ 0x38e247ad663acf7dULL, 0x77daed3643b5deb8ULL, 0x5507c2aeae1ec3d0ULL,
+ 0xfdec226c73acf775ULL, 0x1b87ff5f5033492dULL, 0xa832dee545d9033fULL,
+ 0x1cee43a61e41783bULL, 0xdff82b2e2d822f69ULL, 0x2bbc9a376cb38cf2ULL,
+ 0x117b1cdaf765dc02ULL, 0x26a407f5682be270ULL, 0x8eb664cf5634af28ULL,
+ 0x17cb4513bec68551ULL, 0xb0df6527900cbfd0ULL, 0x335a2dc79c5afdfcULL,
+ 0xa2f0ca4cd38dca88ULL, 0x1c370713b81a2de1ULL, 0x849d5df654d1adfcULL,
+ 0x2fd1f7675ae14e44ULL, 0x4ff64dfc02247f7bULL, 0x3a2bcf40e395a48dULL,
+ 0x436248c821b187c1ULL, 0x29f4337b1c7104c0ULL, 0xfc317c46e6630ec4ULL,
+ 0x2774bccc4e3264c7ULL, 0x2d03218d9d5bee23ULL, 0x36a0ed04d659058aULL,
+ 0x452484461573cab6ULL, 0x0708edf87ed6272bULL, 0xf07960a1587446cbULL,
+ 0x3660167b067d84e0ULL, 0x65990a6993ddf8c4ULL, 0x0b197cd3d0b40b3fULL,
+ 0x1dcec4ab619f3a05ULL, 0x722ab223a84f9182ULL, 0x0822d61a81e7c38fULL,
+ 0x3d22ad75da563201ULL, 0x93cef6979fd35e0fULL, 0x05c3c25ae598b14cULL,
+ 0x1338df97dd496377ULL, 0x15bc324dc9c20acfULL, 0x96397c6127e6e8cfULL,
+ 0x004d01069ef2050fULL, 0x2fcf2e27893fdcbcULL, 0x072f77c3e44f4a5cULL,
+ 0x5eb1d80b3fe44918ULL, 0x1f59e7c28cc21f22ULL, 0x3390ce5df055c1f8ULL,
+ 0x4c0ef11df92cb6bfULL, 0x50f82f9e0848c900ULL, 0x08d0fde3ffc0ae38ULL,
+ 0xbd8d0089a3fbfb73ULL, 0x118ba5b0f311ef59ULL, 0x9be9a8407b926a61ULL,
+ 0x4ea04fbb21318f63ULL, 0xa1c8e7bb07b871ffULL, 0x1253a7262d5d3b02ULL,
+ 0x13e997a0512e5b29ULL, 0x54318460ce9055baULL, 0x4e1d8a4db0054798ULL,
+ 0x0b235226e2cade32ULL, 0x2588732c1476b315ULL, 0x16a378750ba8ac68ULL,
+ 0xba0b116c04448731ULL, 0x4dd02bd47694c2f1ULL, 0x16d6797b218b6b25ULL,
+ 0x769eb3709cfbf936ULL, 0x197746a0ce396f38ULL, 0x7d17ad8465961d6eULL,
+ 0xfe58f4998ae19bb4ULL, 0x36df24305233ce69ULL, 0xb88a4eb008f4ee72ULL,
+ 0x302b2eb923334787ULL, 0x15a4e3edbe13d448ULL, 0x39a4bf64dd7730ceULL,
+ 0xedf25421b31090c4ULL, 0x4d547fc131be3b69ULL, 0x2b316e120ca3b90eULL,
+ 0x0faf2357bf18a169ULL, 0x71f34b54ee2c1d62ULL, 0x18eaf6e5c93a3824ULL,
+ 0x7e168ba03c1b4c18ULL, 0x1a534dd586d9e871ULL, 0xa2cccd307f5f8c38ULL,
+ 0x2999a6fb4dce30f6ULL, 0x8f6d3b02c1d549a6ULL, 0x5cf7f90d817aac5aULL,
+ 0xd2a4ceefe66c8170ULL, 0x11560edc4ca959feULL, 0x89e517e6f0dc464dULL,
+ 0x75bb8972dddd2085ULL, 0x13859ed1e459d65aULL, 0x057114653326fa84ULL,
+ 0xe2e6f465173cc86cULL, 0x0ada4076497d7de4ULL, 0xa856fa10ec6dbf8aULL,
+ 0x41505d9a7c25d875ULL, 0x3091b6278382eccdULL, 0x055737185b2c3f13ULL,
+ 0x2f4df8ecd6f9c632ULL, 0x0633e89c33552d98ULL, 0xf7673724d16db440ULL,
+ 0x7331bd08e636c391ULL, 0x0252f29672fee426ULL, 0x1fc384946b6b9ddeULL,
+ 0x03460c12c901443aULL, 0x003a0792e10abcdaULL, 0x8dbec31f624e37d0ULL,
+ 0x667420d5bfe4dcbeULL, 0xfbfa30e874ed7641ULL, 0x46d1ae14db7ecef6ULL,
+ 0x216bd7e8f5448768ULL, 0x32bcd40d3d69cc88ULL, 0x2e991dbc39b65abeULL,
+ 0x0e8fb123a502f553ULL, 0x3d2d486b2c7560c0ULL, 0x09aba1db3079fe03ULL,
+ 0xcb540c59398c9bceULL, 0x363970e5339ed600ULL, 0x2caee457c28af00eULL,
+ 0x005e7d7ee47f41a0ULL, 0x69fad3eb10f44100ULL, 0x048109388c75beb3ULL,
+ 0x253dddf96c7a6fb8ULL, 0x4c47f705b9d47d09ULL, 0x6cec894228b5e978ULL,
+ 0x04044bb9f8ff45c2ULL, 0x079e75704d775caeULL, 0x073bd54d2a9e2c33ULL,
+ 0xcec7289270a364fbULL, 0x19e7486f19cd9e4eULL, 0xb50ac15b86b76608ULL,
+ 0x0620cf81f165c812ULL, 0x63eaaf13be7b11d4ULL, 0x0e0cf831948248c2ULL,
+ 0xf0412df8f46e7957ULL, 0x671c1fe752517e3fULL, 0x8841bfb04dd3f540ULL,
+ 0x122de4142249f353ULL, 0x40a4959fb0e76870ULL, 0x25cfd3d4b4bbc459ULL,
+ 0x78a07c82930c60d0ULL, 0x12c2de24d4cbc969ULL, 0x85d44866096ad7f4ULL,
+ 0x1fd917ca66b2007bULL, 0x01fbbb0751764764ULL, 0x3d2a4953c6fe0fdcULL,
+ 0xcc1489c5737afd94ULL, 0x1817c5b6a5346f41ULL, 0xe605a6a7e9985644ULL,
+ 0x3c50412328ff1946ULL, 0xd8c7fd65817f1291ULL, 0x0bd66975ab66339bULL,
+ 0x2baf8fa1c7d10fa9ULL, 0x24abdf06ddef848dULL, 0x14df0c9b2ea4f6c2ULL,
+ 0x2be950edfd2cb1f7ULL, 0x21911e21094178b6ULL, 0x0fa54d518a93b379ULL,
+ 0xb52508e0ac01ab42ULL, 0x0e035b5fd8cb79beULL, 0x1c1c6d1a3b3c8648ULL,
+ 0x286037b42ea9871cULL, 0xfe67bf311e48a340ULL, 0x02324131e932a472ULL,
+ 0x2486dc2dd919e2deULL, 0x008aec7f1da1d2ebULL, 0x63269ba0e8d3eb3aULL,
+ 0x23c0f11154adb62fULL, 0xc6052393ecd4c018ULL, 0x523585b7d2f5b9fcULL,
+ 0xf7e6f8c1e87564c9ULL, 0x09eb9fe5dd32c1a3ULL, 0x4d4f86886e055472ULL,
+ 0x67ea17b58a37966bULL, 0x3d3ce8c23b1ed1a8ULL, 0x0df97c5ac48857ceULL,
+ 0x9b6992623759eb12ULL, 0x275aa9551ae091f2ULL, 0x08855e19ac5e62e5ULL,
+ 0x1155fffe0ae083ccULL, 0xbc9c78db7c570240ULL, 0x074560c447dd2418ULL,
+ 0x3bf78d330bcf1e70ULL, 0x49867cd4b7ed134bULL, 0x8e6eee0cb4470accULL,
+ 0x1dabafdf59233dd6ULL, 0xea3a50d844fc3fb8ULL, 0x4f03f4454764cb87ULL,
+ 0x1f2f41cc36c9e6ecULL, 0x53cba4df42963441ULL, 0x10883b70a88d91fbULL,
+ 0x62b1fc77d4eb9481ULL, 0x893d8f2604b362e1ULL, 0x0933b7855368b440ULL,
+ 0x9351b545703b2fceULL, 0x59c1d489b9bdd3b4ULL, 0xe72a9c4311417b18ULL,
+ 0x5355df77e88eb226ULL, 0xe802c37aa963d7e1ULL, 0x381c3747bd6c3bc3ULL,
+ 0x378565573444258cULL, 0x37848b1e52b43c18ULL, 0x5da2cd32bdce12b6ULL,
+ 0x13166c5da615f6fdULL, 0xa51ef95efcc66ac8ULL, 0x640c95e473f1e541ULL,
+ 0x6ec68def1f217500ULL, 0x49ce3543c76a4079ULL, 0x5fc6fd3cddc706b5ULL,
+ 0x05c3c0f0f6a1fb0dULL, 0xe7820c0996ad1bddULL, 0x21f0d752a088f35cULL,
+ 0x755405b51d6fc4a0ULL, 0x7ec7649ca4b0e351ULL, 0x3d2b6a46a251f790ULL,
+ 0x23e1176b19f418adULL, 0x06056575efe8ac05ULL, 0x0f75981b6966e477ULL,
+ 0x06e87ec41ad437e4ULL, 0x43f6c255d5e1cb84ULL, 0xe4e67d1120ceb580ULL,
+ 0x2cd67b9e12c26d7bULL, 0xcd00b5ff7fd187f1ULL, 0x3f6cd40accdc4106ULL,
+ 0x3e895c835459b330ULL, 0x0814d53a217c0850ULL, 0xc9111fe78bc3a62dULL,
+ 0x719967e351473204ULL, 0xe757707d24282aa4ULL, 0x7226b7f5607f98e6ULL,
+ 0x7b268ffae3c08d96ULL, 0x16d3917c8b86020eULL, 0x5128bca51c49ea64ULL,
+ 0x345ffea02bb1698dULL, 0x9460f5111fe4fbc8ULL, 0x60dd1aa5762852cbULL,
+ 0xbb7440ed3c81667cULL, 0x0a4b12affa7f6f5cULL, 0x95cbcb0ae03861b6ULL,
+ 0x07ab3b0591db6070ULL, 0xc6476a4c3de78982ULL, 0x204e82e8623ad725ULL,
+ 0x569a5b4e8ac2a5ccULL, 0x425a1d77d72ebae2ULL, 0xcdaad5551ab33830ULL,
+ 0x0b7c68fd8422939eULL, 0x46d9a01f53ec3020ULL, 0x102871edbb29e852ULL,
+ 0x7a8e8084039075a5ULL, 0x40eaede8615e376aULL, 0x4dc67d757a1c751fULL,
+ 0x1176ef33063f9145ULL, 0x4ea230285b1c8156ULL, 0x6b2aa46ce0027392ULL,
+ 0x32b13230fba1b068ULL, 0x0e69796851bb984fULL, 0xb749f4542db698c0ULL,
+ 0x19ad0241ffffd49cULL, 0x2f41e92ef6caff52ULL, 0x4d0b068576747439ULL,
+ 0x14d607aef7463e00ULL, 0x1443d00d85fb440eULL, 0x529b43bf68688780ULL,
+ 0x21133a6bc3a3e378ULL, 0x865b6436dae0e7e5ULL, 0x6b4fe83dc1d6defcULL,
+ 0x03a5858a0ca0be46ULL, 0x1e841b187e67f312ULL, 0x61ee22ef40a66940ULL,
+ 0x0494bd2e9e741ef8ULL, 0x4eb59e323010e72cULL, 0x19f2abcfb749810eULL,
+ 0xb30f1e4f994ef9bcULL, 0x53cf6cdd51bd2d96ULL, 0x263943036497a514ULL,
+ 0x0d4b52170aa2edbaULL, 0x0c4758a1c7b4f758ULL, 0x178dadb1b502b51aULL,
+ 0x1ddbb20a602eb57aULL, 0x1fc2e2564a9f27fdULL, 0xd5f8c50a0e3d6f90ULL,
+ 0x0081da3bbe72ac09ULL, 0xcf140d002ccdb200ULL, 0x0ae8389f09b017feULL,
+ 0x17cc9ffdc03f4440ULL, 0x04eb921d704bcdddULL, 0x139a0ce4cdc521abULL,
+ 0x0bfce00c145cb0f0ULL, 0x99925ff132eff707ULL, 0x063f6e5da50c3d35ULL,
+ 0xa0c25dea3f0e6e29ULL, 0x0c7a9048cc8e040fULL,
+ };
+
+ const size_t padded = RoundUpTo(kCLMulNum, N);
+ auto expected_lower = AllocateAligned<T>(padded);
+ auto expected_upper = AllocateAligned<T>(padded);
+ CopyBytes<kCLMulNum * sizeof(T)>(kCLMulLower, expected_lower.get());
+ CopyBytes<kCLMulNum * sizeof(T)>(kCLMulUpper, expected_upper.get());
+ const size_t padding_size = (padded - kCLMulNum) * sizeof(T);
+ memset(expected_lower.get() + kCLMulNum, 0, padding_size);
+ memset(expected_upper.get() + kCLMulNum, 0, padding_size);
+
+ // Random inputs in each lane
+ RandomState rng;
+ for (size_t rep = 0; rep < kCLMulNum / N; ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ in1[i] = Random64(&rng);
+ in2[i] = Random64(&rng);
+ }
+
+ const auto a = Load(d, in1.get());
+ const auto b = Load(d, in2.get());
+#if HWY_PRINT_CLMUL_GOLDEN
+ Store(CLMulLower(a, b), d, expected_lower.get() + rep * N);
+ Store(CLMulUpper(a, b), d, expected_upper.get() + rep * N);
+#else
+ HWY_ASSERT_VEC_EQ(d, expected_lower.get() + rep * N, CLMulLower(a, b));
+ HWY_ASSERT_VEC_EQ(d, expected_upper.get() + rep * N, CLMulUpper(a, b));
+#endif
+ }
+
+#if HWY_PRINT_CLMUL_GOLDEN
+ // RVV lacks PRIu64, so print 32-bit halves.
+ for (size_t i = 0; i < kCLMulNum; ++i) {
+ printf("0x%08x%08xULL,", static_cast<uint32_t>(expected_lower[i] >> 32),
+ static_cast<uint32_t>(expected_lower[i] & 0xFFFFFFFFU));
+ }
+ printf("\n");
+ for (size_t i = 0; i < kCLMulNum; ++i) {
+ printf("0x%08x%08xULL,", static_cast<uint32_t>(expected_upper[i] >> 32),
+ static_cast<uint32_t>(expected_upper[i] & 0xFFFFFFFFU));
+ }
+#endif // HWY_PRINT_CLMUL_GOLDEN
+#else
+ (void)d;
+#endif
+ }
+};
+
+HWY_NOINLINE void TestAllCLMul() { ForGEVectors<128, TestCLMul>()(uint64_t()); }
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyCryptoTest);
+HWY_EXPORT_AND_TEST_P(HwyCryptoTest, TestAllAES);
+HWY_EXPORT_AND_TEST_P(HwyCryptoTest, TestAllCLMul);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/demote_test.cc b/third_party/highway/hwy/tests/demote_test.cc
new file mode 100644
index 0000000000..22469113d5
--- /dev/null
+++ b/third_party/highway/hwy/tests/demote_test.cc
@@ -0,0 +1,328 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <cmath> // std::isfinite
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/demote_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+// Building this file under MSAN causes a timeout, so it is skipped there.
+#if !HWY_IS_MSAN
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <typename T, HWY_IF_FLOAT(T)>
+bool IsFiniteT(T t) {
+ return std::isfinite(t);
+}
+// Wrapper avoids calling std::isfinite for integer types (ambiguous).
+template <typename T, HWY_IF_NOT_FLOAT(T)>
+bool IsFiniteT(T /*unused*/) {
+ return true;
+}
+
+template <typename ToT>
+struct TestDemoteTo {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
+ static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output");
+ static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
+ const Rebind<ToT, D> to_d;
+
+ const size_t N = Lanes(from_d);
+ auto from = AllocateAligned<T>(N);
+ auto expected = AllocateAligned<ToT>(N);
+
+ // Narrower range in the wider type, for clamping before we cast
+ const T min = LimitsMin<ToT>();
+ const T max = LimitsMax<ToT>();
+
+ const auto value_ok = [&](T& value) {
+ if (!IsFiniteT(value)) return false;
+ return true;
+ };
+
+ RandomState rng;
+ for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ do {
+ const uint64_t bits = rng();
+ CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size
+ } while (!value_ok(from[i]));
+ expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max));
+ }
+
+ const auto in = Load(from_d, from.get());
+ HWY_ASSERT_VEC_EQ(to_d, expected.get(), DemoteTo(to_d, in));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllDemoteToInt() {
+ ForDemoteVectors<TestDemoteTo<uint8_t>>()(int16_t());
+ ForDemoteVectors<TestDemoteTo<uint8_t>, 2>()(int32_t());
+
+ ForDemoteVectors<TestDemoteTo<int8_t>>()(int16_t());
+ ForDemoteVectors<TestDemoteTo<int8_t>, 2>()(int32_t());
+
+ const ForDemoteVectors<TestDemoteTo<uint16_t>> to_u16;
+ to_u16(int32_t());
+
+ const ForDemoteVectors<TestDemoteTo<int16_t>> to_i16;
+ to_i16(int32_t());
+}
+
+HWY_NOINLINE void TestAllDemoteToMixed() {
+#if HWY_HAVE_FLOAT64
+ const ForDemoteVectors<TestDemoteTo<int32_t>> to_i32;
+ to_i32(double());
+#endif
+}
+
+template <typename ToT>
+struct TestDemoteToFloat {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
+ // For floats, we clamp differently and cannot call LimitsMin.
+ static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output");
+ static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
+ const Rebind<ToT, D> to_d;
+
+ const size_t N = Lanes(from_d);
+ auto from = AllocateAligned<T>(N);
+ auto expected = AllocateAligned<ToT>(N);
+
+ RandomState rng;
+ for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ do {
+ const uint64_t bits = rng();
+ CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size
+ } while (!IsFiniteT(from[i]));
+ const T magn = std::abs(from[i]);
+ const T max_abs = HighestValue<ToT>();
+ // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
+ // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
+ const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]);
+ expected[i] = static_cast<ToT>(clipped);
+ }
+
+ HWY_ASSERT_VEC_EQ(to_d, expected.get(),
+ DemoteTo(to_d, Load(from_d, from.get())));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllDemoteToFloat() {
+ // Must test f16 separately because we can only load/store/convert them.
+
+#if HWY_HAVE_FLOAT64
+ const ForDemoteVectors<TestDemoteToFloat<float>, 1> to_float;
+ to_float(double());
+#endif
+}
+
+template <class D>
+AlignedFreeUniquePtr<float[]> ReorderBF16TestCases(D d, size_t& padded) {
+ const float test_cases[] = {
+ // Same as BF16TestCases:
+ // +/- 1
+ 1.0f,
+ -1.0f,
+ // +/- 0
+ 0.0f,
+ -0.0f,
+ // near 0
+ 0.25f,
+ -0.25f,
+ // +/- integer
+ 4.0f,
+ -32.0f,
+ // positive +/- delta
+ 2.015625f,
+ 3.984375f,
+ // negative +/- delta
+ -2.015625f,
+ -3.984375f,
+
+ // No huge values - would interfere with sum. But add more to fill 2 * N:
+ -2.0f,
+ -10.0f,
+ 0.03125f,
+ 1.03125f,
+ 1.5f,
+ 2.0f,
+ 4.0f,
+ 5.0f,
+ 6.0f,
+ 8.0f,
+ 10.0f,
+ 256.0f,
+ 448.0f,
+ 2080.0f,
+ };
+ const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
+ const size_t N = Lanes(d);
+ padded = RoundUpTo(kNumTestCases, 2 * N); // allow loading pairs of vectors
+ auto in = AllocateAligned<float>(padded);
+ auto expected = AllocateAligned<float>(padded);
+ std::copy(test_cases, test_cases + kNumTestCases, in.get());
+ std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
+ return in;
+}
+
+class TestReorderDemote2To {
+ // In-place N^2 selection sort to avoid dependencies
+ void Sort(float* p, size_t count) {
+ for (size_t i = 0; i < count - 1; ++i) {
+ // Find min_element
+ size_t idx_min = i;
+ for (size_t j = i + 1; j < count; j++) {
+ if (p[j] < p[idx_min]) {
+ idx_min = j;
+ }
+ }
+
+ // Swap with current
+ const float tmp = p[i];
+ p[i] = p[idx_min];
+ p[idx_min] = tmp;
+ }
+ }
+
+ public:
+ template <typename TF32, class DF32>
+ HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
+#if HWY_TARGET != HWY_SCALAR
+ size_t padded;
+ auto in = ReorderBF16TestCases(d32, padded);
+
+ using TBF16 = bfloat16_t;
+ const Repartition<TBF16, DF32> dbf16;
+ const Half<decltype(dbf16)> dbf16_half;
+ const size_t N = Lanes(d32);
+ auto temp16 = AllocateAligned<TBF16>(2 * N);
+ auto expected = AllocateAligned<float>(2 * N);
+ auto actual = AllocateAligned<float>(2 * N);
+
+ for (size_t i = 0; i < padded; i += 2 * N) {
+ const auto f0 = Load(d32, &in[i + 0]);
+ const auto f1 = Load(d32, &in[i + N]);
+ const auto v16 = ReorderDemote2To(dbf16, f0, f1);
+ Store(v16, dbf16, temp16.get());
+ const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0));
+ const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N));
+
+ // Smoke test: sum should be same (with tolerance for non-associativity)
+ const auto sum_expected = GetLane(SumOfLanes(d32, Add(f0, f1)));
+ const auto sum_actual =
+ GetLane(SumOfLanes(d32, Add(promoted0, promoted1)));
+
+ HWY_ASSERT(sum_expected - 1E-4 <= sum_actual &&
+ sum_actual <= sum_expected + 1E-4);
+
+ // Ensure values are the same after sorting to undo the Reorder
+ Store(f0, d32, expected.get() + 0);
+ Store(f1, d32, expected.get() + N);
+ Store(promoted0, d32, actual.get() + 0);
+ Store(promoted1, d32, actual.get() + N);
+ Sort(expected.get(), 2 * N);
+ Sort(actual.get(), 2 * N);
+ HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0));
+ HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N));
+ }
+#else // HWY_SCALAR
+ (void)d32;
+#endif
+ }
+};
+
+HWY_NOINLINE void TestAllReorderDemote2To() {
+ ForShrinkableVectors<TestReorderDemote2To>()(float());
+}
+
+struct TestI32F64 {
+ template <typename TF, class DF>
+ HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
+ using TI = int32_t;
+ const Rebind<TI, DF> di;
+ const size_t N = Lanes(df);
+
+ // Integer positive
+ HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0))));
+
+ // Integer negative
+ HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N))));
+
+ // Above positive
+ HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001))));
+
+ // Below positive
+ HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999))));
+
+ const TF eps = static_cast<TF>(0.0001);
+ // Above negative
+ HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
+ DemoteTo(di, Iota(df, -TF(N + 1) + eps)));
+
+ // Below negative
+ HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
+ DemoteTo(di, Iota(df, -TF(N + 1) - eps)));
+
+ // Huge positive float
+ HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
+ DemoteTo(di, Set(df, TF(1E12))));
+
+ // Huge negative float
+ HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
+ DemoteTo(di, Set(df, TF(-1E12))));
+ }
+};
+
+HWY_NOINLINE void TestAllI32F64() {
+#if HWY_HAVE_FLOAT64
+ ForDemoteVectors<TestI32F64>()(double());
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif // !HWY_IS_MSAN
+
+#if HWY_ONCE
+
+namespace hwy {
+#if !HWY_IS_MSAN
+HWY_BEFORE_TEST(HwyDemoteTest);
+HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToInt);
+HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToMixed);
+HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToFloat);
+HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllReorderDemote2To);
+HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllI32F64);
+#endif // !HWY_IS_MSAN
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/float_test.cc b/third_party/highway/hwy/tests/float_test.cc
new file mode 100644
index 0000000000..bc6d9020e6
--- /dev/null
+++ b/third_party/highway/hwy/tests/float_test.cc
@@ -0,0 +1,350 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Tests some ops specific to floating-point types (Div, Round etc.)
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>  // std::copy, std::fill
+#include <cmath>  // std::abs, std::isnan, std::isinf, std::ceil, std::floor
+#include <limits>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/float_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestDiv {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v = Iota(d, T(-2));
+ const auto v1 = Set(d, T(1));
+
+ // Unchanged after division by 1.
+ HWY_ASSERT_VEC_EQ(d, v, Div(v, v1));
+
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = (T(i) - 2) / T(2);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Div(v, Set(d, T(2))));
+ }
+};
+
+HWY_NOINLINE void TestAllDiv() { ForFloatTypes(ForPartialVectors<TestDiv>()); }
+
+struct TestApproximateReciprocal {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v = Iota(d, T(-2));
+ const auto nonzero = IfThenElse(Eq(v, Zero(d)), Set(d, T(1)), v);
+ const size_t N = Lanes(d);
+ auto input = AllocateAligned<T>(N);
+ Store(nonzero, d, input.get());
+
+ auto actual = AllocateAligned<T>(N);
+ Store(ApproximateReciprocal(nonzero), d, actual.get());
+
+ double max_l1 = 0.0;
+ double worst_expected = 0.0;
+ double worst_actual = 0.0;
+ for (size_t i = 0; i < N; ++i) {
+ const double expected = 1.0 / input[i];
+ const double l1 = std::abs(expected - actual[i]);
+ if (l1 > max_l1) {
+ max_l1 = l1;
+ worst_expected = expected;
+ worst_actual = actual[i];
+ }
+ }
+ const double abs_worst_expected = std::abs(worst_expected);
+ if (abs_worst_expected > 1E-5) {
+ const double max_rel = max_l1 / abs_worst_expected;
+ fprintf(stderr, "max l1 %f rel %f (%f vs %f)\n", max_l1, max_rel,
+ worst_expected, worst_actual);
+ HWY_ASSERT(max_rel < 0.004);
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllApproximateReciprocal() {
+ ForPartialVectors<TestApproximateReciprocal>()(float());
+}
+
+struct TestSquareRoot {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto vi = Iota(d, 0);
+ HWY_ASSERT_VEC_EQ(d, vi, Sqrt(Mul(vi, vi)));
+ }
+};
+
+HWY_NOINLINE void TestAllSquareRoot() {
+ ForFloatTypes(ForPartialVectors<TestSquareRoot>());
+}
+
+struct TestReciprocalSquareRoot {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v = Set(d, 123.0f);
+ const size_t N = Lanes(d);
+ auto lanes = AllocateAligned<T>(N);
+ Store(ApproximateReciprocalSqrt(v), d, lanes.get());
+ for (size_t i = 0; i < N; ++i) {
+ float err = lanes[i] - 0.090166f;
+ if (err < 0.0f) err = -err;
+ if (err >= 4E-4f) {
+ HWY_ABORT("Lane %d (%d): actual %f err %f\n", static_cast<int>(i),
+ static_cast<int>(N), lanes[i], err);
+ }
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllReciprocalSquareRoot() {
+ ForPartialVectors<TestReciprocalSquareRoot>()(float());
+}
+
+template <typename T, class D>
+AlignedFreeUniquePtr<T[]> RoundTestCases(T /*unused*/, D d, size_t& padded) {
+ const T eps = std::numeric_limits<T>::epsilon();
+ const T test_cases[] = {
+ // +/- 1
+ T(1),
+ T(-1),
+ // +/- 0
+ T(0),
+ T(-0),
+ // near 0
+ T(0.4),
+ T(-0.4),
+ // +/- integer
+ T(4),
+ T(-32),
+ // positive near limit
+ MantissaEnd<T>() - T(1.5),
+ MantissaEnd<T>() + T(1.5),
+ // negative near limit
+ -MantissaEnd<T>() - T(1.5),
+ -MantissaEnd<T>() + T(1.5),
+ // positive tiebreak
+ T(1.5),
+ T(2.5),
+ // negative tiebreak
+ T(-1.5),
+ T(-2.5),
+ // positive +/- delta
+ T(2.0001),
+ T(3.9999),
+ // negative +/- delta
+ T(-999.9999),
+ T(-998.0001),
+ // positive +/- epsilon
+ T(1) + eps,
+ T(1) - eps,
+ // negative +/- epsilon
+ T(-1) + eps,
+ T(-1) - eps,
+ // +/- huge (but still fits in float)
+ T(1E34),
+ T(-1E35),
+ // +/- infinity
+ std::numeric_limits<T>::infinity(),
+ -std::numeric_limits<T>::infinity(),
+ // qNaN
+ GetLane(NaN(d))
+ };
+ const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
+ const size_t N = Lanes(d);
+ padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors
+ auto in = AllocateAligned<T>(padded);
+ auto expected = AllocateAligned<T>(padded);
+ std::copy(test_cases, test_cases + kNumTestCases, in.get());
+ std::fill(in.get() + kNumTestCases, in.get() + padded, T(0));
+ return in;
+}
+
+struct TestRound {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T t, D d) {
+ size_t padded;
+ auto in = RoundTestCases(t, d, padded);
+ auto expected = AllocateAligned<T>(padded);
+
+ for (size_t i = 0; i < padded; ++i) {
+ // Avoid [std::]round, which does not round to nearest *even*.
+ // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
+ // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
+ expected[i] = static_cast<T>(nearbyint(in[i]));
+ }
+ for (size_t i = 0; i < padded; i += Lanes(d)) {
+ HWY_ASSERT_VEC_EQ(d, &expected[i], Round(Load(d, &in[i])));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllRound() {
+ ForFloatTypes(ForPartialVectors<TestRound>());
+}
+
+struct TestNearestInt {
+ template <typename TF, class DF>
+ HWY_NOINLINE void operator()(TF tf, const DF df) {
+ using TI = MakeSigned<TF>;
+ const RebindToSigned<DF> di;
+
+ size_t padded;
+ auto in = RoundTestCases(tf, df, padded);
+ auto expected = AllocateAligned<TI>(padded);
+
+ constexpr double max = static_cast<double>(LimitsMax<TI>());
+ for (size_t i = 0; i < padded; ++i) {
+ if (std::isnan(in[i])) {
+ // We replace NaN with 0 below (no_nan)
+ expected[i] = 0;
+ } else if (std::isinf(in[i]) || double{std::abs(in[i])} >= max) {
+ // Avoid undefined result for lrintf
+ expected[i] = std::signbit(in[i]) ? LimitsMin<TI>() : LimitsMax<TI>();
+ } else {
+ expected[i] = static_cast<TI>(lrintf(in[i]));
+ }
+ }
+ for (size_t i = 0; i < padded; i += Lanes(df)) {
+ const auto v = Load(df, &in[i]);
+ const auto no_nan = IfThenElse(Eq(v, v), v, Zero(df));
+ HWY_ASSERT_VEC_EQ(di, &expected[i], NearestInt(no_nan));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllNearestInt() {
+ ForPartialVectors<TestNearestInt>()(float());
+}
+
+struct TestTrunc {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T t, D d) {
+ size_t padded;
+ auto in = RoundTestCases(t, d, padded);
+ auto expected = AllocateAligned<T>(padded);
+
+ for (size_t i = 0; i < padded; ++i) {
+ // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
+ // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
+ expected[i] = static_cast<T>(trunc(in[i]));
+ }
+ for (size_t i = 0; i < padded; i += Lanes(d)) {
+ HWY_ASSERT_VEC_EQ(d, &expected[i], Trunc(Load(d, &in[i])));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllTrunc() {
+ ForFloatTypes(ForPartialVectors<TestTrunc>());
+}
+
+struct TestCeil {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T t, D d) {
+ size_t padded;
+ auto in = RoundTestCases(t, d, padded);
+ auto expected = AllocateAligned<T>(padded);
+
+ for (size_t i = 0; i < padded; ++i) {
+ expected[i] = std::ceil(in[i]);
+ }
+ for (size_t i = 0; i < padded; i += Lanes(d)) {
+ HWY_ASSERT_VEC_EQ(d, &expected[i], Ceil(Load(d, &in[i])));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllCeil() {
+ ForFloatTypes(ForPartialVectors<TestCeil>());
+}
+
+struct TestFloor {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T t, D d) {
+ size_t padded;
+ auto in = RoundTestCases(t, d, padded);
+ auto expected = AllocateAligned<T>(padded);
+
+ for (size_t i = 0; i < padded; ++i) {
+ expected[i] = std::floor(in[i]);
+ }
+ for (size_t i = 0; i < padded; i += Lanes(d)) {
+ HWY_ASSERT_VEC_EQ(d, &expected[i], Floor(Load(d, &in[i])));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllFloor() {
+ ForFloatTypes(ForPartialVectors<TestFloor>());
+}
+
+struct TestAbsDiff {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto in_lanes_a = AllocateAligned<T>(N);
+ auto in_lanes_b = AllocateAligned<T>(N);
+ auto out_lanes = AllocateAligned<T>(N);
+ for (size_t i = 0; i < N; ++i) {
+ in_lanes_a[i] = static_cast<T>((i ^ 1u) << i);
+ in_lanes_b[i] = static_cast<T>(i << i);
+ out_lanes[i] = std::abs(in_lanes_a[i] - in_lanes_b[i]);
+ }
+ const auto a = Load(d, in_lanes_a.get());
+ const auto b = Load(d, in_lanes_b.get());
+ const auto expected = Load(d, out_lanes.get());
+ HWY_ASSERT_VEC_EQ(d, expected, AbsDiff(a, b));
+ HWY_ASSERT_VEC_EQ(d, expected, AbsDiff(b, a));
+ }
+};
+
+HWY_NOINLINE void TestAllAbsDiff() {
+ ForPartialVectors<TestAbsDiff>()(float());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyFloatTest);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllDiv);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllApproximateReciprocal);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllSquareRoot);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllReciprocalSquareRoot);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllRound);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllNearestInt);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllTrunc);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllCeil);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllFloor);
+HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllAbsDiff);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/hwy_gtest.h b/third_party/highway/hwy/tests/hwy_gtest.h
new file mode 100644
index 0000000000..a4c21cd171
--- /dev/null
+++ b/third_party/highway/hwy/tests/hwy_gtest.h
@@ -0,0 +1,157 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HWY_TESTS_HWY_GTEST_H_
+#define HWY_TESTS_HWY_GTEST_H_
+
+// Adapters for GUnit to run tests for all targets.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "hwy/highway.h"
+
+namespace hwy {
+
+// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
+// used INSTANTIATE_TEST_CASE_P which is now deprecated.
+#ifdef INSTANTIATE_TEST_SUITE_P
+#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
+#else
+#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
+#endif
+
+// Helper class to run parametric tests using the hwy target as parameter. To
+// use this define the following in your test:
+// class MyTestSuite : public TestWithParamTarget {
+// ...
+// };
+// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
+// TEST_P(MyTestSuite, MyTest) { ... }
+class TestWithParamTarget : public testing::TestWithParam<int64_t> {
+ protected:
+ void SetUp() override { SetSupportedTargetsForTest(GetParam()); }
+
+ void TearDown() override {
+ // Check that the parametric test calls SupportedTargets() when the source
+ // was compiled with more than one target. In the single-target case only
+ // static dispatch will be used anyway.
+#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
+ EXPECT_TRUE(GetChosenTarget().IsInitialized())
+ << "This hwy target parametric test doesn't use dynamic-dispatch and "
+ "doesn't need to be parametric.";
+#endif
+ SetSupportedTargetsForTest(0);
+ }
+};
+
+// Function to convert the test parameter of a TestWithParamTarget for
+// displaying it in the gtest test name.
+static inline std::string TestParamTargetName(
+ const testing::TestParamInfo<int64_t>& info) {
+ return TargetName(info.param);
+}
+
+#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite) \
+ HWY_GTEST_INSTANTIATE_TEST_SUITE_P( \
+ suite##Group, suite, \
+ testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
+ ::hwy::TestParamTargetName)
+
+// Helper class similar to TestWithParamTarget to run parametric tests that
+// depend on the target and an additional parameter. If you need multiple
+// extra parameters, use a std::tuple<> of them and ::testing::Combine(...) as
+// the generator. To use this class define the following in your test:
+// class MyTestSuite : public TestWithParamTargetAndT<int> {
+// ...
+// };
+// HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(MyTestSuite, ::testing::Range(0, 9));
+// TEST_P(MyTestSuite, MyTest) { ... GetParam() .... }
+template <typename T>
+class TestWithParamTargetAndT
+ : public ::testing::TestWithParam<std::tuple<int64_t, T>> {
+ public:
+ // Expose the parametric type here so it can be used by the
+ // HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T macro.
+ using HwyParamType = T;
+
+ protected:
+ void SetUp() override {
+ SetSupportedTargetsForTest(std::get<0>(
+ ::testing::TestWithParam<std::tuple<int64_t, T>>::GetParam()));
+ }
+
+ void TearDown() override {
+ // Check that the parametric test calls SupportedTargets() when the source
+ // was compiled with more than one target. In the single-target case only
+ // static dispatch will be used anyway.
+#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
+ EXPECT_TRUE(GetChosenTarget().IsInitialized())
+ << "This hwy target parametric test doesn't use dynamic-dispatch and "
+ "doesn't need to be parametric.";
+#endif
+ SetSupportedTargetsForTest(0);
+ }
+
+ T GetParam() {
+ return std::get<1>(
+ ::testing::TestWithParam<std::tuple<int64_t, T>>::GetParam());
+ }
+};
+
+template <typename T>
+std::string TestParamTargetNameAndT(
+ const testing::TestParamInfo<std::tuple<int64_t, T>>& info) {
+ return std::string(TargetName(std::get<0>(info.param))) + "_" +
+ ::testing::PrintToString(std::get<1>(info.param));
+}
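+// E.g. for target AVX2 and extra parameter 3, TestParamTargetNameAndT yields
+// the suffix "AVX2_3", which gtest appends to the instantiated test name.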
+
+#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(suite, generator) \
+ HWY_GTEST_INSTANTIATE_TEST_SUITE_P( \
+ suite##Group, suite, \
+ ::testing::Combine( \
+ testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
+ generator), \
+ ::hwy::TestParamTargetNameAndT<suite::HwyParamType>)
+
+// Helper macro to export a function and define a test that runs it. This is
+// equivalent to doing a HWY_EXPORT of a void(void) function and calling it in
+// a test:
+// class MyTestSuite : public TestWithParamTarget {
+// ...
+// };
+// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
+// HWY_EXPORT_AND_TEST_P(MyTestSuite, MyTest);
+#define HWY_EXPORT_AND_TEST_P(suite, func_name) \
+ HWY_EXPORT(func_name); \
+ TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(); } \
+ static_assert(true, "For requiring trailing semicolon")
+
+#define HWY_EXPORT_AND_TEST_P_T(suite, func_name) \
+ HWY_EXPORT(func_name); \
+ TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(GetParam()); } \
+ static_assert(true, "For requiring trailing semicolon")
+
+#define HWY_BEFORE_TEST(suite) \
+ class suite : public hwy::TestWithParamTarget {}; \
+ HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite); \
+ static_assert(true, "For requiring trailing semicolon")
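+// For example, a typical test translation unit ties these adapters together
+// roughly as follows (TestAllFoo is a per-target void() function defined
+// inside HWY_NAMESPACE; see the test files below for complete instances):
+//   #if HWY_ONCE
+//   namespace hwy {
+//   HWY_BEFORE_TEST(HwyFooTest);
+//   HWY_EXPORT_AND_TEST_P(HwyFooTest, TestAllFoo);
+//   }  // namespace hwy
+//   #endif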
+
+} // namespace hwy
+
+#endif // HWY_TESTS_HWY_GTEST_H_
diff --git a/third_party/highway/hwy/tests/if_test.cc b/third_party/highway/hwy/tests/if_test.cc
new file mode 100644
index 0000000000..e44a878a0c
--- /dev/null
+++ b/third_party/highway/hwy/tests/if_test.cc
@@ -0,0 +1,175 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "hwy/aligned_allocator.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/if_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestIfThenElse {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ RandomState rng;
+
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(d);
+ auto in1 = AllocateAligned<T>(N);
+ auto in2 = AllocateAligned<T>(N);
+ auto bool_lanes = AllocateAligned<TI>(N);
+ auto expected = AllocateAligned<T>(N);
+
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ in1[i] = static_cast<T>(Random32(&rng));
+ in2[i] = static_cast<T>(Random32(&rng));
+ bool_lanes[i] = (Random32(&rng) & 16) ? TI(1) : TI(0);
+ }
+
+ const auto v1 = Load(d, in1.get());
+ const auto v2 = Load(d, in2.get());
+ const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = bool_lanes[i] ? in1[i] : in2[i];
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElse(mask, v1, v2));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = bool_lanes[i] ? in1[i] : T(0);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElseZero(mask, v1));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = bool_lanes[i] ? T(0) : in2[i];
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenZeroElse(mask, v2));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllIfThenElse() {
+ ForAllTypes(ForPartialVectors<TestIfThenElse>());
+}
+
+struct TestIfVecThenElse {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ RandomState rng;
+
+ using TU = MakeUnsigned<T>; // For all-one mask
+ const Rebind<TU, D> du;
+ const size_t N = Lanes(d);
+ auto in1 = AllocateAligned<T>(N);
+ auto in2 = AllocateAligned<T>(N);
+ auto vec_lanes = AllocateAligned<TU>(N);
+ auto expected = AllocateAligned<T>(N);
+
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ in1[i] = static_cast<T>(Random32(&rng));
+ in2[i] = static_cast<T>(Random32(&rng));
+ vec_lanes[i] = (Random32(&rng) & 16) ? static_cast<TU>(~TU(0)) : TU(0);
+ }
+
+ const auto v1 = Load(d, in1.get());
+ const auto v2 = Load(d, in2.get());
+ const auto vec = BitCast(d, Load(du, vec_lanes.get()));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = vec_lanes[i] ? in1[i] : in2[i];
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), IfVecThenElse(vec, v1, v2));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllIfVecThenElse() {
+ ForAllTypes(ForPartialVectors<TestIfVecThenElse>());
+}
+
+struct TestZeroIfNegative {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ const auto vp = Iota(d, 1);
+ const auto vn = Iota(d, T(-1E5)); // assumes N < 10^5
+
+ // Zero and positive remain unchanged
+ HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(v0));
+ HWY_ASSERT_VEC_EQ(d, vp, ZeroIfNegative(vp));
+
+ // Negative are all replaced with zero
+ HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(vn));
+ }
+};
+
+HWY_NOINLINE void TestAllZeroIfNegative() {
+ ForFloatTypes(ForPartialVectors<TestZeroIfNegative>());
+}
+
+struct TestIfNegative {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ const auto vp = Iota(d, 1);
+ const auto vn = Or(vp, SignBit(d));
+
+ // Zero and positive remain unchanged
+ HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(v0, vn, v0));
+ HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(v0, v0, vn));
+ HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vp, vn, vp));
+ HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vp, vp, vn));
+
+ // Negative are replaced with 2nd arg
+ HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(vn, v0, vp));
+ HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vn, vn, v0));
+ HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vn, vp, vn));
+ }
+};
+
+HWY_NOINLINE void TestAllIfNegative() {
+ ForFloatTypes(ForPartialVectors<TestIfNegative>());
+ ForSignedTypes(ForPartialVectors<TestIfNegative>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyIfTest);
+HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfThenElse);
+HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfVecThenElse);
+HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllZeroIfNegative);
+HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfNegative);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/interleaved_test.cc b/third_party/highway/hwy/tests/interleaved_test.cc
new file mode 100644
index 0000000000..4d1fbd5ac5
--- /dev/null
+++ b/third_party/highway/hwy/tests/interleaved_test.cc
@@ -0,0 +1,256 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/interleaved_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestLoadStoreInterleaved2 {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+
+ RandomState rng;
+
+ // Data to be interleaved
+ auto bytes = AllocateAligned<T>(2 * N);
+ for (size_t i = 0; i < 2 * N; ++i) {
+ bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
+ }
+ const auto in0 = Load(d, &bytes[0 * N]);
+ const auto in1 = Load(d, &bytes[1 * N]);
+
+ // Interleave here, ensure vector results match scalar
+ auto expected = AllocateAligned<T>(3 * N);
+ auto actual_aligned = AllocateAligned<T>(3 * N + 1);
+ T* actual = actual_aligned.get() + 1;
+
+ for (size_t rep = 0; rep < 100; ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ expected[2 * i + 0] = bytes[0 * N + i];
+ expected[2 * i + 1] = bytes[1 * N + i];
+        // Ensure we do not write more than 2*N elements
+ expected[2 * N + i] = actual[2 * N + i] = 0;
+ }
+ StoreInterleaved2(in0, in1, d, actual);
+ size_t pos = 0;
+ if (!BytesEqual(expected.get(), actual, 3 * N * sizeof(T), &pos)) {
+ Print(d, "in0", in0, pos / 4);
+ Print(d, "in1", in1, pos / 4);
+ const size_t i = pos;
+ fprintf(stderr, "interleaved i=%d %f %f %f %f %f %f %f %f\n",
+ static_cast<int>(i), static_cast<double>(actual[i]),
+ static_cast<double>(actual[i + 1]),
+ static_cast<double>(actual[i + 2]),
+ static_cast<double>(actual[i + 3]),
+ static_cast<double>(actual[i + 4]),
+ static_cast<double>(actual[i + 5]),
+ static_cast<double>(actual[i + 6]),
+ static_cast<double>(actual[i + 7]));
+ HWY_ASSERT(false);
+ }
+
+ Vec<D> out0, out1;
+ LoadInterleaved2(d, actual, out0, out1);
+ HWY_ASSERT_VEC_EQ(d, in0, out0);
+ HWY_ASSERT_VEC_EQ(d, in1, out1);
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllLoadStoreInterleaved2() {
+#if HWY_TARGET == HWY_RVV
+ // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+ const ForExtendableVectors<TestLoadStoreInterleaved2, 2> test;
+#else
+ const ForPartialVectors<TestLoadStoreInterleaved2> test;
+#endif
+ ForAllTypes(test);
+}
+
+// Workaround for build timeout on GCC 12 aarch64, see #776
+#if HWY_COMPILER_GCC_ACTUAL >= 1200 && HWY_ARCH_ARM_A64
+#define HWY_BROKEN_LOAD34 1
+#else
+#define HWY_BROKEN_LOAD34 0
+#endif
+
+#if !HWY_BROKEN_LOAD34
+
+struct TestLoadStoreInterleaved3 {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+
+ RandomState rng;
+
+ // Data to be interleaved
+ auto bytes = AllocateAligned<T>(3 * N);
+ for (size_t i = 0; i < 3 * N; ++i) {
+ bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
+ }
+ const auto in0 = Load(d, &bytes[0 * N]);
+ const auto in1 = Load(d, &bytes[1 * N]);
+ const auto in2 = Load(d, &bytes[2 * N]);
+
+ // Interleave here, ensure vector results match scalar
+ auto expected = AllocateAligned<T>(4 * N);
+ auto actual_aligned = AllocateAligned<T>(4 * N + 1);
+ T* actual = actual_aligned.get() + 1;
+
+ for (size_t rep = 0; rep < 100; ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ expected[3 * i + 0] = bytes[0 * N + i];
+ expected[3 * i + 1] = bytes[1 * N + i];
+ expected[3 * i + 2] = bytes[2 * N + i];
+        // Ensure we do not write more than 3*N elements
+ expected[3 * N + i] = actual[3 * N + i] = 0;
+ }
+ StoreInterleaved3(in0, in1, in2, d, actual);
+ size_t pos = 0;
+ if (!BytesEqual(expected.get(), actual, 4 * N * sizeof(T), &pos)) {
+ Print(d, "in0", in0, pos / 3, N);
+ Print(d, "in1", in1, pos / 3, N);
+ Print(d, "in2", in2, pos / 3, N);
+ const size_t i = pos;
+ fprintf(stderr, "interleaved i=%d %f %f %f %f %f %f\n",
+ static_cast<int>(i), static_cast<double>(actual[i]),
+ static_cast<double>(actual[i + 1]),
+ static_cast<double>(actual[i + 2]),
+ static_cast<double>(actual[i + 3]),
+ static_cast<double>(actual[i + 4]),
+ static_cast<double>(actual[i + 5]));
+ HWY_ASSERT(false);
+ }
+
+ Vec<D> out0, out1, out2;
+ LoadInterleaved3(d, actual, out0, out1, out2);
+ HWY_ASSERT_VEC_EQ(d, in0, out0);
+ HWY_ASSERT_VEC_EQ(d, in1, out1);
+ HWY_ASSERT_VEC_EQ(d, in2, out2);
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllLoadStoreInterleaved3() {
+#if HWY_TARGET == HWY_RVV
+ // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+ const ForExtendableVectors<TestLoadStoreInterleaved3, 2> test;
+#else
+ const ForPartialVectors<TestLoadStoreInterleaved3> test;
+#endif
+ ForAllTypes(test);
+}
+
+struct TestLoadStoreInterleaved4 {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+
+ RandomState rng;
+
+ // Data to be interleaved
+ auto bytes = AllocateAligned<T>(4 * N);
+
+ for (size_t i = 0; i < 4 * N; ++i) {
+ bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
+ }
+ const auto in0 = Load(d, &bytes[0 * N]);
+ const auto in1 = Load(d, &bytes[1 * N]);
+ const auto in2 = Load(d, &bytes[2 * N]);
+ const auto in3 = Load(d, &bytes[3 * N]);
+
+ // Interleave here, ensure vector results match scalar
+ auto expected = AllocateAligned<T>(5 * N);
+ auto actual_aligned = AllocateAligned<T>(5 * N + 1);
+ T* actual = actual_aligned.get() + 1;
+
+ for (size_t rep = 0; rep < 100; ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ expected[4 * i + 0] = bytes[0 * N + i];
+ expected[4 * i + 1] = bytes[1 * N + i];
+ expected[4 * i + 2] = bytes[2 * N + i];
+ expected[4 * i + 3] = bytes[3 * N + i];
+        // Ensure we do not write more than 4*N elements
+ expected[4 * N + i] = actual[4 * N + i] = 0;
+ }
+ StoreInterleaved4(in0, in1, in2, in3, d, actual);
+ size_t pos = 0;
+ if (!BytesEqual(expected.get(), actual, 5 * N * sizeof(T), &pos)) {
+ Print(d, "in0", in0, pos / 4);
+ Print(d, "in1", in1, pos / 4);
+ Print(d, "in2", in2, pos / 4);
+ Print(d, "in3", in3, pos / 4);
+ const size_t i = pos;
+ fprintf(stderr, "interleaved i=%d %f %f %f %f %f %f %f %f\n",
+ static_cast<int>(i), static_cast<double>(actual[i]),
+ static_cast<double>(actual[i + 1]),
+ static_cast<double>(actual[i + 2]),
+ static_cast<double>(actual[i + 3]),
+ static_cast<double>(actual[i + 4]),
+ static_cast<double>(actual[i + 5]),
+ static_cast<double>(actual[i + 6]),
+ static_cast<double>(actual[i + 7]));
+ HWY_ASSERT(false);
+ }
+
+ Vec<D> out0, out1, out2, out3;
+ LoadInterleaved4(d, actual, out0, out1, out2, out3);
+ HWY_ASSERT_VEC_EQ(d, in0, out0);
+ HWY_ASSERT_VEC_EQ(d, in1, out1);
+ HWY_ASSERT_VEC_EQ(d, in2, out2);
+ HWY_ASSERT_VEC_EQ(d, in3, out3);
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllLoadStoreInterleaved4() {
+#if HWY_TARGET == HWY_RVV
+ // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+ const ForExtendableVectors<TestLoadStoreInterleaved4, 2> test;
+#else
+ const ForPartialVectors<TestLoadStoreInterleaved4> test;
+#endif
+ ForAllTypes(test);
+}
+
+#endif // !HWY_BROKEN_LOAD34
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyInterleavedTest);
+HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved2);
+#if !HWY_BROKEN_LOAD34
+HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved3);
+HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved4);
+#endif
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/list_targets.cc b/third_party/highway/hwy/tests/list_targets.cc
new file mode 100644
index 0000000000..d09ee4fe86
--- /dev/null
+++ b/third_party/highway/hwy/tests/list_targets.cc
@@ -0,0 +1,71 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Simple tool to print the list of targets that were compiled in when building
+// this tool.
+
+#include <stdio.h>
+
+#include "hwy/highway.h"
+
+void PrintTargets(const char* msg, int64_t targets) {
+ fprintf(stderr, "%s", msg);
+ // For each bit:
+ for (int64_t x = targets; x != 0; x = x & (x - 1)) {
+ // Extract value of least-significant bit.
+ fprintf(stderr, " %s", hwy::TargetName(x & (~x + 1)));
+ }
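+  // E.g. with targets == 0b1010, the first iteration isolates 0b0010
+  // (x & (~x + 1) keeps only the least-significant set bit) and prints its
+  // name; x & (x - 1) then clears that bit, leaving 0b1000 for the next pass.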
+ fprintf(stderr, "\n");
+}
+
+int main() {
+#ifdef HWY_COMPILE_ONLY_EMU128
+ const int only_emu128 = 1;
+#else
+ const int only_emu128 = 0;
+#endif
+#ifdef HWY_COMPILE_ONLY_SCALAR
+ const int only_scalar = 1;
+#else
+ const int only_scalar = 0;
+#endif
+#ifdef HWY_COMPILE_ONLY_STATIC
+ const int only_static = 1;
+#else
+ const int only_static = 0;
+#endif
+#ifdef HWY_COMPILE_ALL_ATTAINABLE
+ const int all_attain = 1;
+#else
+ const int all_attain = 0;
+#endif
+#ifdef HWY_IS_TEST
+ const int is_test = 1;
+#else
+ const int is_test = 0;
+#endif
+
+ fprintf(stderr,
+ "Config: emu128:%d scalar:%d static:%d all_attain:%d is_test:%d\n",
+ only_emu128, only_scalar, only_static, all_attain, is_test);
+ PrintTargets("Compiled HWY_TARGETS: ", HWY_TARGETS);
+ PrintTargets("HWY_ATTAINABLE_TARGETS:", HWY_ATTAINABLE_TARGETS);
+ PrintTargets("HWY_BASELINE_TARGETS: ", HWY_BASELINE_TARGETS);
+ PrintTargets("HWY_STATIC_TARGET: ", HWY_STATIC_TARGET);
+ PrintTargets("HWY_BROKEN_TARGETS: ", HWY_BROKEN_TARGETS);
+ PrintTargets("HWY_DISABLED_TARGETS: ", HWY_DISABLED_TARGETS);
+ PrintTargets("Current CPU supports: ", hwy::SupportedTargets());
+ return 0;
+}
diff --git a/third_party/highway/hwy/tests/logical_test.cc b/third_party/highway/hwy/tests/logical_test.cc
new file mode 100644
index 0000000000..b646f5ff4b
--- /dev/null
+++ b/third_party/highway/hwy/tests/logical_test.cc
@@ -0,0 +1,246 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h> // memcmp
+
+#include "hwy/aligned_allocator.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/logical_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestNot {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ const auto ones = VecFromMask(d, Eq(v0, v0));
+ const auto v1 = Set(d, 1);
+ const auto vnot1 = Set(d, T(~T(1)));
+
+ HWY_ASSERT_VEC_EQ(d, v0, Not(ones));
+ HWY_ASSERT_VEC_EQ(d, ones, Not(v0));
+ HWY_ASSERT_VEC_EQ(d, v1, Not(vnot1));
+ HWY_ASSERT_VEC_EQ(d, vnot1, Not(v1));
+ }
+};
+
+HWY_NOINLINE void TestAllNot() {
+ ForIntegerTypes(ForPartialVectors<TestNot>());
+}
+
+struct TestLogical {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ const auto vi = Iota(d, 0);
+
+ auto v = vi;
+ v = And(v, vi);
+ HWY_ASSERT_VEC_EQ(d, vi, v);
+ v = And(v, v0);
+ HWY_ASSERT_VEC_EQ(d, v0, v);
+
+ v = Or(v, vi);
+ HWY_ASSERT_VEC_EQ(d, vi, v);
+ v = Or(v, v0);
+ HWY_ASSERT_VEC_EQ(d, vi, v);
+
+ v = Xor(v, vi);
+ HWY_ASSERT_VEC_EQ(d, v0, v);
+ v = Xor(v, v0);
+ HWY_ASSERT_VEC_EQ(d, v0, v);
+
+ HWY_ASSERT_VEC_EQ(d, v0, And(v0, vi));
+ HWY_ASSERT_VEC_EQ(d, v0, And(vi, v0));
+ HWY_ASSERT_VEC_EQ(d, vi, And(vi, vi));
+
+ HWY_ASSERT_VEC_EQ(d, vi, Or(v0, vi));
+ HWY_ASSERT_VEC_EQ(d, vi, Or(vi, v0));
+ HWY_ASSERT_VEC_EQ(d, vi, Or(vi, vi));
+
+ HWY_ASSERT_VEC_EQ(d, vi, Xor(v0, vi));
+ HWY_ASSERT_VEC_EQ(d, vi, Xor(vi, v0));
+ HWY_ASSERT_VEC_EQ(d, v0, Xor(vi, vi));
+
+ HWY_ASSERT_VEC_EQ(d, vi, AndNot(v0, vi));
+ HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0));
+ HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi));
+
+ HWY_ASSERT_VEC_EQ(d, v0, Or3(v0, v0, v0));
+ HWY_ASSERT_VEC_EQ(d, vi, Or3(v0, vi, v0));
+ HWY_ASSERT_VEC_EQ(d, vi, Or3(v0, v0, vi));
+ HWY_ASSERT_VEC_EQ(d, vi, Or3(v0, vi, vi));
+ HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, v0, v0));
+ HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, vi, v0));
+ HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, v0, vi));
+ HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, vi, vi));
+
+ HWY_ASSERT_VEC_EQ(d, v0, Xor3(v0, v0, v0));
+ HWY_ASSERT_VEC_EQ(d, vi, Xor3(v0, vi, v0));
+ HWY_ASSERT_VEC_EQ(d, vi, Xor3(v0, v0, vi));
+ HWY_ASSERT_VEC_EQ(d, v0, Xor3(v0, vi, vi));
+ HWY_ASSERT_VEC_EQ(d, vi, Xor3(vi, v0, v0));
+ HWY_ASSERT_VEC_EQ(d, v0, Xor3(vi, vi, v0));
+ HWY_ASSERT_VEC_EQ(d, v0, Xor3(vi, v0, vi));
+ HWY_ASSERT_VEC_EQ(d, vi, Xor3(vi, vi, vi));
+
+ HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, v0));
+ HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, vi, v0));
+ HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, vi));
+ HWY_ASSERT_VEC_EQ(d, vi, OrAnd(v0, vi, vi));
+ HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, v0));
+ HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, v0));
+ HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, vi));
+ HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, vi));
+ }
+};
+
+HWY_NOINLINE void TestAllLogical() {
+ ForAllTypes(ForPartialVectors<TestLogical>());
+}
+
+struct TestCopySign {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ const auto vp = Iota(d, 1);
+ const auto vn = Iota(d, T(-1E5)); // assumes N < 10^5
+
+ // Zero remains zero regardless of sign
+ HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, v0));
+ HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, vp));
+ HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, vn));
+ HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, v0));
+ HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, vp));
+ HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, vn));
+
+ // Positive input, positive sign => unchanged
+ HWY_ASSERT_VEC_EQ(d, vp, CopySign(vp, vp));
+ HWY_ASSERT_VEC_EQ(d, vp, CopySignToAbs(vp, vp));
+
+ // Positive input, negative sign => negated
+ HWY_ASSERT_VEC_EQ(d, Neg(vp), CopySign(vp, vn));
+ HWY_ASSERT_VEC_EQ(d, Neg(vp), CopySignToAbs(vp, vn));
+
+ // Negative input, negative sign => unchanged
+ HWY_ASSERT_VEC_EQ(d, vn, CopySign(vn, vn));
+
+ // Negative input, positive sign => negated
+ HWY_ASSERT_VEC_EQ(d, Neg(vn), CopySign(vn, vp));
+ }
+};
+
+HWY_NOINLINE void TestAllCopySign() {
+ ForFloatTypes(ForPartialVectors<TestCopySign>());
+}
+
+struct TestBroadcastSignBit {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto s0 = Zero(d);
+    const auto s1 = Set(d, -1); // all bits set
+ const auto vpos = And(Iota(d, 0), Set(d, LimitsMax<T>()));
+ const auto vneg = Sub(s1, vpos);
+
+ HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(vpos));
+ HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(Set(d, LimitsMax<T>())));
+
+ HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(vneg));
+ HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(Set(d, LimitsMin<T>())));
+ HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(Set(d, LimitsMin<T>() / 2)));
+ }
+};
+
+HWY_NOINLINE void TestAllBroadcastSignBit() {
+ ForSignedTypes(ForPartialVectors<TestBroadcastSignBit>());
+}
+
+struct TestTestBit {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t kNumBits = sizeof(T) * 8;
+ for (size_t i = 0; i < kNumBits; ++i) {
+ const auto bit1 = Set(d, T(1ull << i));
+ const auto bit2 = Set(d, T(1ull << ((i + 1) % kNumBits)));
+ const auto bit3 = Set(d, T(1ull << ((i + 2) % kNumBits)));
+ const auto bits12 = Or(bit1, bit2);
+ const auto bits23 = Or(bit2, bit3);
+ HWY_ASSERT(AllTrue(d, TestBit(bit1, bit1)));
+ HWY_ASSERT(AllTrue(d, TestBit(bits12, bit1)));
+ HWY_ASSERT(AllTrue(d, TestBit(bits12, bit2)));
+
+ HWY_ASSERT(AllFalse(d, TestBit(bits12, bit3)));
+ HWY_ASSERT(AllFalse(d, TestBit(bits23, bit1)));
+ HWY_ASSERT(AllFalse(d, TestBit(bit1, bit2)));
+ HWY_ASSERT(AllFalse(d, TestBit(bit2, bit1)));
+ HWY_ASSERT(AllFalse(d, TestBit(bit1, bit3)));
+ HWY_ASSERT(AllFalse(d, TestBit(bit3, bit1)));
+ HWY_ASSERT(AllFalse(d, TestBit(bit2, bit3)));
+ HWY_ASSERT(AllFalse(d, TestBit(bit3, bit2)));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllTestBit() {
+ ForIntegerTypes(ForPartialVectors<TestTestBit>());
+}
+
+struct TestPopulationCount {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ RandomState rng;
+ size_t N = Lanes(d);
+ auto data = AllocateAligned<T>(N);
+ auto popcnt = AllocateAligned<T>(N);
+    for (size_t rep = 0; rep < AdjustedReps(1 << 18) / N; rep++) {
+      for (size_t i = 0; i < N; i++) {
+ data[i] = static_cast<T>(rng());
+ popcnt[i] = static_cast<T>(PopCount(data[i]));
+ }
+ HWY_ASSERT_VEC_EQ(d, popcnt.get(), PopulationCount(Load(d, data.get())));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllPopulationCount() {
+ ForUnsignedTypes(ForPartialVectors<TestPopulationCount>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyLogicalTest);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllNot);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogical);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllPopulationCount);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/mask_mem_test.cc b/third_party/highway/hwy/tests/mask_mem_test.cc
new file mode 100644
index 0000000000..c44119dcd7
--- /dev/null
+++ b/third_party/highway/hwy/tests/mask_mem_test.cc
@@ -0,0 +1,197 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS // before inttypes.h
+#endif
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h> // memcmp
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/mask_mem_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestMaskedLoad {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ RandomState rng;
+
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(d);
+ auto bool_lanes = AllocateAligned<TI>(N);
+
+ auto lanes = AllocateAligned<T>(N);
+ Store(Iota(d, T{1}), d, lanes.get());
+
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
+ }
+
+ const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+ const auto expected = IfThenElseZero(mask, Load(d, lanes.get()));
+ const auto actual = MaskedLoad(mask, d, lanes.get());
+ HWY_ASSERT_VEC_EQ(d, expected, actual);
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllMaskedLoad() {
+ ForAllTypes(ForPartialVectors<TestMaskedLoad>());
+}
+
+struct TestBlendedStore {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ RandomState rng;
+
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(d);
+ auto bool_lanes = AllocateAligned<TI>(N);
+
+ const Vec<D> v = Iota(d, T{1});
+ auto actual = AllocateAligned<T>(N);
+ auto expected = AllocateAligned<T>(N);
+
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
+ // Re-initialize to something distinct from v[i].
+ actual[i] = static_cast<T>(127 - (i & 127));
+ expected[i] = bool_lanes[i] ? static_cast<T>(i + 1) : actual[i];
+ }
+
+ const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+ BlendedStore(v, mask, d, actual.get());
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Load(d, actual.get()));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllBlendedStore() {
+ ForAllTypes(ForPartialVectors<TestBlendedStore>());
+}
+
+class TestStoreMaskBits {
+ public:
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*t*/, D /*d*/) {
+ RandomState rng;
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(di);
+ auto bool_lanes = AllocateAligned<TI>(N);
+
+ const ScalableTag<uint8_t, -3> d_bits;
+ const size_t expected_num_bytes = (N + 7) / 8;
+ auto expected = AllocateAligned<uint8_t>(expected_num_bytes);
+ auto actual = AllocateAligned<uint8_t>(HWY_MAX(8, expected_num_bytes));
+
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ // Generate random mask pattern.
+ for (size_t i = 0; i < N; ++i) {
+ bool_lanes[i] = static_cast<TI>((rng() & 1024) ? 1 : 0);
+ }
+ const auto bools = Load(di, bool_lanes.get());
+ const auto mask = Gt(bools, Zero(di));
+
+ // Requires at least 8 bytes, ensured above.
+ const size_t bytes_written = StoreMaskBits(di, mask, actual.get());
+ if (bytes_written != expected_num_bytes) {
+ fprintf(stderr, "%s expected %" PRIu64 " bytes, actual %" PRIu64 "\n",
+ TypeName(T(), N).c_str(),
+ static_cast<uint64_t>(expected_num_bytes),
+ static_cast<uint64_t>(bytes_written));
+
+ HWY_ASSERT(false);
+ }
+
+ // Requires at least 8 bytes, ensured above.
+ const auto mask2 = LoadMaskBits(di, actual.get());
+ HWY_ASSERT_MASK_EQ(di, mask, mask2);
+
+ memset(expected.get(), 0, expected_num_bytes);
+ for (size_t i = 0; i < N; ++i) {
+ expected[i / 8] =
+ static_cast<uint8_t>(expected[i / 8] | (bool_lanes[i] << (i % 8)));
+ }
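+      // E.g. for N == 4 and bool_lanes == {1, 0, 1, 1}, the loop above packs
+      // the bits LSB-first into expected[0] == 0b1101 == 13.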
+
+ size_t i = 0;
+ // Stored bits must match original mask
+ for (; i < N; ++i) {
+ const TI is_set = (actual[i / 8] & (1 << (i % 8))) ? 1 : 0;
+ if (is_set != bool_lanes[i]) {
+ fprintf(stderr, "%s lane %" PRIu64 ": expected %d, actual %d\n",
+ TypeName(T(), N).c_str(), static_cast<uint64_t>(i),
+ static_cast<int>(bool_lanes[i]), static_cast<int>(is_set));
+ Print(di, "bools", bools, 0, N);
+ Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
+ expected_num_bytes);
+ Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
+ expected_num_bytes);
+
+ HWY_ASSERT(false);
+ }
+ }
+ // Any partial bits in the last byte must be zero
+ for (; i < 8 * bytes_written; ++i) {
+ const int bit = (actual[i / 8] & (1 << (i % 8)));
+ if (bit != 0) {
+ fprintf(stderr, "%s: bit #%" PRIu64 " should be zero\n",
+ TypeName(T(), N).c_str(), static_cast<uint64_t>(i));
+ Print(di, "bools", bools, 0, N);
+ Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
+ expected_num_bytes);
+ Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
+ expected_num_bytes);
+
+ HWY_ASSERT(false);
+ }
+ }
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllStoreMaskBits() {
+ ForAllTypes(ForPartialVectors<TestStoreMaskBits>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyMaskTest);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskedLoad);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllBlendedStore);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllStoreMaskBits);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/mask_test.cc b/third_party/highway/hwy/tests/mask_test.cc
new file mode 100644
index 0000000000..cf0d2d4ee8
--- /dev/null
+++ b/third_party/highway/hwy/tests/mask_test.cc
@@ -0,0 +1,295 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h> // memcmp
+
+#include <algorithm> // std::fill
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/mask_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// All types.
+struct TestFromVec {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto lanes = AllocateAligned<T>(N);
+
+ memset(lanes.get(), 0, N * sizeof(T));
+ const auto actual_false = MaskFromVec(Load(d, lanes.get()));
+ HWY_ASSERT_MASK_EQ(d, MaskFalse(d), actual_false);
+
+ memset(lanes.get(), 0xFF, N * sizeof(T));
+ const auto actual_true = MaskFromVec(Load(d, lanes.get()));
+ HWY_ASSERT_MASK_EQ(d, MaskTrue(d), actual_true);
+ }
+};
+
+HWY_NOINLINE void TestAllFromVec() {
+ ForAllTypes(ForPartialVectors<TestFromVec>());
+}
+
+struct TestFirstN {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto bool_lanes = AllocateAligned<T>(N);
+
+ using TN = SignedFromSize<HWY_MIN(sizeof(size_t), sizeof(T))>;
+ const size_t max_len = static_cast<size_t>(LimitsMax<TN>());
+
+ const size_t max_lanes = HWY_MIN(2 * N, AdjustedReps(512));
+ for (size_t len = 0; len <= HWY_MIN(max_lanes, max_len); ++len) {
+ // Loop instead of Iota+Lt to avoid wraparound for 8-bit T.
+ for (size_t i = 0; i < N; ++i) {
+ bool_lanes[i] = (i < len) ? T{1} : 0;
+ }
+ const auto expected = Eq(Load(d, bool_lanes.get()), Set(d, T{1}));
+ HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, len));
+ }
+
+ // Also ensure huge values yield all-true (unless the vector is actually
+ // larger than max_len).
+ for (size_t i = 0; i < N; ++i) {
+ bool_lanes[i] = (i < max_len) ? T{1} : 0;
+ }
+ const auto expected = Eq(Load(d, bool_lanes.get()), Set(d, T{1}));
+ HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, max_len));
+ }
+};
+
+HWY_NOINLINE void TestAllFirstN() {
+ ForAllTypes(ForPartialVectors<TestFirstN>());
+}
+
+struct TestMaskVec {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ RandomState rng;
+
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(d);
+ auto bool_lanes = AllocateAligned<TI>(N);
+
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
+ }
+
+ const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+ HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask)));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllMaskVec() {
+ const ForPartialVectors<TestMaskVec> test;
+
+ test(uint16_t());
+ test(int16_t());
+ // TODO(janwas): float16_t - cannot compare yet
+
+ ForUIF3264(test);
+}
+
+struct TestAllTrueFalse {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto zero = Zero(d);
+ auto v = zero;
+
+ const size_t N = Lanes(d);
+ auto lanes = AllocateAligned<T>(N);
+ std::fill(lanes.get(), lanes.get() + N, T(0));
+
+ HWY_ASSERT(AllTrue(d, Eq(v, zero)));
+ HWY_ASSERT(!AllFalse(d, Eq(v, zero)));
+
+ // Single lane implies AllFalse = !AllTrue. Otherwise, there are multiple
+ // lanes and one is nonzero.
+ const bool expected_all_false = (N != 1);
+
+ // Set each lane to nonzero and back to zero
+ for (size_t i = 0; i < N; ++i) {
+ lanes[i] = T(1);
+ v = Load(d, lanes.get());
+
+ HWY_ASSERT(!AllTrue(d, Eq(v, zero)));
+
+ HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero)));
+
+ lanes[i] = T(-1);
+ v = Load(d, lanes.get());
+ HWY_ASSERT(!AllTrue(d, Eq(v, zero)));
+ HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero)));
+
+ // Reset to all zero
+ lanes[i] = T(0);
+ v = Load(d, lanes.get());
+ HWY_ASSERT(AllTrue(d, Eq(v, zero)));
+ HWY_ASSERT(!AllFalse(d, Eq(v, zero)));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllAllTrueFalse() {
+ ForAllTypes(ForPartialVectors<TestAllTrueFalse>());
+}
+
+struct TestCountTrue {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(di);
+ auto bool_lanes = AllocateAligned<TI>(N);
+ memset(bool_lanes.get(), 0, N * sizeof(TI));
+
+ // For all combinations of zero/nonzero state of subset of lanes:
+ const size_t max_lanes = HWY_MIN(N, size_t(10));
+
+ for (size_t code = 0; code < (1ull << max_lanes); ++code) {
+      // Expected count = number of mask lanes that are true.
+ size_t expected = 0;
+ for (size_t i = 0; i < max_lanes; ++i) {
+ const bool is_true = (code & (1ull << i)) != 0;
+ bool_lanes[i] = is_true ? TI(1) : TI(0);
+ expected += is_true;
+ }
+
+ const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+ const size_t actual = CountTrue(d, mask);
+ HWY_ASSERT_EQ(expected, actual);
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllCountTrue() {
+ ForAllTypes(ForPartialVectors<TestCountTrue>());
+}
+
+struct TestFindFirstTrue { // Also FindKnownFirstTrue
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(di);
+ auto bool_lanes = AllocateAligned<TI>(N);
+ memset(bool_lanes.get(), 0, N * sizeof(TI));
+
+ // For all combinations of zero/nonzero state of subset of lanes:
+ const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(9)));
+
+ HWY_ASSERT_EQ(intptr_t(-1), FindFirstTrue(d, MaskFalse(d)));
+ HWY_ASSERT_EQ(intptr_t(0), FindFirstTrue(d, MaskTrue(d)));
+ HWY_ASSERT_EQ(size_t(0), FindKnownFirstTrue(d, MaskTrue(d)));
+
+ for (size_t code = 1; code < (1ull << max_lanes); ++code) {
+ for (size_t i = 0; i < max_lanes; ++i) {
+ bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);
+ }
+
+ const size_t expected =
+ Num0BitsBelowLS1Bit_Nonzero32(static_cast<uint32_t>(code));
+ const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+ HWY_ASSERT_EQ(static_cast<intptr_t>(expected), FindFirstTrue(d, mask));
+ HWY_ASSERT_EQ(expected, FindKnownFirstTrue(d, mask));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllFindFirstTrue() {
+ ForAllTypes(ForPartialVectors<TestFindFirstTrue>());
+}
+
+struct TestLogicalMask {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto m0 = MaskFalse(d);
+ const auto m_all = MaskTrue(d);
+
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(di);
+ auto bool_lanes = AllocateAligned<TI>(N);
+ memset(bool_lanes.get(), 0, N * sizeof(TI));
+
+ HWY_ASSERT_MASK_EQ(d, m0, Not(m_all));
+ HWY_ASSERT_MASK_EQ(d, m_all, Not(m0));
+
+ Print(d, ".", VecFromMask(d, ExclusiveNeither(m0, m0)));
+ HWY_ASSERT_MASK_EQ(d, m_all, ExclusiveNeither(m0, m0));
+ HWY_ASSERT_MASK_EQ(d, m0, ExclusiveNeither(m_all, m0));
+ HWY_ASSERT_MASK_EQ(d, m0, ExclusiveNeither(m0, m_all));
+
+ // For all combinations of zero/nonzero state of subset of lanes:
+ const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6)));
+ for (size_t code = 0; code < (1ull << max_lanes); ++code) {
+ for (size_t i = 0; i < max_lanes; ++i) {
+ bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);
+ }
+
+ const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+
+ HWY_ASSERT_MASK_EQ(d, m0, Xor(m, m));
+ HWY_ASSERT_MASK_EQ(d, m0, AndNot(m, m));
+ HWY_ASSERT_MASK_EQ(d, m0, AndNot(m_all, m));
+
+ HWY_ASSERT_MASK_EQ(d, m, Or(m, m));
+ HWY_ASSERT_MASK_EQ(d, m, Or(m0, m));
+ HWY_ASSERT_MASK_EQ(d, m, Or(m, m0));
+ HWY_ASSERT_MASK_EQ(d, m, Xor(m0, m));
+ HWY_ASSERT_MASK_EQ(d, m, Xor(m, m0));
+ HWY_ASSERT_MASK_EQ(d, m, And(m, m));
+ HWY_ASSERT_MASK_EQ(d, m, And(m_all, m));
+ HWY_ASSERT_MASK_EQ(d, m, And(m, m_all));
+ HWY_ASSERT_MASK_EQ(d, m, AndNot(m0, m));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllLogicalMask() {
+ ForAllTypes(ForPartialVectors<TestLogicalMask>());
+}
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyMaskTest);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFromVec);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFirstN);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskVec);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllAllTrueFalse);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllCountTrue);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindFirstTrue);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllLogicalMask);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/memory_test.cc b/third_party/highway/hwy/tests/memory_test.cc
new file mode 100644
index 0000000000..d17addf544
--- /dev/null
+++ b/third_party/highway/hwy/tests/memory_test.cc
@@ -0,0 +1,343 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Ensure incompatibilities with Windows macros (e.g. #define StoreFence) are
+// detected. Must come before Highway headers.
+#include "hwy/base.h"
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm> // std::fill
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/memory_test.cc"
+#include "hwy/cache_control.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestLoadStore {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ const auto hi = Iota(d, static_cast<T>(1 + N));
+ const auto lo = Iota(d, 1);
+ auto lanes = AllocateAligned<T>(2 * N);
+ Store(hi, d, &lanes[N]);
+ Store(lo, d, &lanes[0]);
+
+ // Aligned load
+ const auto lo2 = Load(d, &lanes[0]);
+ HWY_ASSERT_VEC_EQ(d, lo2, lo);
+
+ // Aligned store
+ auto lanes2 = AllocateAligned<T>(2 * N);
+ Store(lo2, d, &lanes2[0]);
+ Store(hi, d, &lanes2[N]);
+ for (size_t i = 0; i < 2 * N; ++i) {
+ HWY_ASSERT_EQ(lanes[i], lanes2[i]);
+ }
+
+ // Unaligned load
+ const auto vu = LoadU(d, &lanes[1]);
+ auto lanes3 = AllocateAligned<T>(N);
+ Store(vu, d, lanes3.get());
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT_EQ(T(i + 2), lanes3[i]);
+ }
+
+ // Unaligned store
+ StoreU(lo2, d, &lanes2[N / 2]);
+ size_t i = 0;
+ for (; i < N / 2; ++i) {
+ HWY_ASSERT_EQ(lanes[i], lanes2[i]);
+ }
+ for (; i < 3 * N / 2; ++i) {
+ HWY_ASSERT_EQ(T(i - N / 2 + 1), lanes2[i]);
+ }
+ // Subsequent values remain unchanged.
+ for (; i < 2 * N; ++i) {
+ HWY_ASSERT_EQ(T(i + 1), lanes2[i]);
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllLoadStore() {
+ ForAllTypes(ForPartialVectors<TestLoadStore>());
+}
+
+struct TestSafeCopyN {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ const auto v = Iota(d, 1);
+ auto from = AllocateAligned<T>(N + 2);
+ auto to = AllocateAligned<T>(N + 2);
+ Store(v, d, from.get());
+
+ // 0: nothing changes
+ to[0] = T();
+ SafeCopyN(0, d, from.get(), to.get());
+ HWY_ASSERT_EQ(T(), to[0]);
+
+ // 1: only first changes
+ to[1] = T();
+ SafeCopyN(1, d, from.get(), to.get());
+ HWY_ASSERT_EQ(static_cast<T>(1), to[0]);
+ HWY_ASSERT_EQ(T(), to[1]);
+
+ // N-1: last does not change
+ to[N - 1] = T();
+ SafeCopyN(N - 1, d, from.get(), to.get());
+ HWY_ASSERT_EQ(T(), to[N - 1]);
+ // Also check preceding lanes
+ to[N - 1] = static_cast<T>(N);
+ HWY_ASSERT_VEC_EQ(d, to.get(), v);
+
+ // N: all change
+ to[N] = T();
+ SafeCopyN(N, d, from.get(), to.get());
+ HWY_ASSERT_VEC_EQ(d, to.get(), v);
+ HWY_ASSERT_EQ(T(), to[N]);
+
+ // N+1: subsequent lane does not change if using masked store
+ to[N + 1] = T();
+ SafeCopyN(N + 1, d, from.get(), to.get());
+ HWY_ASSERT_VEC_EQ(d, to.get(), v);
+#if !HWY_MEM_OPS_MIGHT_FAULT
+ HWY_ASSERT_EQ(T(), to[N + 1]);
+#endif
+ }
+};
+
+HWY_NOINLINE void TestAllSafeCopyN() {
+ ForAllTypes(ForPartialVectors<TestSafeCopyN>());
+}
+
+struct TestLoadDup128 {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ // Scalar does not define LoadDup128.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+ constexpr size_t N128 = 16 / sizeof(T);
+ alignas(16) T lanes[N128];
+ for (size_t i = 0; i < N128; ++i) {
+ lanes[i] = static_cast<T>(1 + i);
+ }
+
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = static_cast<T>(i % N128 + 1);
+ }
+
+ HWY_ASSERT_VEC_EQ(d, expected.get(), LoadDup128(d, lanes));
+#else
+ (void)d;
+#endif
+ }
+};
+
+HWY_NOINLINE void TestAllLoadDup128() {
+ ForAllTypes(ForGEVectors<128, TestLoadDup128>());
+}
+
+struct TestStream {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v = Iota(d, T(1));
+ const size_t affected_bytes =
+ (Lanes(d) * sizeof(T) + HWY_STREAM_MULTIPLE - 1) &
+ ~size_t(HWY_STREAM_MULTIPLE - 1);
+ const size_t affected_lanes = affected_bytes / sizeof(T);
+ auto out = AllocateAligned<T>(2 * affected_lanes);
+ std::fill(out.get(), out.get() + 2 * affected_lanes, T(0));
+
+ Stream(v, d, out.get());
+ FlushStream();
+ const auto actual = Load(d, out.get());
+ HWY_ASSERT_VEC_EQ(d, v, actual);
+ // Ensure Stream didn't modify more memory than expected
+ for (size_t i = affected_lanes; i < 2 * affected_lanes; ++i) {
+ HWY_ASSERT_EQ(T(0), out[i]);
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllStream() {
+ const ForPartialVectors<TestStream> test;
+ // No u8,u16.
+ test(uint32_t());
+ test(uint64_t());
+ // No i8,i16.
+ test(int32_t());
+ test(int64_t());
+ ForFloatTypes(test);
+}
+
+// Assumes little-endian byte order!
+struct TestScatter {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using Offset = MakeSigned<T>;
+
+ const size_t N = Lanes(d);
+ const size_t range = 4 * N; // number of items to scatter
+ const size_t max_bytes = range * sizeof(T); // upper bound on offset
+
+ RandomState rng;
+
+ // Data to be scattered
+ auto bytes = AllocateAligned<uint8_t>(max_bytes);
+ for (size_t i = 0; i < max_bytes; ++i) {
+ bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
+ }
+ const auto data = Load(d, reinterpret_cast<const T*>(bytes.get()));
+
+ // Scatter into these regions, ensure vector results match scalar
+ auto expected = AllocateAligned<T>(range);
+ auto actual = AllocateAligned<T>(range);
+
+ const Rebind<Offset, D> d_offsets;
+ auto offsets = AllocateAligned<Offset>(N); // or indices
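+    // Note: ScatterOffset takes byte offsets (multiples of sizeof(T) here),
+    // whereas ScatterIndex takes element indices, so element index 3 is
+    // equivalent to byte offset 3 * sizeof(T).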
+
+ for (size_t rep = 0; rep < 100; ++rep) {
+ // Byte offsets
+ std::fill(expected.get(), expected.get() + range, T(0));
+ std::fill(actual.get(), actual.get() + range, T(0));
+ for (size_t i = 0; i < N; ++i) {
+ // Must be aligned
+ offsets[i] = static_cast<Offset>((Random32(&rng) % range) * sizeof(T));
+ CopyBytes<sizeof(T)>(
+ bytes.get() + i * sizeof(T),
+ reinterpret_cast<uint8_t*>(expected.get()) + offsets[i]);
+ }
+ const auto voffsets = Load(d_offsets, offsets.get());
+ ScatterOffset(data, d, actual.get(), voffsets);
+ if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
+ Print(d, "Data", data);
+ Print(d_offsets, "Offsets", voffsets);
+ HWY_ASSERT(false);
+ }
+
+ // Indices
+ std::fill(expected.get(), expected.get() + range, T(0));
+ std::fill(actual.get(), actual.get() + range, T(0));
+ for (size_t i = 0; i < N; ++i) {
+ offsets[i] = static_cast<Offset>(Random32(&rng) % range);
+ CopyBytes<sizeof(T)>(bytes.get() + i * sizeof(T),
+ &expected[size_t(offsets[i])]);
+ }
+ const auto vindices = Load(d_offsets, offsets.get());
+ ScatterIndex(data, d, actual.get(), vindices);
+ if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
+ Print(d, "Data", data);
+ Print(d_offsets, "Indices", vindices);
+ HWY_ASSERT(false);
+ }
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllScatter() {
+ ForUIF3264(ForPartialVectors<TestScatter>());
+}
+
+struct TestGather {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using Offset = MakeSigned<T>;
+
+ const size_t N = Lanes(d);
+ const size_t range = 4 * N; // number of items to gather
+ const size_t max_bytes = range * sizeof(T); // upper bound on offset
+
+ RandomState rng;
+
+ // Data to be gathered from
+ auto bytes = AllocateAligned<uint8_t>(max_bytes);
+ for (size_t i = 0; i < max_bytes; ++i) {
+ bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
+ }
+
+ auto expected = AllocateAligned<T>(N);
+ auto offsets = AllocateAligned<Offset>(N);
+ auto indices = AllocateAligned<Offset>(N);
+
+ for (size_t rep = 0; rep < 100; ++rep) {
+ // Offsets
+ for (size_t i = 0; i < N; ++i) {
+ // Must be aligned
+ offsets[i] = static_cast<Offset>((Random32(&rng) % range) * sizeof(T));
+ CopyBytes<sizeof(T)>(bytes.get() + offsets[i], &expected[i]);
+ }
+
+ const Rebind<Offset, D> d_offset;
+ const T* base = reinterpret_cast<const T*>(bytes.get());
+ auto actual = GatherOffset(d, base, Load(d_offset, offsets.get()));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
+
+ // Indices
+ for (size_t i = 0; i < N; ++i) {
+ indices[i] =
+ static_cast<Offset>(Random32(&rng) % (max_bytes / sizeof(T)));
+ CopyBytes<sizeof(T)>(base + indices[i], &expected[i]);
+ }
+ actual = GatherIndex(d, base, Load(d_offset, indices.get()));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllGather() {
+ ForUIF3264(ForPartialVectors<TestGather>());
+}
+
+HWY_NOINLINE void TestAllCache() {
+ LoadFence();
+ FlushStream();
+ int test = 0;
+ Prefetch(&test);
+ FlushCacheline(&test);
+ Pause();
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyMemoryTest);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllSafeCopyN);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/mul_test.cc b/third_party/highway/hwy/tests/mul_test.cc
new file mode 100644
index 0000000000..5622983cee
--- /dev/null
+++ b/third_party/highway/hwy/tests/mul_test.cc
@@ -0,0 +1,526 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/mul_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <size_t kBits>
+constexpr uint64_t FirstBits() {
+ return (1ull << kBits) - 1;
+}
+template <>
+constexpr uint64_t FirstBits<64>() {
+ return ~uint64_t{0};
+}
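+// E.g. FirstBits<8>() == 0xFF and FirstBits<16>() == 0xFFFF; the explicit
+// specialization avoids the undefined shift by 64 in the general case.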
+
+struct TestUnsignedMul {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ const auto v1 = Set(d, T(1));
+ const auto vi = Iota(d, 1);
+ const auto vj = Iota(d, 3);
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
+
+ HWY_ASSERT_VEC_EQ(d, v0, Mul(v0, v0));
+ HWY_ASSERT_VEC_EQ(d, v1, Mul(v1, v1));
+ HWY_ASSERT_VEC_EQ(d, vi, Mul(v1, vi));
+ HWY_ASSERT_VEC_EQ(d, vi, Mul(vi, v1));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = static_cast<T>((1 + i) * (1 + i));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = static_cast<T>((1 + i) * (3 + i));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vj));
+
+ const T max = LimitsMax<T>();
+ const auto vmax = Set(d, max);
+ HWY_ASSERT_VEC_EQ(d, vmax, Mul(vmax, v1));
+ HWY_ASSERT_VEC_EQ(d, vmax, Mul(v1, vmax));
+
+ constexpr uint64_t kMask = FirstBits<sizeof(T) * 8>();
+ const T max2 = (static_cast<uint64_t>(max) * max) & kMask;
+ HWY_ASSERT_VEC_EQ(d, Set(d, max2), Mul(vmax, vmax));
+ }
+};
+
+struct TestSignedMul {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
+
+ const auto v0 = Zero(d);
+ const auto v1 = Set(d, T(1));
+ const auto vi = Iota(d, 1);
+ const auto vn = Iota(d, -T(N)); // no i8 supported, so no wraparound
+ HWY_ASSERT_VEC_EQ(d, v0, Mul(v0, v0));
+ HWY_ASSERT_VEC_EQ(d, v1, Mul(v1, v1));
+ HWY_ASSERT_VEC_EQ(d, vi, Mul(v1, vi));
+ HWY_ASSERT_VEC_EQ(d, vi, Mul(vi, v1));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = static_cast<T>((1 + i) * (1 + i));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = static_cast<T>((-T(N) + T(i)) * T(1u + i));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vn, vi));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vn));
+ }
+};
+
+struct TestMulOverflow {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto vMax = Set(d, LimitsMax<T>());
+ HWY_ASSERT_VEC_EQ(d, Mul(vMax, vMax), Mul(vMax, vMax));
+ }
+};
+
+struct TestDivOverflow {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto vZero = Set(d, T(0));
+ const auto v1 = Set(d, T(1));
+ HWY_ASSERT_VEC_EQ(d, Div(v1, vZero), Div(v1, vZero));
+ }
+};
+
+HWY_NOINLINE void TestAllMul() {
+ const ForPartialVectors<TestUnsignedMul> test_unsigned;
+ // No u8.
+ test_unsigned(uint16_t());
+ test_unsigned(uint32_t());
+ test_unsigned(uint64_t());
+
+ const ForPartialVectors<TestSignedMul> test_signed;
+ // No i8.
+ test_signed(int16_t());
+ test_signed(int32_t());
+ test_signed(int64_t());
+
+ const ForPartialVectors<TestMulOverflow> test_mul_overflow;
+ test_mul_overflow(int16_t());
+ test_mul_overflow(int32_t());
+#if HWY_HAVE_INTEGER64
+ test_mul_overflow(int64_t());
+#endif
+
+ const ForPartialVectors<TestDivOverflow> test_div_overflow;
+ test_div_overflow(float());
+#if HWY_HAVE_FLOAT64
+ test_div_overflow(double());
+#endif
+}
+
+struct TestMulHigh {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using Wide = MakeWide<T>;
+ const size_t N = Lanes(d);
+ auto in_lanes = AllocateAligned<T>(N);
+ auto expected_lanes = AllocateAligned<T>(N);
+
+ const auto vi = Iota(d, 1);
+ // no i8 supported, so no wraparound
+ const auto vni = Iota(d, T(static_cast<T>(~N + 1)));
+
+ const auto v0 = Zero(d);
+ HWY_ASSERT_VEC_EQ(d, v0, MulHigh(v0, v0));
+ HWY_ASSERT_VEC_EQ(d, v0, MulHigh(v0, vi));
+ HWY_ASSERT_VEC_EQ(d, v0, MulHigh(vi, v0));
+
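+    // Only 16-bit lanes are exercised (see TestAllMulHigh), hence the fixed
+    // ">> 16" below extracts the upper half of the widened product.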
+ // Large positive squared
+ for (size_t i = 0; i < N; ++i) {
+ in_lanes[i] = T(LimitsMax<T>() >> i);
+ expected_lanes[i] = T((Wide(in_lanes[i]) * in_lanes[i]) >> 16);
+ }
+ auto v = Load(d, in_lanes.get());
+ HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, v));
+
+ // Large positive * small positive
+ for (size_t i = 0; i < N; ++i) {
+ expected_lanes[i] = T((Wide(in_lanes[i]) * T(1u + i)) >> 16);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vi));
+ HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vi, v));
+
+ // Large positive * small negative
+ for (size_t i = 0; i < N; ++i) {
+ expected_lanes[i] = T((Wide(in_lanes[i]) * T(i - N)) >> 16);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vni));
+ HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vni, v));
+ }
+};
+
+HWY_NOINLINE void TestAllMulHigh() {
+ ForPartialVectors<TestMulHigh> test;
+ test(int16_t());
+ test(uint16_t());
+}
+
+struct TestMulFixedPoint15 {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ HWY_ASSERT_VEC_EQ(d, v0, MulFixedPoint15(v0, v0));
+ HWY_ASSERT_VEC_EQ(d, v0, MulFixedPoint15(v0, v0));
+
+ const size_t N = Lanes(d);
+ auto in1 = AllocateAligned<T>(N);
+ auto in2 = AllocateAligned<T>(N);
+ auto expected = AllocateAligned<T>(N);
+
+ // Random inputs in each lane
+ RandomState rng;
+ for (size_t rep = 0; rep < AdjustedReps(10000); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ in1[i] = static_cast<T>(Random64(&rng) & 0xFFFF);
+ in2[i] = static_cast<T>(Random64(&rng) & 0xFFFF);
+ }
+
+ for (size_t i = 0; i < N; ++i) {
+        // There are three ways to compute the result. x86 and ARM define it
+        // in terms of the full 32-bit product:
+ const int arm = (2 * in1[i] * in2[i] + 0x8000) >> 16;
+ const int x86 = (((in1[i] * in2[i]) >> 14) + 1) >> 1;
+ // On other platforms, split the result into upper and lower 16 bits.
+ const auto v1 = Set(d, in1[i]);
+ const auto v2 = Set(d, in2[i]);
+ const int hi = GetLane(MulHigh(v1, v2));
+ const int lo = GetLane(Mul(v1, v2)) & 0xFFFF;
+ const int split = 2 * hi + ((lo + 0x4000) >> 15);
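+        // E.g. in1 = in2 = 0x4000 (0.5 in Q15): arm = 0x20008000 >> 16,
+        // x86 = (0x4000 + 1) >> 1, split = 2 * 0x1000 + 0; all equal
+        // 0x2000 (0.25).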
+ expected[i] = static_cast<T>(arm);
+ if (in1[i] != -32768 || in2[i] != -32768) {
+ HWY_ASSERT_EQ(arm, x86);
+ HWY_ASSERT_EQ(arm, split);
+ }
+ }
+
+ const auto a = Load(d, in1.get());
+ const auto b = Load(d, in2.get());
+ HWY_ASSERT_VEC_EQ(d, expected.get(), MulFixedPoint15(a, b));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllMulFixedPoint15() {
+ ForPartialVectors<TestMulFixedPoint15>()(int16_t());
+}
+
+struct TestMulEven {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using Wide = MakeWide<T>;
+ const Repartition<Wide, D> d2;
+ const auto v0 = Zero(d);
+ HWY_ASSERT_VEC_EQ(d2, Zero(d2), MulEven(v0, v0));
+
+ const size_t N = Lanes(d);
+ auto in_lanes = AllocateAligned<T>(N);
+ auto expected = AllocateAligned<Wide>(Lanes(d2));
+ for (size_t i = 0; i < N; i += 2) {
+ in_lanes[i + 0] = LimitsMax<T>() >> i;
+ if (N != 1) {
+ in_lanes[i + 1] = 1; // unused
+ }
+ expected[i / 2] = Wide(in_lanes[i + 0]) * in_lanes[i + 0];
+ }
+
+ const auto v = Load(d, in_lanes.get());
+ HWY_ASSERT_VEC_EQ(d2, expected.get(), MulEven(v, v));
+ }
+};
+
+struct TestMulEvenOdd64 {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+ const auto v0 = Zero(d);
+ HWY_ASSERT_VEC_EQ(d, Zero(d), MulEven(v0, v0));
+ HWY_ASSERT_VEC_EQ(d, Zero(d), MulOdd(v0, v0));
+
+ const size_t N = Lanes(d);
+ if (N == 1) return;
+
+ auto in1 = AllocateAligned<T>(N);
+ auto in2 = AllocateAligned<T>(N);
+ auto expected_even = AllocateAligned<T>(N);
+ auto expected_odd = AllocateAligned<T>(N);
+
+ // Random inputs in each lane
+ RandomState rng;
+ for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ in1[i] = Random64(&rng);
+ in2[i] = Random64(&rng);
+ }
+
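+      // Mul128 returns the lower 64 bits and writes the upper 64 bits via
+      // the pointer, matching the adjacent-lane layout of MulEven/MulOdd.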
+ for (size_t i = 0; i < N; i += 2) {
+ expected_even[i] = Mul128(in1[i], in2[i], &expected_even[i + 1]);
+ expected_odd[i] = Mul128(in1[i + 1], in2[i + 1], &expected_odd[i + 1]);
+ }
+
+ const auto a = Load(d, in1.get());
+ const auto b = Load(d, in2.get());
+ HWY_ASSERT_VEC_EQ(d, expected_even.get(), MulEven(a, b));
+ HWY_ASSERT_VEC_EQ(d, expected_odd.get(), MulOdd(a, b));
+ }
+#else
+ (void)d;
+#endif // HWY_TARGET != HWY_SCALAR
+ }
+};
+
+HWY_NOINLINE void TestAllMulEven() {
+ ForGEVectors<64, TestMulEven> test;
+ test(int32_t());
+ test(uint32_t());
+
+ ForGEVectors<128, TestMulEvenOdd64>()(uint64_t());
+}
+
+#ifndef HWY_NATIVE_FMA
+#error "Bug in set_macros-inl.h, did not set HWY_NATIVE_FMA"
+#endif
+
+struct TestMulAdd {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto k0 = Zero(d);
+ const auto kNeg0 = Set(d, T(-0.0));
+ const auto v1 = Iota(d, 1);
+ const auto v2 = Iota(d, 2);
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
+ HWY_ASSERT_VEC_EQ(d, k0, MulAdd(k0, k0, k0));
+ HWY_ASSERT_VEC_EQ(d, v2, MulAdd(k0, v1, v2));
+ HWY_ASSERT_VEC_EQ(d, v2, MulAdd(v1, k0, v2));
+ HWY_ASSERT_VEC_EQ(d, k0, NegMulAdd(k0, k0, k0));
+ HWY_ASSERT_VEC_EQ(d, v2, NegMulAdd(k0, v1, v2));
+ HWY_ASSERT_VEC_EQ(d, v2, NegMulAdd(v1, k0, v2));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = static_cast<T>((i + 1) * (i + 2));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v2, v1, k0));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v1, v2, k0));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(Neg(v2), v1, k0));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(v1, Neg(v2), k0));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = static_cast<T>((i + 2) * (i + 2) + (i + 1));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v2, v2, v1));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(Neg(v2), v2, v1));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] =
+ T(-T(i + 2u) * static_cast<T>(i + 2) + static_cast<T>(1 + i));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(v2, v2, v1));
+
+ HWY_ASSERT_VEC_EQ(d, k0, MulSub(k0, k0, k0));
+ HWY_ASSERT_VEC_EQ(d, kNeg0, NegMulSub(k0, k0, k0));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = -T(i + 2);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(k0, v1, v2));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v1, k0, v2));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(k0), v1, v2));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(v1, Neg(k0), v2));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = static_cast<T>((i + 1) * (i + 2));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v1, v2, k0));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v2, v1, k0));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(v1), v2, k0));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(v2, Neg(v1), k0));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = static_cast<T>((i + 2) * (i + 2) - (1 + i));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v2, v2, v1));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(v2), v2, v1));
+ }
+};
+
+HWY_NOINLINE void TestAllMulAdd() {
+ ForFloatTypes(ForPartialVectors<TestMulAdd>());
+}
+
+struct TestReorderWidenMulAccumulate {
+ template <typename TN, class DN>
+ HWY_NOINLINE void operator()(TN /*unused*/, DN dn) {
+ using TW = MakeWide<TN>;
+ const RepartitionToWide<DN> dw;
+ const Half<DN> dnh;
+ using VW = Vec<decltype(dw)>;
+ using VN = Vec<decltype(dn)>;
+ const size_t NN = Lanes(dn);
+
+ const VW f0 = Zero(dw);
+ const VW f1 = Set(dw, TW{1});
+ const VN bf0 = Zero(dn);
+ // Cannot Set() bfloat16_t directly.
+ const VN bf1 = ReorderDemote2To(dn, f1, f1);
+
+ // Any input zero => both outputs zero
+ VW sum1 = f0;
+ HWY_ASSERT_VEC_EQ(dw, f0,
+ ReorderWidenMulAccumulate(dw, bf0, bf0, f0, sum1));
+ HWY_ASSERT_VEC_EQ(dw, f0, sum1);
+ HWY_ASSERT_VEC_EQ(dw, f0,
+ ReorderWidenMulAccumulate(dw, bf0, bf1, f0, sum1));
+ HWY_ASSERT_VEC_EQ(dw, f0, sum1);
+ HWY_ASSERT_VEC_EQ(dw, f0,
+ ReorderWidenMulAccumulate(dw, bf1, bf0, f0, sum1));
+ HWY_ASSERT_VEC_EQ(dw, f0, sum1);
+
+ // delta[p] := 1, all others zero. For each p: Dot(delta, all-ones) == 1.
+ auto delta_w = AllocateAligned<TW>(NN);
+ for (size_t p = 0; p < NN; ++p) {
+ // Workaround for incorrect Clang wasm codegen: re-initialize the entire
+ // array rather than zero-initialize once and then toggle lane p.
+ for (size_t i = 0; i < NN; ++i) {
+ delta_w[i] = static_cast<TW>(i == p);
+ }
+ const VW delta0 = Load(dw, delta_w.get());
+ const VW delta1 = Load(dw, delta_w.get() + NN / 2);
+ const VN delta = ReorderDemote2To(dn, delta0, delta1);
+
+ {
+ sum1 = f0;
+ const VW sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, f0, sum1);
+ HWY_ASSERT_EQ(TW{1}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
+ }
+ // Swapped arg order
+ {
+ sum1 = f0;
+ const VW sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, f0, sum1);
+ HWY_ASSERT_EQ(TW{1}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
+ }
+ // Start with nonzero sum0 or sum1
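+      // The initial sums contribute the single nonzero lane of delta (1) and
+      // the dot product adds another 1, so the total reduction is 2.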
+ {
+ VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta));
+ sum1 = PromoteTo(dw, UpperHalf(dnh, delta));
+ sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, sum0, sum1);
+ HWY_ASSERT_EQ(TW{2}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
+ }
+ // Start with nonzero sum0 or sum1, and swap arg order
+ {
+ VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta));
+ sum1 = PromoteTo(dw, UpperHalf(dnh, delta));
+ sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, sum0, sum1);
+ HWY_ASSERT_EQ(TW{2}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
+ }
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllReorderWidenMulAccumulate() {
+ ForShrinkableVectors<TestReorderWidenMulAccumulate>()(bfloat16_t());
+ ForShrinkableVectors<TestReorderWidenMulAccumulate>()(int16_t());
+}
+
+struct TestRearrangeToOddPlusEven {
+ template <typename TN, class DN>
+ HWY_NOINLINE void operator()(TN /*unused*/, DN dn) {
+ using TW = MakeWide<TN>;
+ const RebindToUnsigned<DN> du;
+ const RepartitionToWide<DN> dw;
+ const Half<DN> dnh;
+ const RebindToUnsigned<decltype(dnh)> duh;
+ using VW = Vec<decltype(dw)>;
+ using VN = Vec<decltype(dn)>;
+ const size_t NW = Lanes(dw);
+
+ const VW up0 = Iota(dw, TW{1});
+ const VW up1 = Iota(dw, static_cast<TW>(1 + NW));
+ // We will compute i * (N-i) to avoid per-lane overflow.
+ const VW down0 = Reverse(dw, up1);
+ const VW down1 = Reverse(dw, up0);
+
+ // Combine is not available for bf16, so cast to u16.
+ const auto a0 = BitCast(duh, DemoteTo(dnh, up0));
+ const auto a1 = BitCast(duh, DemoteTo(dnh, up1));
+ const VN a = BitCast(dn, Combine(du, a1, a0));
+ const auto b0 = BitCast(duh, DemoteTo(dnh, down0));
+ const auto b1 = BitCast(duh, DemoteTo(dnh, down1));
+ const VN b = BitCast(dn, Combine(du, b1, b0));
+
+ const auto expected = AllocateAligned<TW>(NW);
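+    // Wide lane iw accumulates a[2*iw]*b[2*iw] + a[2*iw+1]*b[2*iw+1], where
+    // a ascends from 1 and b descends from 2*NW.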
+ for (size_t iw = 0; iw < NW; ++iw) {
+ const size_t in = iw * 2; // even, odd is +1
+ const size_t a0 = 1 + in;
+ const size_t b0 = 1 + 2 * NW - a0;
+ const size_t a1 = a0 + 1;
+ const size_t b1 = b0 - 1;
+ expected[iw] = static_cast<TW>(a0 * b0 + a1 * b1);
+ }
+
+ VW sum1 = Zero(dw);
+ const VW sum0 = ReorderWidenMulAccumulate(dw, a, b, Zero(dw), sum1);
+ const VW sum_odd_even = RearrangeToOddPlusEven(sum0, sum1);
+ HWY_ASSERT_VEC_EQ(dw, expected.get(), sum_odd_even);
+ }
+};
+
+HWY_NOINLINE void TestAllRearrangeToOddPlusEven() {
+ ForShrinkableVectors<TestRearrangeToOddPlusEven>()(bfloat16_t());
+ ForShrinkableVectors<TestRearrangeToOddPlusEven>()(int16_t());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyMulTest);
+HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMul);
+HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulHigh);
+HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulFixedPoint15);
+HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulEven);
+HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulAdd);
+HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllReorderWidenMulAccumulate);
+HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllRearrangeToOddPlusEven);
+
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/reduction_test.cc b/third_party/highway/hwy/tests/reduction_test.cc
new file mode 100644
index 0000000000..5cc051ef1c
--- /dev/null
+++ b/third_party/highway/hwy/tests/reduction_test.cc
@@ -0,0 +1,261 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/reduction_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestSumOfLanes {
+ template <typename T, size_t N, int P,
+ hwy::EnableIf<!IsSigned<T>() || ((N & 1) != 0)>* = nullptr>
+ HWY_NOINLINE void SignedEvenLengthVectorTests(Simd<T, N, P>) {
+ // do nothing
+ }
+ template <typename T, size_t N, int P,
+ hwy::EnableIf<IsSigned<T>() && ((N & 1) == 0)>* = nullptr>
+ HWY_NOINLINE void SignedEvenLengthVectorTests(Simd<T, N, P> d) {
+ const T pairs = static_cast<T>(Lanes(d) / 2);
+
+ // Lanes are the repeated sequence -2, 1, [...]; each pair sums to -1,
+ // so the eventual total is just -(N/2).
+ Vec<decltype(d)> v =
+ InterleaveLower(Set(d, static_cast<T>(-2)), Set(d, T{1}));
+ HWY_ASSERT_VEC_EQ(d, Set(d, static_cast<T>(-pairs)), SumOfLanes(d, v));
+
+ // Similar test with a positive result.
+ v = InterleaveLower(Set(d, static_cast<T>(-2)), Set(d, T{4}));
+ HWY_ASSERT_VEC_EQ(d, Set(d, static_cast<T>(pairs * 2)), SumOfLanes(d, v));
+ }
+
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto in_lanes = AllocateAligned<T>(N);
+
+ // Lane i = bit i, higher lanes 0
+ double sum = 0.0;
+ // Avoid setting sign bit and cap at double precision
+ constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
+ for (size_t i = 0; i < N; ++i) {
+ in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
+ sum += static_cast<double>(in_lanes[i]);
+ }
+ HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)),
+ SumOfLanes(d, Load(d, in_lanes.get())));
+
+ // Lane i = i (iota) to include upper lanes
+ sum = 0.0;
+ for (size_t i = 0; i < N; ++i) {
+ sum += static_cast<double>(i);
+ }
+ HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(d, Iota(d, 0)));
+
+ // Run more tests only for signed types with even vector lengths. Some of
+ // this code may not otherwise compile, so put it in a templated function.
+ SignedEvenLengthVectorTests(d);
+ }
+};
+
+HWY_NOINLINE void TestAllSumOfLanes() {
+ ForUIF3264(ForPartialVectors<TestSumOfLanes>());
+ ForUI16(ForPartialVectors<TestSumOfLanes>());
+
+#if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_SSE4 || HWY_TARGET == HWY_SSSE3
+ ForUI8(ForGEVectors<64, TestSumOfLanes>());
+#endif
+}
+
+struct TestMinOfLanes {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto in_lanes = AllocateAligned<T>(N);
+
+ // Lane i = bit i, higher lanes = 2 (not the minimum)
+ T min = HighestValue<T>();
+ // Avoid setting sign bit and cap at double precision
+ constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
+ for (size_t i = 0; i < N; ++i) {
+ in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 2;
+ min = HWY_MIN(min, in_lanes[i]);
+ }
+ HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
+
+ // Lane i = N - i to include upper lanes
+ min = HighestValue<T>();
+ for (size_t i = 0; i < N; ++i) {
+ in_lanes[i] = static_cast<T>(N - i); // no 8-bit T so no wraparound
+ min = HWY_MIN(min, in_lanes[i]);
+ }
+ HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
+
+ // Bug #910: also check negative values
+ min = HighestValue<T>();
+ const T input_copy[] = {static_cast<T>(-1),
+ static_cast<T>(-2),
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 8,
+ 9,
+ 10,
+ 11,
+ 12,
+ 13,
+ 14};
+ size_t i = 0;
+ for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) {
+ in_lanes[i] = input_copy[i];
+ min = HWY_MIN(min, input_copy[i]);
+ }
+ // Pad with neutral element to full vector (so we can load)
+ for (; i < N; ++i) {
+ in_lanes[i] = min;
+ }
+ HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
+ }
+};
+
+struct TestMaxOfLanes {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto in_lanes = AllocateAligned<T>(N);
+
+ T max = LowestValue<T>();
+ // Avoid setting sign bit and cap at double precision
+ constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
+ for (size_t i = 0; i < N; ++i) {
+ in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
+ max = HWY_MAX(max, in_lanes[i]);
+ }
+ HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
+
+ // Lane i = i to include upper lanes
+ max = LowestValue<T>();
+ for (size_t i = 0; i < N; ++i) {
+ in_lanes[i] = static_cast<T>(i); // no 8-bit T so no wraparound
+ max = HWY_MAX(max, in_lanes[i]);
+ }
+ HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
+
+ // Bug #910: also check negative values
+ max = LowestValue<T>();
+ const T input_copy[] = {static_cast<T>(-1),
+ static_cast<T>(-2),
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 8,
+ 9,
+ 10,
+ 11,
+ 12,
+ 13,
+ 14};
+ size_t i = 0;
+ for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) {
+ in_lanes[i] = input_copy[i];
+ max = HWY_MAX(max, in_lanes[i]);
+ }
+ // Pad with neutral element to full vector (so we can load)
+ for (; i < N; ++i) {
+ in_lanes[i] = max;
+ }
+ HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
+ }
+};
+
+HWY_NOINLINE void TestAllMinMaxOfLanes() {
+ const ForPartialVectors<TestMinOfLanes> test_min;
+ const ForPartialVectors<TestMaxOfLanes> test_max;
+ ForUIF3264(test_min);
+ ForUIF3264(test_max);
+ ForUI16(test_min);
+ ForUI16(test_max);
+
+#if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_SSE4 || HWY_TARGET == HWY_SSSE3
+ ForUI8(ForGEVectors<64, TestMinOfLanes>());
+ ForUI8(ForGEVectors<64, TestMaxOfLanes>());
+#endif
+}
+
+struct TestSumsOf8 {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ RandomState rng;
+
+ const size_t N = Lanes(d);
+ if (N < 8) return;
+ const Repartition<uint64_t, D> du64;
+
+ auto in_lanes = AllocateAligned<T>(N);
+ auto sum_lanes = AllocateAligned<uint64_t>(N / 8);
+
+ for (size_t rep = 0; rep < 100; ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ in_lanes[i] = Random64(&rng) & 0xFF;
+ }
+
+ for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) {
+ uint64_t sum = 0;
+ for (size_t i = 0; i < 8; ++i) {
+ sum += in_lanes[idx_sum * 8 + i];
+ }
+ sum_lanes[idx_sum] = sum;
+ }
+
+ const Vec<D> in = Load(d, in_lanes.get());
+ HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8(in));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllSumsOf8() {
+ ForGEVectors<64, TestSumsOf8>()(uint8_t());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyReductionTest);
+HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumOfLanes);
+HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMinMaxOfLanes);
+HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf8);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/reverse_test.cc b/third_party/highway/hwy/tests/reverse_test.cc
new file mode 100644
index 0000000000..b1572c03fe
--- /dev/null
+++ b/third_party/highway/hwy/tests/reverse_test.cc
@@ -0,0 +1,186 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+
+#include "hwy/base.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/reverse_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestReverse {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ const RebindToUnsigned<D> du; // Iota does not support float16_t.
+ const auto v = BitCast(d, Iota(du, 1));
+ auto expected = AllocateAligned<T>(N);
+
+ // Can't set float16_t value directly, need to permute in memory.
+ auto copy = AllocateAligned<T>(N);
+ Store(v, d, copy.get());
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = copy[N - 1 - i];
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse(d, v));
+ }
+};
+
+struct TestReverse2 {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ const RebindToUnsigned<D> du; // Iota does not support float16_t.
+ const auto v = BitCast(d, Iota(du, 1));
+ auto expected = AllocateAligned<T>(N);
+ if (N == 1) {
+ Store(v, d, expected.get());
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse2(d, v));
+ return;
+ }
+
+ // Can't set float16_t value directly, need to permute in memory.
+ auto copy = AllocateAligned<T>(N);
+ Store(v, d, copy.get());
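+    // XOR-ing the lane index with 1 swaps each adjacent pair of lanes.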
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = copy[i ^ 1];
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse2(d, v));
+ }
+};
+
+struct TestReverse4 {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ const RebindToUnsigned<D> du; // Iota does not support float16_t.
+ const auto v = BitCast(d, Iota(du, 1));
+ auto expected = AllocateAligned<T>(N);
+
+ // Can't set float16_t value directly, need to permute in memory.
+ auto copy = AllocateAligned<T>(N);
+ Store(v, d, copy.get());
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = copy[i ^ 3];
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse4(d, v));
+ }
+};
+
+struct TestReverse8 {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ const RebindToUnsigned<D> du; // Iota does not support float16_t.
+ const auto v = BitCast(d, Iota(du, 1));
+ auto expected = AllocateAligned<T>(N);
+
+ // Can't set float16_t value directly, need to permute in memory.
+ auto copy = AllocateAligned<T>(N);
+ Store(v, d, copy.get());
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = copy[i ^ 7];
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse8(d, v));
+ }
+};
+
+HWY_NOINLINE void TestAllReverse() {
+  // 8-bit is not supported because RISC-V uses rgather of Lanes - Iota,
+ // which requires 16 bits.
+ ForUIF163264(ForPartialVectors<TestReverse>());
+}
+
+HWY_NOINLINE void TestAllReverse2() {
+  // 8-bit is not supported because RISC-V uses rgather of Lanes - Iota,
+ // which requires 16 bits.
+ ForUIF64(ForGEVectors<128, TestReverse2>());
+ ForUIF32(ForGEVectors<64, TestReverse2>());
+ ForUIF16(ForGEVectors<32, TestReverse2>());
+
+#if HWY_TARGET == HWY_SSSE3
+ // Implemented mainly for internal use.
+ ForUI8(ForPartialVectors<TestReverse2>());
+#endif
+}
+
+HWY_NOINLINE void TestAllReverse4() {
+  // 8-bit is not supported because RISC-V uses rgather of Lanes - Iota,
+ // which requires 16 bits.
+ ForUIF64(ForGEVectors<256, TestReverse4>());
+ ForUIF32(ForGEVectors<128, TestReverse4>());
+ ForUIF16(ForGEVectors<64, TestReverse4>());
+}
+
+HWY_NOINLINE void TestAllReverse8() {
+  // 8-bit is not supported because RISC-V uses rgather of Lanes - Iota,
+ // which requires 16 bits.
+ ForUIF64(ForGEVectors<512, TestReverse8>());
+ ForUIF32(ForGEVectors<256, TestReverse8>());
+ ForUIF16(ForGEVectors<128, TestReverse8>());
+}
+
+struct TestReverseBlocks {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ const RebindToUnsigned<D> du; // Iota does not support float16_t.
+ const auto v = BitCast(d, Iota(du, 1));
+ auto expected = AllocateAligned<T>(N);
+
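+    // Blocks are 128 bits, hence 16 / sizeof(T) lanes per block.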
+ constexpr size_t kLanesPerBlock = 16 / sizeof(T);
+ const size_t num_blocks = N / kLanesPerBlock;
+ HWY_ASSERT(num_blocks != 0);
+
+ // Can't set float16_t value directly, need to permute in memory.
+ auto copy = AllocateAligned<T>(N);
+ Store(v, d, copy.get());
+ for (size_t i = 0; i < N; ++i) {
+ const size_t idx_block = i / kLanesPerBlock;
+ const size_t base = (num_blocks - 1 - idx_block) * kLanesPerBlock;
+ expected[i] = copy[base + (i % kLanesPerBlock)];
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ReverseBlocks(d, v));
+ }
+};
+
+HWY_NOINLINE void TestAllReverseBlocks() {
+ ForAllTypes(ForGEVectors<128, TestReverseBlocks>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyReverseTest);
+HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse);
+HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse2);
+HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse4);
+HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse8);
+HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverseBlocks);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/shift_test.cc b/third_party/highway/hwy/tests/shift_test.cc
new file mode 100644
index 0000000000..585eba761c
--- /dev/null
+++ b/third_party/highway/hwy/tests/shift_test.cc
@@ -0,0 +1,428 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <limits>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/shift_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <bool kSigned>
+struct TestLeftShifts {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T t, D d) {
+ if (kSigned) {
+ // Also test positive values
+ TestLeftShifts</*kSigned=*/false>()(t, d);
+ }
+
+ using TI = MakeSigned<T>;
+ using TU = MakeUnsigned<T>;
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
+
+ // Values to shift
+ const auto values = Iota(d, static_cast<T>(kSigned ? -TI(N) : TI(0)));
+ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+
+ // 0
+ HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values));
+ HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0));
+
+ // 1
+ for (size_t i = 0; i < N; ++i) {
+ const T value = kSigned ? T(T(i) - T(N)) : T(i);
+ expected[i] = T(TU(value) << 1);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1));
+
+ // max
+ for (size_t i = 0; i < N; ++i) {
+ const T value = kSigned ? T(T(i) - T(N)) : T(i);
+ expected[i] = T(TU(value) << kMaxShift);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift));
+ }
+};
+
+template <bool kSigned>
+struct TestVariableLeftShifts {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T t, D d) {
+ if (kSigned) {
+ // Also test positive values
+ TestVariableLeftShifts</*kSigned=*/false>()(t, d);
+ }
+
+ using TI = MakeSigned<T>;
+ using TU = MakeUnsigned<T>;
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
+
+ const auto v0 = Zero(d);
+ const auto v1 = Set(d, 1);
+ const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift
+
+ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+ const auto max_shift = Set(d, kMaxShift);
+ const auto small_shifts = And(Iota(d, 0), max_shift);
+ const auto large_shifts = max_shift - small_shifts;
+
+ // Same: 0
+ HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0));
+
+ // Same: 1
+ for (size_t i = 0; i < N; ++i) {
+ const T value = kSigned ? T(i) - T(N) : T(i);
+ expected[i] = T(TU(value) << 1);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1));
+
+ // Same: max
+ for (size_t i = 0; i < N; ++i) {
+ const T value = kSigned ? T(i) - T(N) : T(i);
+ expected[i] = T(TU(value) << kMaxShift);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift));
+
+ // Variable: small
+ for (size_t i = 0; i < N; ++i) {
+ const T value = kSigned ? T(i) - T(N) : T(i);
+ expected[i] = T(TU(value) << (i & kMaxShift));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, small_shifts));
+
+ // Variable: large
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = T(TU(1) << (kMaxShift - (i & kMaxShift)));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(v1, large_shifts));
+ }
+};
+
+struct TestUnsignedRightShifts {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
+
+ const auto values = Iota(d, 0);
+
+ const T kMax = LimitsMax<T>();
+ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+
+ // Shift by 0
+ HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
+ HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
+
+ // Shift by 1
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = T(T(i & kMax) >> 1);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
+
+ // max
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = T(T(i & kMax) >> kMaxShift);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<kMaxShift>(values));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift));
+ }
+};
+
+struct TestRotateRight {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
+
+ constexpr size_t kBits = sizeof(T) * 8;
+    const auto mask_shift = Set(d, T{kBits - 1});
+ // Cover as many bit positions as possible to test shifting out
+ const auto values = Shl(Set(d, T{1}), And(Iota(d, 0), mask_shift));
+
+ // Rotate by 0
+ HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values));
+
+ // Rotate by 1
+ Store(values, d, expected.get());
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = (expected[i] >> 1) | (expected[i] << (kBits - 1));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values));
+
+ // Rotate by half
+ Store(values, d, expected.get());
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = (expected[i] >> (kBits / 2)) | (expected[i] << (kBits / 2));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values));
+
+ // Rotate by max
+ Store(values, d, expected.get());
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = (expected[i] >> (kBits - 1)) | (expected[i] << 1);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits - 1>(values));
+ }
+};
+
+struct TestVariableUnsignedRightShifts {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
+
+ const auto v0 = Zero(d);
+ const auto v1 = Set(d, 1);
+ const auto values = Iota(d, 0);
+
+ const T kMax = LimitsMax<T>();
+ const auto max = Set(d, kMax);
+
+ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+ const auto max_shift = Set(d, kMaxShift);
+ const auto small_shifts = And(Iota(d, 0), max_shift);
+ const auto large_shifts = max_shift - small_shifts;
+
+ // Same: 0
+ HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0));
+
+ // Same: 1
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = T(T(i & kMax) >> 1);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1));
+
+ // Same: max
+ HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift));
+
+ // Variable: small
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = T(i) >> (i & kMaxShift);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, small_shifts));
+
+ // Variable: Large
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = kMax >> (kMaxShift - (i & kMaxShift));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(max, large_shifts));
+ }
+};
+
+template <int kAmount, typename T>
+T RightShiftNegative(T val) {
+ // C++ shifts are implementation-defined for negative numbers, and we have
+ // seen divisions replaced with shifts, so resort to bit operations.
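+  // E.g. int16_t val = -4 (0xFFFC) with kAmount = 1: shifted = 0x7FFE,
+  // sign_extended = 0xC000, result = 0xFFFE = -2, as for an arithmetic shift.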
+ using TU = hwy::MakeUnsigned<T>;
+ TU bits;
+ CopySameSize(&val, &bits);
+
+ const TU shifted = TU(bits >> kAmount);
+
+ const TU all = TU(~TU(0));
+ const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount;
+ const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());
+
+ bits = shifted | sign_extended;
+ CopySameSize(&bits, &val);
+ return val;
+}
+
+class TestSignedRightShifts {
+ public:
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
+ constexpr T kMin = LimitsMin<T>();
+ constexpr T kMax = LimitsMax<T>();
+ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+
+ // First test positive values, negative are checked below.
+ const auto v0 = Zero(d);
+ const auto values = And(Iota(d, 0), Set(d, kMax));
+
+ // Shift by 0
+ HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
+ HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
+
+ // Shift by 1
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = T(T(i & kMax) >> 1);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
+
+ // max
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values));
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift));
+
+ // Even negative value
+ Test<0>(kMin, d, __LINE__);
+ Test<1>(kMin, d, __LINE__);
+ Test<2>(kMin, d, __LINE__);
+ Test<kMaxShift>(kMin, d, __LINE__);
+
+ const T odd = static_cast<T>(kMin + 1);
+ Test<0>(odd, d, __LINE__);
+ Test<1>(odd, d, __LINE__);
+ Test<2>(odd, d, __LINE__);
+ Test<kMaxShift>(odd, d, __LINE__);
+ }
+
+ private:
+ template <int kAmount, typename T, class D>
+ void Test(T val, D d, int line) {
+ const auto expected = Set(d, RightShiftNegative<kAmount>(val));
+ const auto in = Set(d, val);
+ const char* file = __FILE__;
+ AssertVecEqual(d, expected, ShiftRight<kAmount>(in), file, line);
+ AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line);
+ }
+};
+
+struct TestVariableSignedRightShifts {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using TU = MakeUnsigned<T>;
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
+
+ constexpr T kMin = LimitsMin<T>();
+ constexpr T kMax = LimitsMax<T>();
+
+ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+
+ // First test positive values, negative are checked below.
+ const auto v0 = Zero(d);
+ const auto positive = Iota(d, 0) & Set(d, kMax);
+
+ // Shift by 0
+ HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive));
+ HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0));
+
+ // Shift by 1
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = T(T(i & kMax) >> 1);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1));
+
+ // max
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(positive));
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift));
+
+ const auto max_shift = Set(d, kMaxShift);
+ const auto small_shifts = And(Iota(d, 0), max_shift);
+ const auto large_shifts = max_shift - small_shifts;
+
+ const auto negative = Iota(d, kMin);
+
+ // Test varying negative to shift
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = RightShiftNegative<1>(static_cast<T>(kMin + i));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1)));
+
+ // Shift MSB right by small amounts
+ for (size_t i = 0; i < N; ++i) {
+ const size_t amount = i & kMaxShift;
+ const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
+ CopySameSize(&shifted, &expected[i]);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), small_shifts));
+
+ // Shift MSB right by large amounts
+ for (size_t i = 0; i < N; ++i) {
+ const size_t amount = kMaxShift - (i & kMaxShift);
+ const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
+ CopySameSize(&shifted, &expected[i]);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), large_shifts));
+ }
+};
+
+HWY_NOINLINE void TestAllShifts() {
+ ForUnsignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/false>>());
+ ForSignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/true>>());
+ ForUnsignedTypes(ForPartialVectors<TestUnsignedRightShifts>());
+ ForSignedTypes(ForPartialVectors<TestSignedRightShifts>());
+}
+
+HWY_NOINLINE void TestAllVariableShifts() {
+ const ForPartialVectors<TestLeftShifts</*kSigned=*/false>> shl_u;
+ const ForPartialVectors<TestLeftShifts</*kSigned=*/true>> shl_s;
+ const ForPartialVectors<TestUnsignedRightShifts> shr_u;
+ const ForPartialVectors<TestSignedRightShifts> shr_s;
+
+ shl_u(uint16_t());
+ shr_u(uint16_t());
+
+ shl_u(uint32_t());
+ shr_u(uint32_t());
+
+ shl_s(int16_t());
+ shr_s(int16_t());
+
+ shl_s(int32_t());
+ shr_s(int32_t());
+
+#if HWY_HAVE_INTEGER64
+ shl_u(uint64_t());
+ shr_u(uint64_t());
+
+ shl_s(int64_t());
+ shr_s(int64_t());
+#endif
+}
+
+HWY_NOINLINE void TestAllRotateRight() {
+ const ForPartialVectors<TestRotateRight> test;
+ test(uint32_t());
+#if HWY_HAVE_INTEGER64
+ test(uint64_t());
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyShiftTest);
+HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllShifts);
+HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllVariableShifts);
+HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllRotateRight);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/swizzle_test.cc b/third_party/highway/hwy/tests/swizzle_test.cc
new file mode 100644
index 0000000000..f447f7a800
--- /dev/null
+++ b/third_party/highway/hwy/tests/swizzle_test.cc
@@ -0,0 +1,272 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <string.h> // memset
+
+#include "hwy/base.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/swizzle_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestGetLane {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v = Iota(d, T(1));
+ HWY_ASSERT_EQ(T(1), GetLane(v));
+ }
+};
+
+HWY_NOINLINE void TestAllGetLane() {
+ ForAllTypes(ForPartialVectors<TestGetLane>());
+}
+
+struct TestExtractLane {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v = Iota(d, T(1));
+ for (size_t i = 0; i < Lanes(d); ++i) {
+ const T actual = ExtractLane(v, i);
+ HWY_ASSERT_EQ(static_cast<T>(i + 1), actual);
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllExtractLane() {
+ ForAllTypes(ForPartialVectors<TestExtractLane>());
+}
+
+struct TestInsertLane {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using V = Vec<D>;
+ const V v = Iota(d, T(1));
+ const size_t N = Lanes(d);
+ auto lanes = AllocateAligned<T>(N);
+ Store(v, d, lanes.get());
+
+ for (size_t i = 0; i < Lanes(d); ++i) {
+ lanes[i] = T{0};
+      const V actual =
+          InsertLane(Load(d, lanes.get()), i, static_cast<T>(i + 1));
+ HWY_ASSERT_VEC_EQ(d, v, actual);
+ Store(v, d, lanes.get()); // restore lane i
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllInsertLane() {
+ ForAllTypes(ForPartialVectors<TestInsertLane>());
+}
+
+struct TestDupEven {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = static_cast<T>((static_cast<int>(i) & ~1) + 1);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), DupEven(Iota(d, 1)));
+ }
+};
+
+HWY_NOINLINE void TestAllDupEven() {
+ ForUIF3264(ForShrinkableVectors<TestDupEven>());
+}
+
+struct TestDupOdd {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = static_cast<T>((static_cast<int>(i) & ~1) + 2);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), DupOdd(Iota(d, 1)));
+#else
+ (void)d;
+#endif
+ }
+};
+
+HWY_NOINLINE void TestAllDupOdd() {
+ ForUIF3264(ForShrinkableVectors<TestDupOdd>());
+}
+
+struct TestOddEven {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ const auto even = Iota(d, 1);
+ const auto odd = Iota(d, static_cast<T>(1 + N));
+ auto expected = AllocateAligned<T>(N);
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = static_cast<T>(1 + i + ((i & 1) ? N : 0));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), OddEven(odd, even));
+ }
+};
+
+HWY_NOINLINE void TestAllOddEven() {
+ ForAllTypes(ForShrinkableVectors<TestOddEven>());
+}
+
+struct TestOddEvenBlocks {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ const auto even = Iota(d, 1);
+ const auto odd = Iota(d, static_cast<T>(1 + N));
+ auto expected = AllocateAligned<T>(N);
+ for (size_t i = 0; i < N; ++i) {
+ const size_t idx_block = i / (16 / sizeof(T));
+ expected[i] = static_cast<T>(1 + i + ((idx_block & 1) ? N : 0));
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), OddEvenBlocks(odd, even));
+ }
+};
+
+HWY_NOINLINE void TestAllOddEvenBlocks() {
+ ForAllTypes(ForGEVectors<128, TestOddEvenBlocks>());
+}
+
+struct TestSwapAdjacentBlocks {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ constexpr size_t kLanesPerBlock = 16 / sizeof(T);
+ if (N < 2 * kLanesPerBlock) return;
+ const auto vi = Iota(d, 1);
+ auto expected = AllocateAligned<T>(N);
+ for (size_t i = 0; i < N; ++i) {
+ const size_t idx_block = i / kLanesPerBlock;
+ const size_t base = (idx_block ^ 1) * kLanesPerBlock;
+ const size_t mod = i % kLanesPerBlock;
+ expected[i] = static_cast<T>(1 + base + mod);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), SwapAdjacentBlocks(vi));
+ }
+};
+
+HWY_NOINLINE void TestAllSwapAdjacentBlocks() {
+ ForAllTypes(ForGEVectors<128, TestSwapAdjacentBlocks>());
+}
+
+struct TestTableLookupLanes {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const RebindToSigned<D> di;
+ using TI = TFromD<decltype(di)>;
+#if HWY_TARGET != HWY_SCALAR
+ const size_t N = Lanes(d);
+ auto idx = AllocateAligned<TI>(N);
+ memset(idx.get(), 0, N * sizeof(TI));
+ auto expected = AllocateAligned<T>(N);
+ const auto v = Iota(d, 1);
+
+ if (N <= 8) { // Test all permutations
+ for (size_t i0 = 0; i0 < N; ++i0) {
+ idx[0] = static_cast<TI>(i0);
+
+ for (size_t i1 = 0; i1 < N; ++i1) {
+ if (N >= 2) idx[1] = static_cast<TI>(i1);
+ for (size_t i2 = 0; i2 < N; ++i2) {
+ if (N >= 4) idx[2] = static_cast<TI>(i2);
+ for (size_t i3 = 0; i3 < N; ++i3) {
+ if (N >= 4) idx[3] = static_cast<TI>(i3);
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = static_cast<T>(idx[i] + 1); // == v[idx[i]]
+ }
+
+ const auto opaque1 = IndicesFromVec(d, Load(di, idx.get()));
+ const auto actual1 = TableLookupLanes(v, opaque1);
+ HWY_ASSERT_VEC_EQ(d, expected.get(), actual1);
+
+ const auto opaque2 = SetTableIndices(d, idx.get());
+ const auto actual2 = TableLookupLanes(v, opaque2);
+ HWY_ASSERT_VEC_EQ(d, expected.get(), actual2);
+ }
+ }
+ }
+ }
+ } else {
+ // Too many permutations to test exhaustively; choose one with repeated
+ // and cross-block indices and ensure indices do not exceed #lanes.
+ // For larger vectors, upper lanes will be zero.
+ HWY_ALIGN TI idx_source[16] = {1, 3, 2, 2, 8, 1, 7, 6,
+ 15, 14, 14, 15, 4, 9, 8, 5};
+ for (size_t i = 0; i < N; ++i) {
+ idx[i] = (i < 16) ? idx_source[i] : 0;
+ // Avoid undefined results / asan error for scalar by capping indices.
+ if (idx[i] >= static_cast<TI>(N)) {
+ idx[i] = static_cast<TI>(N - 1);
+ }
+ expected[i] = static_cast<T>(idx[i] + 1); // == v[idx[i]]
+ }
+
+ const auto opaque1 = IndicesFromVec(d, Load(di, idx.get()));
+ const auto actual1 = TableLookupLanes(v, opaque1);
+ HWY_ASSERT_VEC_EQ(d, expected.get(), actual1);
+
+ const auto opaque2 = SetTableIndices(d, idx.get());
+ const auto actual2 = TableLookupLanes(v, opaque2);
+ HWY_ASSERT_VEC_EQ(d, expected.get(), actual2);
+ }
+#else
+ const TI index = 0;
+ const auto v = Set(d, 1);
+ const auto opaque1 = SetTableIndices(d, &index);
+ HWY_ASSERT_VEC_EQ(d, v, TableLookupLanes(v, opaque1));
+ const auto opaque2 = IndicesFromVec(d, Zero(di));
+ HWY_ASSERT_VEC_EQ(d, v, TableLookupLanes(v, opaque2));
+#endif
+ }
+};
+
+HWY_NOINLINE void TestAllTableLookupLanes() {
+ ForUIF3264(ForPartialVectors<TestTableLookupLanes>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwySwizzleTest);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllGetLane);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllExtractLane);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllInsertLane);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupEven);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupOdd);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEvenBlocks);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllSwapAdjacentBlocks);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupLanes);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/test_util-inl.h b/third_party/highway/hwy/tests/test_util-inl.h
new file mode 100644
index 0000000000..972b3361e0
--- /dev/null
+++ b/third_party/highway/hwy/tests/test_util-inl.h
@@ -0,0 +1,665 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Target-specific helper functions for use by *_test.cc.
+
+#include <stdint.h>
+
+#include "hwy/base.h"
+#include "hwy/tests/hwy_gtest.h"
+#include "hwy/tests/test_util.h"
+
+// After test_util (also includes highway.h)
+#include "hwy/print-inl.h"
+
+// Per-target include guard
+#if defined(HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_) == \
+ defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
+#undef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
+#else
+#define HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Compare expected vector to vector.
+// HWY_INLINE works around a Clang SVE compiler bug where all but the first
+// 128 bits (the NEON register) of actual are zero.
+template <class D, typename T = TFromD<D>, class V = Vec<D>>
+HWY_INLINE void AssertVecEqual(D d, const T* expected, VecArg<V> actual,
+ const char* filename, const int line) {
+ const size_t N = Lanes(d);
+ auto actual_lanes = AllocateAligned<T>(N);
+ Store(actual, d, actual_lanes.get());
+
+ const auto info = hwy::detail::MakeTypeInfo<T>();
+ const char* target_name = hwy::TargetName(HWY_TARGET);
+ hwy::detail::AssertArrayEqual(info, expected, actual_lanes.get(), N,
+ target_name, filename, line);
+}
+
+// Compare expected lanes to vector.
+// HWY_INLINE works around a Clang SVE compiler bug where all but the first
+// 128 bits (the NEON register) of actual are zero.
+template <class D, typename T = TFromD<D>, class V = Vec<D>>
+HWY_INLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual,
+ const char* filename, int line) {
+ auto expected_lanes = AllocateAligned<T>(Lanes(d));
+ Store(expected, d, expected_lanes.get());
+ AssertVecEqual(d, expected_lanes.get(), actual, filename, line);
+}
+
+// Only checks the valid mask elements (those whose index < Lanes(d)).
+template <class D>
+HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
+ const char* filename, int line) {
+ // lvalues prevented MSAN failure in farm_sve.
+ const Vec<D> va = VecFromMask(d, a);
+ const Vec<D> vb = VecFromMask(d, b);
+ AssertVecEqual(d, va, vb, filename, line);
+
+ const char* target_name = hwy::TargetName(HWY_TARGET);
+ AssertEqual(CountTrue(d, a), CountTrue(d, b), target_name, filename, line);
+ AssertEqual(AllTrue(d, a), AllTrue(d, b), target_name, filename, line);
+ AssertEqual(AllFalse(d, a), AllFalse(d, b), target_name, filename, line);
+
+ const size_t N = Lanes(d);
+#if HWY_TARGET == HWY_SCALAR
+ const Rebind<uint8_t, D> d8;
+#else
+ const Repartition<uint8_t, D> d8;
+#endif
+ const size_t N8 = Lanes(d8);
+ auto bits_a = AllocateAligned<uint8_t>(HWY_MAX(size_t{8}, N8));
+  auto bits_b = AllocateAligned<uint8_t>(HWY_MAX(size_t{8}, N8));
+ memset(bits_a.get(), 0, N8);
+ memset(bits_b.get(), 0, N8);
+ const size_t num_bytes_a = StoreMaskBits(d, a, bits_a.get());
+ const size_t num_bytes_b = StoreMaskBits(d, b, bits_b.get());
+ AssertEqual(num_bytes_a, num_bytes_b, target_name, filename, line);
+ size_t i = 0;
+ // First check whole bytes (if that many elements are still valid)
+ for (; i < N / 8; ++i) {
+ if (bits_a[i] != bits_b[i]) {
+ fprintf(stderr, "Mismatch in byte %d: %d != %d\n", static_cast<int>(i),
+ bits_a[i], bits_b[i]);
+ Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
+ Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
+ hwy::Abort(filename, line, "Masks not equal");
+ }
+ }
+ // Then the valid bit(s) in the last byte.
+ const size_t remainder = N % 8;
+ if (remainder != 0) {
+ const int mask = (1 << remainder) - 1;
+ const int valid_a = bits_a[i] & mask;
+ const int valid_b = bits_b[i] & mask;
+ if (valid_a != valid_b) {
+ fprintf(stderr, "Mismatch in last byte %d: %d != %d\n",
+ static_cast<int>(i), valid_a, valid_b);
+ Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
+ Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
+ hwy::Abort(filename, line, "Masks not equal");
+ }
+ }
+}
+
+// Only sets valid elements (those whose index < Lanes(d)). This helps catch
+// tests that are not masking off the (undefined) upper mask elements.
+//
+// TODO(janwas): with HWY_NOINLINE GCC zeros the upper half of AVX2 masks.
+template <class D>
+HWY_INLINE Mask<D> MaskTrue(const D d) {
+ return FirstN(d, Lanes(d));
+}
+
+template <class D>
+HWY_INLINE Mask<D> MaskFalse(const D d) {
+ const auto zero = Zero(RebindToSigned<D>());
+ return RebindMask(d, Lt(zero, zero));
+}
+
+#ifndef HWY_ASSERT_EQ
+
+#define HWY_ASSERT_EQ(expected, actual) \
+ hwy::AssertEqual(expected, actual, hwy::TargetName(HWY_TARGET), __FILE__, \
+ __LINE__)
+
+#define HWY_ASSERT_ARRAY_EQ(expected, actual, count) \
+ hwy::AssertArrayEqual(expected, actual, count, hwy::TargetName(HWY_TARGET), \
+ __FILE__, __LINE__)
+
+#define HWY_ASSERT_STRING_EQ(expected, actual) \
+ hwy::AssertStringEqual(expected, actual, hwy::TargetName(HWY_TARGET), \
+ __FILE__, __LINE__)
+
+#define HWY_ASSERT_VEC_EQ(d, expected, actual) \
+ AssertVecEqual(d, expected, actual, __FILE__, __LINE__)
+
+#define HWY_ASSERT_MASK_EQ(d, expected, actual) \
+ AssertMaskEqual(d, expected, actual, __FILE__, __LINE__)
+
+#endif // HWY_ASSERT_EQ
+
+namespace detail {
+
+// Helpers for instantiating tests with combinations of lane types / counts.
+
+// Calls Test for each CappedTag<T, N> where N is in [kMinArg, kMul * kMinArg]
+// and the resulting Lanes() is in [min_lanes, max_lanes]. The upper bound
+// is required to ensure capped vectors remain extendable. Implemented by
+// recursively halving kMul until it is zero.
+template <typename T, size_t kMul, size_t kMinArg, class Test>
+struct ForeachCappedR {
+ static void Do(size_t min_lanes, size_t max_lanes) {
+ const CappedTag<T, kMul * kMinArg> d;
+
+ // If we already don't have enough lanes, stop.
+ const size_t lanes = Lanes(d);
+ if (lanes < min_lanes) return;
+
+ if (lanes <= max_lanes) {
+ Test()(T(), d);
+ }
+ ForeachCappedR<T, kMul / 2, kMinArg, Test>::Do(min_lanes, max_lanes);
+ }
+};
+
+// Base case to stop the recursion.
+template <typename T, size_t kMinArg, class Test>
+struct ForeachCappedR<T, 0, kMinArg, Test> {
+ static void Do(size_t, size_t) {}
+};
+
+#if HWY_HAVE_SCALABLE
+
+template <typename T>
+constexpr int MinPow2() {
+ // Highway follows RVV LMUL in that the smallest fraction is 1/8th (encoded
+ // as kPow2 == -3). The fraction also must not result in zero lanes for the
+ // smallest possible vector size, which is 128 bits even on RISC-V (with the
+ // application processor profile).
+ return HWY_MAX(-3, -static_cast<int>(CeilLog2(16 / sizeof(T))));
+}
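+// E.g. MinPow2<uint8_t>() == -3: 1/8 of a 128-bit vector still holds two
+// lanes. MinPow2<uint64_t>() == -1: 1/2 is the smallest fraction that keeps
+// at least one 64-bit lane.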
+
+// Iterates kPow2 upward through +3.
+template <typename T, int kPow2, int kAddPow2, class Test>
+struct ForeachShiftR {
+ static void Do(size_t min_lanes) {
+ const ScalableTag<T, kPow2 + kAddPow2> d;
+
+ // Precondition: [kPow2, 3] + kAddPow2 is a valid fraction of the minimum
+ // vector size, so we always have enough lanes, except ForGEVectors.
+ if (Lanes(d) >= min_lanes) {
+ Test()(T(), d);
+ } else {
+ fprintf(stderr, "%d lanes < %d: T=%d pow=%d\n",
+ static_cast<int>(Lanes(d)), static_cast<int>(min_lanes),
+ static_cast<int>(sizeof(T)), kPow2 + kAddPow2);
+ HWY_ASSERT(min_lanes != 1);
+ }
+
+ ForeachShiftR<T, kPow2 + 1, kAddPow2, Test>::Do(min_lanes);
+ }
+};
+
+// Base case to stop the recursion.
+template <typename T, int kAddPow2, class Test>
+struct ForeachShiftR<T, 4, kAddPow2, Test> {
+ static void Do(size_t) {}
+};
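+
+// For example (illustrative only), ForeachShiftR<T, /*kPow2=*/-3, /*kAddPow2=*/0>
+// instantiates ScalableTag<T, -3>, <T, -2>, ..., <T, 3> and then stops at the
+// kPow2 == 4 specialization above.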
+#else
+// ForeachCappedR already handled all possible sizes.
+#endif // HWY_HAVE_SCALABLE
+
+} // namespace detail
+
+// These 'adapters' call a test for all possible N or kPow2 subject to
+// constraints such as "vectors must be extendable" or "vectors >= 128 bits".
+// They may be called directly, or via For*Types. Note that for an adapter C,
+// `C<Test>(T())` does not call the test; the correct invocation is
+// `C<Test>()(T())`, or preferably `ForAllTypes(C<Test>())`. We check at
+// runtime that operator() is called in order to catch such bugs. This check is
+// not thread-safe, but that is fine because adapters are typically local
+// variables.
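+//
+// Illustrative usage (a sketch; TestFoo is a hypothetical functor, not part of
+// this header):
+//   struct TestFoo {
+//     template <typename T, class D>
+//     HWY_NOINLINE void operator()(T /*unused*/, D d) { /* ... */ }
+//   };
+//   HWY_NOINLINE void TestAllFoo() {
+//     ForAllTypes(ForPartialVectors<TestFoo>());
+//   }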
+
+// Calls Test for all power-of-two N in [1, Lanes(d) >> kPow2]. This is for
+// ops that widen their input, e.g. Combine (not supported by HWY_SCALAR).
+template <class Test, int kPow2 = 1>
+class ForExtendableVectors {
+ mutable bool called_ = false;
+
+ public:
+ ~ForExtendableVectors() {
+ if (!called_) {
+ HWY_ABORT("Test is incorrect, ensure operator() is called");
+ }
+ }
+
+ template <typename T>
+ void operator()(T /*unused*/) const {
+ called_ = true;
+ constexpr size_t kMaxCapped = HWY_LANES(T);
+ // Skip CappedTag that are already full vectors.
+ const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
+ (void)kMaxCapped;
+ (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+ // not supported
+#else
+ detail::ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(1, max_lanes);
+#if HWY_TARGET == HWY_RVV
+ // For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
+ detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, -kPow2, Test>::Do(1);
+#elif HWY_HAVE_SCALABLE
+ // For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
+ detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -kPow2 - 3,
+ Test>::Do(1);
+#endif
+#endif // HWY_SCALAR
+ }
+};
+
+// Calls Test for all power-of-two N in [1 << kPow2, Lanes(d)]. This is for
+// ops that narrow their input, e.g. UpperHalf.
+template <class Test, int kPow2 = 1>
+class ForShrinkableVectors {
+ mutable bool called_ = false;
+
+ public:
+ ~ForShrinkableVectors() {
+ if (!called_) {
+ HWY_ABORT("Test is incorrect, ensure operator() is called");
+ }
+ }
+
+ template <typename T>
+ void operator()(T /*unused*/) const {
+ called_ = true;
+ constexpr size_t kMinLanes = size_t{1} << kPow2;
+ constexpr size_t kMaxCapped = HWY_LANES(T);
+ // For shrinking, an upper limit is unnecessary.
+ constexpr size_t max_lanes = kMaxCapped;
+
+ (void)kMinLanes;
+ (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+ // not supported
+#else
+ detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(
+ kMinLanes, max_lanes);
+#if HWY_TARGET == HWY_RVV
+ // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
+ detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, 0, Test>::Do(
+ kMinLanes);
+#elif HWY_HAVE_SCALABLE
+ // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
+ detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -3, Test>::Do(
+ kMinLanes);
+#endif
+#endif // HWY_TARGET == HWY_SCALAR
+ }
+};
+
+// Calls Test for all supported power-of-two vectors of at least kMinBits.
+// Examples: AES or 64x64 require 128 bits, casts may require 64 bits.
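+// For example (illustrative only), ForGEVectors<128, Test> with T = uint8_t
+// yields kMinLanes = 128 / 8 / sizeof(T) = 16, i.e. Test only runs when
+// Lanes() >= 16 (vectors of at least 128 bits).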
+template <size_t kMinBits, class Test>
+class ForGEVectors {
+ mutable bool called_ = false;
+
+ public:
+ ~ForGEVectors() {
+ if (!called_) {
+ HWY_ABORT("Test is incorrect, ensure operator() is called");
+ }
+ }
+
+ template <typename T>
+ void operator()(T /*unused*/) const {
+ called_ = true;
+ constexpr size_t kMaxCapped = HWY_LANES(T);
+ constexpr size_t kMinLanes = kMinBits / 8 / sizeof(T);
+ // An upper limit is unnecessary.
+ constexpr size_t max_lanes = kMaxCapped;
+ (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+ (void)kMinLanes; // not supported
+#else
+ detail::ForeachCappedR<T, HWY_LANES(T) / kMinLanes, kMinLanes, Test>::Do(
+ kMinLanes, max_lanes);
+#if HWY_TARGET == HWY_RVV
+ // Can be 0 (handled below) if kMinBits > 64.
+ constexpr size_t kRatio = 128 / kMinBits;
+ constexpr int kMinPow2 =
+ kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio));
+ // For each [kMinPow2, 3]; counter is [kMinPow2, 3].
+ detail::ForeachShiftR<T, kMinPow2, 0, Test>::Do(kMinLanes);
+#elif HWY_HAVE_SCALABLE
+ // Can be 0 (handled below) if kMinBits > 128.
+ constexpr size_t kRatio = 128 / kMinBits;
+ constexpr int kMinPow2 =
+ kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio));
+ // For each [kMinPow2, 0]; counter is [kMinPow2 + 3, 3].
+ detail::ForeachShiftR<T, kMinPow2 + 3, -3, Test>::Do(kMinLanes);
+#endif
+#endif // HWY_TARGET == HWY_SCALAR
+ }
+};
+
+template <class Test>
+using ForGE128Vectors = ForGEVectors<128, Test>;
+
+// Calls Test for all N that can be promoted (not the same as Extendable because
+// HWY_SCALAR has one lane). Also used for ZipLower, but not ZipUpper.
+template <class Test, int kPow2 = 1>
+class ForPromoteVectors {
+ mutable bool called_ = false;
+
+ public:
+ ~ForPromoteVectors() {
+ if (!called_) {
+ HWY_ABORT("Test is incorrect, ensure operator() is called");
+ }
+ }
+
+ template <typename T>
+ void operator()(T /*unused*/) const {
+ called_ = true;
+ constexpr size_t kFactor = size_t{1} << kPow2;
+ static_assert(kFactor >= 2 && kFactor * sizeof(T) <= sizeof(uint64_t), "");
+ constexpr size_t kMaxCapped = HWY_LANES(T);
+ constexpr size_t kMinLanes = kFactor;
+ // Skip CappedTag that are already full vectors.
+ const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
+ (void)kMaxCapped;
+ (void)kMinLanes;
+ (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+ detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
+#else
+ // TODO(janwas): call Extendable if kMinLanes check not required?
+ detail::ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(kMinLanes,
+ max_lanes);
+#if HWY_TARGET == HWY_RVV
+ // For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
+ detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, -kPow2, Test>::Do(
+ kMinLanes);
+#elif HWY_HAVE_SCALABLE
+ // For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
+ detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -kPow2 - 3,
+ Test>::Do(kMinLanes);
+#endif
+#endif // HWY_SCALAR
+ }
+};
+
+// Calls Test for all N that can be demoted (not the same as Shrinkable because
+// HWY_SCALAR has one lane).
+template <class Test, int kPow2 = 1>
+class ForDemoteVectors {
+ mutable bool called_ = false;
+
+ public:
+ ~ForDemoteVectors() {
+ if (!called_) {
+ HWY_ABORT("Test is incorrect, ensure operator() is called");
+ }
+ }
+
+ template <typename T>
+ void operator()(T /*unused*/) const {
+ called_ = true;
+ constexpr size_t kMinLanes = size_t{1} << kPow2;
+ constexpr size_t kMaxCapped = HWY_LANES(T);
+ // For shrinking, an upper limit is unnecessary.
+ constexpr size_t max_lanes = kMaxCapped;
+
+ (void)kMinLanes;
+ (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+ detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
+#else
+ detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(
+ kMinLanes, max_lanes);
+
+// TODO(janwas): call Extendable if kMinLanes check not required?
+#if HWY_TARGET == HWY_RVV
+ // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
+ detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, 0, Test>::Do(
+ kMinLanes);
+#elif HWY_HAVE_SCALABLE
+ // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
+ detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -3, Test>::Do(
+ kMinLanes);
+#endif
+#endif // HWY_TARGET == HWY_SCALAR
+ }
+};
+
+// For LowerHalf/Quarter.
+template <class Test, int kPow2 = 1>
+class ForHalfVectors {
+ mutable bool called_ = false;
+
+ public:
+ ~ForHalfVectors() {
+ if (!called_) {
+ HWY_ABORT("Test is incorrect, ensure operator() is called");
+ }
+ }
+
+ template <typename T>
+ void operator()(T /*unused*/) const {
+ called_ = true;
+#if HWY_TARGET == HWY_SCALAR
+ detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
+#else
+ constexpr size_t kMinLanes = size_t{1} << kPow2;
+ // For shrinking, an upper limit is unnecessary.
+ constexpr size_t kMaxCapped = HWY_LANES(T);
+ detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(
+ kMinLanes, kMaxCapped);
+
+// TODO(janwas): call Extendable if kMinLanes check not required?
+#if HWY_TARGET == HWY_RVV
+ // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
+ detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, 0, Test>::Do(
+ kMinLanes);
+#elif HWY_HAVE_SCALABLE
+ // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
+ detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -3, Test>::Do(
+ kMinLanes);
+#endif
+#endif // HWY_TARGET == HWY_SCALAR
+ }
+};
+
+// Calls Test for all power-of-two N in [1, Lanes(d)]. This is the default
+// for ops that neither narrow nor widen their input, nor require 128 bits.
+template <class Test>
+class ForPartialVectors {
+ mutable bool called_ = false;
+
+ public:
+ ~ForPartialVectors() {
+ if (!called_) {
+ HWY_ABORT("Test is incorrect, ensure operator() is called");
+ }
+ }
+
+ template <typename T>
+ void operator()(T t) const {
+ called_ = true;
+#if HWY_TARGET == HWY_SCALAR
+ (void)t;
+ detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
+#else
+ ForExtendableVectors<Test, 0>()(t);
+#endif
+ }
+};
+
+// Type lists to shorten call sites:
+
+template <class Func>
+void ForSignedTypes(const Func& func) {
+ func(int8_t());
+ func(int16_t());
+ func(int32_t());
+#if HWY_HAVE_INTEGER64
+ func(int64_t());
+#endif
+}
+
+template <class Func>
+void ForUnsignedTypes(const Func& func) {
+ func(uint8_t());
+ func(uint16_t());
+ func(uint32_t());
+#if HWY_HAVE_INTEGER64
+ func(uint64_t());
+#endif
+}
+
+template <class Func>
+void ForIntegerTypes(const Func& func) {
+ ForSignedTypes(func);
+ ForUnsignedTypes(func);
+}
+
+template <class Func>
+void ForFloatTypes(const Func& func) {
+ func(float());
+#if HWY_HAVE_FLOAT64
+ func(double());
+#endif
+}
+
+template <class Func>
+void ForAllTypes(const Func& func) {
+ ForIntegerTypes(func);
+ ForFloatTypes(func);
+}
+
+template <class Func>
+void ForUI8(const Func& func) {
+ func(uint8_t());
+ func(int8_t());
+}
+
+template <class Func>
+void ForUI16(const Func& func) {
+ func(uint16_t());
+ func(int16_t());
+}
+
+template <class Func>
+void ForUIF16(const Func& func) {
+ ForUI16(func);
+#if HWY_HAVE_FLOAT16
+ func(float16_t());
+#endif
+}
+
+template <class Func>
+void ForUI32(const Func& func) {
+ func(uint32_t());
+ func(int32_t());
+}
+
+template <class Func>
+void ForUIF32(const Func& func) {
+ ForUI32(func);
+ func(float());
+}
+
+template <class Func>
+void ForUI64(const Func& func) {
+#if HWY_HAVE_INTEGER64
+ func(uint64_t());
+ func(int64_t());
+#endif
+}
+
+template <class Func>
+void ForUIF64(const Func& func) {
+ ForUI64(func);
+#if HWY_HAVE_FLOAT64
+ func(double());
+#endif
+}
+
+template <class Func>
+void ForUI3264(const Func& func) {
+ ForUI32(func);
+ ForUI64(func);
+}
+
+template <class Func>
+void ForUIF3264(const Func& func) {
+ ForUIF32(func);
+ ForUIF64(func);
+}
+
+template <class Func>
+void ForUI163264(const Func& func) {
+ ForUI16(func);
+ ForUI3264(func);
+}
+
+template <class Func>
+void ForUIF163264(const Func& func) {
+ ForUIF16(func);
+ ForUIF3264(func);
+}
+
+// For tests that involve loops, adjust the trip count so that emulated tests
+// finish quickly (but always at least 2 iterations to ensure some diversity).
+constexpr size_t AdjustedReps(size_t max_reps) {
+#if HWY_ARCH_RVV
+ return HWY_MAX(max_reps / 32, 2);
+#elif HWY_IS_DEBUG_BUILD
+ return HWY_MAX(max_reps / 8, 2);
+#elif HWY_ARCH_ARM
+ return HWY_MAX(max_reps / 4, 2);
+#else
+ return HWY_MAX(max_reps, 2);
+#endif
+}
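+
+// Example (an illustrative sketch): a test loop might be written as
+//   for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { /* ... */ }
+// so that emulated/slow targets run fewer, but always at least 2, iterations.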
+
+// Same as above, but the loop trip count will be 1 << max_pow2.
+constexpr size_t AdjustedLog2Reps(size_t max_pow2) {
+ // If "negative" (unsigned wraparound), use original.
+#if HWY_ARCH_RVV
+ return HWY_MIN(max_pow2 - 4, max_pow2);
+#elif HWY_IS_DEBUG_BUILD
+ return HWY_MIN(max_pow2 - 1, max_pow2);
+#elif HWY_ARCH_ARM
+ return HWY_MIN(max_pow2 - 1, max_pow2);
+#else
+ return max_pow2;
+#endif
+}
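+
+// Example (an illustrative sketch): callers shift to obtain the trip count,
+//   const size_t iters = size_t{1} << AdjustedLog2Reps(12);
+// which reduces 4096 iterations to 256 on RVV and to 2048 in debug builds or
+// on Arm.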
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif // per-target include guard
diff --git a/third_party/highway/hwy/tests/test_util.cc b/third_party/highway/hwy/tests/test_util.cc
new file mode 100644
index 0000000000..a0796b15f9
--- /dev/null
+++ b/third_party/highway/hwy/tests/test_util.cc
@@ -0,0 +1,117 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/tests/test_util.h"
+
+#include <stddef.h>
+#include <stdio.h>
+
+#include <cmath>
+
+#include "hwy/base.h"
+#include "hwy/print.h"
+
+namespace hwy {
+
+HWY_TEST_DLLEXPORT bool BytesEqual(const void* p1, const void* p2,
+ const size_t size, size_t* pos) {
+ const uint8_t* bytes1 = reinterpret_cast<const uint8_t*>(p1);
+ const uint8_t* bytes2 = reinterpret_cast<const uint8_t*>(p2);
+ for (size_t i = 0; i < size; ++i) {
+ if (bytes1[i] != bytes2[i]) {
+ if (pos != nullptr) {
+ *pos = i;
+ }
+ return false;
+ }
+ }
+ return true;
+}
+
+void AssertStringEqual(const char* expected, const char* actual,
+ const char* target_name, const char* filename,
+ int line) {
+ while (*expected == *actual++) {
+ if (*expected++ == '\0') return;
+ }
+
+ Abort(filename, line, "%s string mismatch: expected '%s', got '%s'.\n",
+ target_name, expected, actual);
+}
+
+namespace detail {
+
+HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr,
+ const void* actual_ptr) {
+ if (!info.is_float) {
+ return BytesEqual(expected_ptr, actual_ptr, info.sizeof_t);
+ }
+
+ if (info.sizeof_t == 4) {
+ float expected, actual;
+ CopyBytes<4>(expected_ptr, &expected);
+ CopyBytes<4>(actual_ptr, &actual);
+ return ComputeUlpDelta(expected, actual) <= 1;
+ } else if (info.sizeof_t == 8) {
+ double expected, actual;
+ CopyBytes<8>(expected_ptr, &expected);
+ CopyBytes<8>(actual_ptr, &actual);
+ return ComputeUlpDelta(expected, actual) <= 1;
+ } else {
+ HWY_ABORT("Unexpected float size %d\n", static_cast<int>(info.sizeof_t));
+ return false;
+ }
+}
+
+HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort(
+ const TypeInfo& info, const void* expected_ptr, const void* actual_ptr,
+ const char* target_name, const char* filename, int line, size_t lane,
+ size_t num_lanes) {
+ char type_name[100];
+ TypeName(info, 1, type_name);
+ char expected_str[100];
+ ToString(info, expected_ptr, expected_str);
+ char actual_str[100];
+ ToString(info, actual_ptr, actual_str);
+ Abort(filename, line,
+ "%s, %sx%d lane %d mismatch: expected '%s', got '%s'.\n", target_name,
+ type_name, static_cast<int>(num_lanes), static_cast<int>(lane),
+ expected_str, actual_str);
+}
+
+HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info,
+ const void* expected_void,
+ const void* actual_void, size_t N,
+ const char* target_name,
+ const char* filename, int line) {
+ const uint8_t* expected_array =
+ reinterpret_cast<const uint8_t*>(expected_void);
+ const uint8_t* actual_array = reinterpret_cast<const uint8_t*>(actual_void);
+ for (size_t i = 0; i < N; ++i) {
+ const void* expected_ptr = expected_array + i * info.sizeof_t;
+ const void* actual_ptr = actual_array + i * info.sizeof_t;
+ if (!IsEqual(info, expected_ptr, actual_ptr)) {
+ fprintf(stderr, "\n\n");
+ PrintArray(info, "expect", expected_array, N, i);
+ PrintArray(info, "actual", actual_array, N, i);
+
+ PrintMismatchAndAbort(info, expected_ptr, actual_ptr, target_name,
+ filename, line, i, N);
+ }
+ }
+}
+
+} // namespace detail
+} // namespace hwy
diff --git a/third_party/highway/hwy/tests/test_util.h b/third_party/highway/hwy/tests/test_util.h
new file mode 100644
index 0000000000..558d1bcfba
--- /dev/null
+++ b/third_party/highway/hwy/tests/test_util.h
@@ -0,0 +1,173 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HWY_TESTS_TEST_UTIL_H_
+#define HWY_TESTS_TEST_UTIL_H_
+
+// Target-independent helper functions for use by *_test.cc.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <cmath> // std::isnan
+#include <string>
+
+#include "hwy/aligned_allocator.h"
+#include "hwy/base.h"
+#include "hwy/highway.h"
+#include "hwy/highway_export.h"
+#include "hwy/print.h"
+
+namespace hwy {
+
+// The maximum vector size used in tests when defining test data. DEPRECATED.
+constexpr size_t kTestMaxVectorSize = 64;
+
+// 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937,
+// which triggers a compiler bug.
+class RandomState {
+ public:
+ explicit RandomState(const uint64_t seed = 0x123456789ull) {
+ s0_ = SplitMix64(seed + 0x9E3779B97F4A7C15ull);
+ s1_ = SplitMix64(s0_);
+ }
+
+ HWY_INLINE uint64_t operator()() {
+ uint64_t s1 = s0_;
+ const uint64_t s0 = s1_;
+ const uint64_t bits = s1 + s0;
+ s0_ = s0;
+ s1 ^= s1 << 23;
+ s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
+ s1_ = s1;
+ return bits;
+ }
+
+ private:
+ static uint64_t SplitMix64(uint64_t z) {
+ z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
+ z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
+ return z ^ (z >> 31);
+ }
+
+ uint64_t s0_;
+ uint64_t s1_;
+};
+
+static HWY_INLINE uint32_t Random32(RandomState* rng) {
+ return static_cast<uint32_t>((*rng)());
+}
+
+static HWY_INLINE uint64_t Random64(RandomState* rng) { return (*rng)(); }
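+
+// Example (an illustrative sketch): generating random test inputs.
+//   RandomState rng;
+//   const uint64_t bits = Random64(&rng);  // or Random32(&rng) for 32 bits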
+
+// Prevents the compiler from eliding the computations that led to "output".
+// Works by indicating to the compiler that "output" is being read and modified.
+// The +r constraint avoids unnecessary writes to memory, but only works for
+// built-in types.
+template <class T>
+inline void PreventElision(T&& output) {
+#if HWY_COMPILER_MSVC
+ (void)output;
+#else // HWY_COMPILER_MSVC
+ asm volatile("" : "+r"(output) : : "memory");
+#endif // HWY_COMPILER_MSVC
+}
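+
+// Example (an illustrative sketch): keep a computed checksum alive so that the
+// work feeding into it is not optimized away.
+//   uint64_t checksum = ComputeSomething();  // hypothetical function
+//   PreventElision(checksum);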
+
+HWY_TEST_DLLEXPORT bool BytesEqual(const void* p1, const void* p2,
+ const size_t size, size_t* pos = nullptr);
+
+void AssertStringEqual(const char* expected, const char* actual,
+ const char* target_name, const char* filename, int line);
+
+namespace detail {
+
+template <typename T, typename TU = MakeUnsigned<T>>
+TU ComputeUlpDelta(const T expected, const T actual) {
+ // Handle -0 == 0 and infinities.
+ if (expected == actual) return 0;
+
+ // Consider "equal" if both are NaN, so we can verify an expected NaN.
+ // Needs a special case because there are many possible NaN representations.
+ if (std::isnan(expected) && std::isnan(actual)) return 0;
+
+ // Compute the difference in units of last place. We do not need to check for
+ // differing signs; they will result in large differences, which is fine.
+ TU ux, uy;
+ CopySameSize(&expected, &ux);
+ CopySameSize(&actual, &uy);
+
+ // Avoid unsigned->signed cast: 2's complement is only guaranteed by C++20.
+ const TU ulp = HWY_MAX(ux, uy) - HWY_MIN(ux, uy);
+ return ulp;
+}
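+
+// For example, 1.0f and the next representable float (1.0f + FLT_EPSILON)
+// differ by exactly 1 ULP, which IsEqual (see test_util.cc) still accepts.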
+
+HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr,
+ const void* actual_ptr);
+
+HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort(
+ const TypeInfo& info, const void* expected_ptr, const void* actual_ptr,
+ const char* target_name, const char* filename, int line, size_t lane = 0,
+ size_t num_lanes = 1);
+
+HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info,
+ const void* expected_void,
+ const void* actual_void, size_t N,
+ const char* target_name,
+ const char* filename, int line);
+
+} // namespace detail
+
+// Returns a name for the vector/part/scalar. The type prefix is u/i/f for
+// unsigned/signed/floating point, followed by the number of bits per lane;
+// then 'x' followed by the number of lanes. Example: u8x16. This is useful for
+// understanding which instantiation of a generic test failed.
+template <typename T>
+std::string TypeName(T /*unused*/, size_t N) {
+ char string100[100];
+ detail::TypeName(detail::MakeTypeInfo<T>(), N, string100);
+ return string100;
+}
+
+// Compare non-vector, non-string T.
+template <typename T>
+HWY_INLINE bool IsEqual(const T expected, const T actual) {
+ const auto info = detail::MakeTypeInfo<T>();
+ return detail::IsEqual(info, &expected, &actual);
+}
+
+template <typename T>
+HWY_INLINE void AssertEqual(const T expected, const T actual,
+ const char* target_name, const char* filename,
+ int line, size_t lane = 0) {
+ const auto info = detail::MakeTypeInfo<T>();
+ if (!detail::IsEqual(info, &expected, &actual)) {
+ detail::PrintMismatchAndAbort(info, &expected, &actual, target_name,
+ filename, line, lane);
+ }
+}
+
+template <typename T>
+HWY_INLINE void AssertArrayEqual(const T* expected, const T* actual,
+ size_t count, const char* target_name,
+ const char* filename, int line) {
+ const auto info = hwy::detail::MakeTypeInfo<T>();
+ detail::AssertArrayEqual(info, expected, actual, count, target_name, filename,
+ line);
+}
+
+} // namespace hwy
+
+#endif // HWY_TESTS_TEST_UTIL_H_
diff --git a/third_party/highway/hwy/tests/test_util_test.cc b/third_party/highway/hwy/tests/test_util_test.cc
new file mode 100644
index 0000000000..1911467c34
--- /dev/null
+++ b/third_party/highway/hwy/tests/test_util_test.cc
@@ -0,0 +1,107 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/test_util_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestName {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T t, D d) {
+ char num[10];
+ std::string expected = IsFloat<T>() ? "f" : (IsSigned<T>() ? "i" : "u");
+    snprintf(num, sizeof(num), "%u", static_cast<unsigned>(sizeof(T) * 8));
+ expected += num;
+
+ const size_t N = Lanes(d);
+ if (N != 1) {
+ expected += 'x';
+ snprintf(num, sizeof(num), "%u", static_cast<unsigned>(N));
+ expected += num;
+ }
+ const std::string actual = TypeName(t, N);
+ if (expected != actual) {
+ HWY_ABORT("%s mismatch: expected '%s', got '%s'.\n",
+ hwy::TargetName(HWY_TARGET), expected.c_str(), actual.c_str());
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllName() { ForAllTypes(ForPartialVectors<TestName>()); }
+
+struct TestEqualInteger {
+ template <class T>
+ HWY_NOINLINE void operator()(T /*t*/) const {
+ HWY_ASSERT_EQ(T(0), T(0));
+ HWY_ASSERT_EQ(T(1), T(1));
+ HWY_ASSERT_EQ(T(-1), T(-1));
+ HWY_ASSERT_EQ(LimitsMin<T>(), LimitsMin<T>());
+
+ HWY_ASSERT(!IsEqual(T(0), T(1)));
+ HWY_ASSERT(!IsEqual(T(1), T(0)));
+ HWY_ASSERT(!IsEqual(T(1), T(-1)));
+ HWY_ASSERT(!IsEqual(T(-1), T(1)));
+ HWY_ASSERT(!IsEqual(LimitsMin<T>(), LimitsMax<T>()));
+ HWY_ASSERT(!IsEqual(LimitsMax<T>(), LimitsMin<T>()));
+ }
+};
+
+struct TestEqualFloat {
+ template <class T>
+ HWY_NOINLINE void operator()(T /*t*/) const {
+ HWY_ASSERT(IsEqual(T(0), T(0)));
+ HWY_ASSERT(IsEqual(T(1), T(1)));
+ HWY_ASSERT(IsEqual(T(-1), T(-1)));
+ HWY_ASSERT(IsEqual(MantissaEnd<T>(), MantissaEnd<T>()));
+
+ HWY_ASSERT(!IsEqual(T(0), T(1)));
+ HWY_ASSERT(!IsEqual(T(1), T(0)));
+ HWY_ASSERT(!IsEqual(T(1), T(-1)));
+ HWY_ASSERT(!IsEqual(T(-1), T(1)));
+ HWY_ASSERT(!IsEqual(LowestValue<T>(), HighestValue<T>()));
+ HWY_ASSERT(!IsEqual(HighestValue<T>(), LowestValue<T>()));
+ }
+};
+
+HWY_NOINLINE void TestAllEqual() {
+ ForIntegerTypes(TestEqualInteger());
+ ForFloatTypes(TestEqualFloat());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(TestUtilTest);
+HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllName);
+HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllEqual);
+} // namespace hwy
+
+#endif