diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/highway/hwy/tests | |
parent | Initial commit. (diff) | |
download | firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip |
Adding upstream version 115.7.0esr.upstream/115.7.0esr
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/highway/hwy/tests')
27 files changed, 9189 insertions, 0 deletions
diff --git a/third_party/highway/hwy/tests/arithmetic_test.cc b/third_party/highway/hwy/tests/arithmetic_test.cc new file mode 100644 index 0000000000..fa533228a0 --- /dev/null +++ b/third_party/highway/hwy/tests/arithmetic_test.cc @@ -0,0 +1,499 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stddef.h> +#include <stdint.h> + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/arithmetic_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestPlusMinus { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v2 = Iota(d, T(2)); + const auto v3 = Iota(d, T(3)); + const auto v4 = Iota(d, T(4)); + + const size_t N = Lanes(d); + auto lanes = AllocateAligned<T>(N); + for (size_t i = 0; i < N; ++i) { + lanes[i] = static_cast<T>((2 + i) + (3 + i)); + } + HWY_ASSERT_VEC_EQ(d, lanes.get(), Add(v2, v3)); + HWY_ASSERT_VEC_EQ(d, Set(d, 2), Sub(v4, v2)); + + for (size_t i = 0; i < N; ++i) { + lanes[i] = static_cast<T>((2 + i) + (4 + i)); + } + auto sum = v2; + sum = Add(sum, v4); // sum == 6,8.. 
+ HWY_ASSERT_VEC_EQ(d, Load(d, lanes.get()), sum); + + sum = Sub(sum, v4); + HWY_ASSERT_VEC_EQ(d, v2, sum); + } +}; + +struct TestPlusMinusOverflow { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v1 = Iota(d, T(1)); + const auto vMax = Iota(d, LimitsMax<T>()); + const auto vMin = Iota(d, LimitsMin<T>()); + + // Check that no UB triggered. + // "assert" here is formal - to avoid compiler dropping calculations + HWY_ASSERT_VEC_EQ(d, Add(v1, vMax), Add(vMax, v1)); + HWY_ASSERT_VEC_EQ(d, Add(vMax, vMax), Add(vMax, vMax)); + HWY_ASSERT_VEC_EQ(d, Sub(vMin, v1), Sub(vMin, v1)); + HWY_ASSERT_VEC_EQ(d, Sub(vMin, vMax), Sub(vMin, vMax)); + } +}; + +HWY_NOINLINE void TestAllPlusMinus() { + ForAllTypes(ForPartialVectors<TestPlusMinus>()); + ForIntegerTypes(ForPartialVectors<TestPlusMinusOverflow>()); +} + +struct TestUnsignedSaturatingArithmetic { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto vi = Iota(d, 1); + const auto vm = Set(d, LimitsMax<T>()); + + HWY_ASSERT_VEC_EQ(d, Add(v0, v0), SaturatedAdd(v0, v0)); + HWY_ASSERT_VEC_EQ(d, Add(v0, vi), SaturatedAdd(v0, vi)); + HWY_ASSERT_VEC_EQ(d, Add(v0, vm), SaturatedAdd(v0, vm)); + HWY_ASSERT_VEC_EQ(d, vm, SaturatedAdd(vi, vm)); + HWY_ASSERT_VEC_EQ(d, vm, SaturatedAdd(vm, vm)); + + HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(v0, v0)); + HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(v0, vi)); + HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(vi, vi)); + HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(vi, vm)); + HWY_ASSERT_VEC_EQ(d, Sub(vm, vi), SaturatedSub(vm, vi)); + } +}; + +struct TestSignedSaturatingArithmetic { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto vpm = Set(d, LimitsMax<T>()); + // Ensure all lanes are positive, even if Iota wraps around + const auto vi = Or(And(Iota(d, 0), vpm), Set(d, 1)); + const auto vn = Sub(v0, vi); + const auto vnm = 
Set(d, LimitsMin<T>()); + HWY_ASSERT_MASK_EQ(d, MaskTrue(d), Gt(vi, v0)); + HWY_ASSERT_MASK_EQ(d, MaskTrue(d), Lt(vn, v0)); + + HWY_ASSERT_VEC_EQ(d, v0, SaturatedAdd(v0, v0)); + HWY_ASSERT_VEC_EQ(d, vi, SaturatedAdd(v0, vi)); + HWY_ASSERT_VEC_EQ(d, vpm, SaturatedAdd(v0, vpm)); + HWY_ASSERT_VEC_EQ(d, vpm, SaturatedAdd(vi, vpm)); + HWY_ASSERT_VEC_EQ(d, vpm, SaturatedAdd(vpm, vpm)); + + HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(v0, v0)); + HWY_ASSERT_VEC_EQ(d, Sub(v0, vi), SaturatedSub(v0, vi)); + HWY_ASSERT_VEC_EQ(d, vn, SaturatedSub(vn, v0)); + HWY_ASSERT_VEC_EQ(d, vnm, SaturatedSub(vnm, vi)); + HWY_ASSERT_VEC_EQ(d, vnm, SaturatedSub(vnm, vpm)); + } +}; + +struct TestSaturatingArithmeticOverflow { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v1 = Iota(d, T(1)); + const auto vMax = Iota(d, LimitsMax<T>()); + const auto vMin = Iota(d, LimitsMin<T>()); + + // Check that no UB triggered. + // "assert" here is formal - to avoid compiler dropping calculations + HWY_ASSERT_VEC_EQ(d, SaturatedAdd(v1, vMax), SaturatedAdd(vMax, v1)); + HWY_ASSERT_VEC_EQ(d, SaturatedAdd(vMax, vMax), SaturatedAdd(vMax, vMax)); + HWY_ASSERT_VEC_EQ(d, SaturatedAdd(vMin, vMax), SaturatedAdd(vMin, vMax)); + HWY_ASSERT_VEC_EQ(d, SaturatedAdd(vMin, vMin), SaturatedAdd(vMin, vMin)); + HWY_ASSERT_VEC_EQ(d, SaturatedSub(vMin, v1), SaturatedSub(vMin, v1)); + HWY_ASSERT_VEC_EQ(d, SaturatedSub(vMin, vMax), SaturatedSub(vMin, vMax)); + HWY_ASSERT_VEC_EQ(d, SaturatedSub(vMax, vMin), SaturatedSub(vMax, vMin)); + HWY_ASSERT_VEC_EQ(d, SaturatedSub(vMin, vMin), SaturatedSub(vMin, vMin)); + } +}; + +HWY_NOINLINE void TestAllSaturatingArithmetic() { + const ForPartialVectors<TestUnsignedSaturatingArithmetic> test_unsigned; + test_unsigned(uint8_t()); + test_unsigned(uint16_t()); + + const ForPartialVectors<TestSignedSaturatingArithmetic> test_signed; + test_signed(int8_t()); + test_signed(int16_t()); + + const ForPartialVectors<TestSaturatingArithmeticOverflow> 
test_overflow; + test_overflow(int8_t()); + test_overflow(uint8_t()); + test_overflow(int16_t()); + test_overflow(uint16_t()); +} + +struct TestAverage { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto v1 = Set(d, T(1)); + const auto v2 = Set(d, T(2)); + + HWY_ASSERT_VEC_EQ(d, v0, AverageRound(v0, v0)); + HWY_ASSERT_VEC_EQ(d, v1, AverageRound(v0, v1)); + HWY_ASSERT_VEC_EQ(d, v1, AverageRound(v1, v1)); + HWY_ASSERT_VEC_EQ(d, v2, AverageRound(v1, v2)); + HWY_ASSERT_VEC_EQ(d, v2, AverageRound(v2, v2)); + } +}; + +HWY_NOINLINE void TestAllAverage() { + const ForPartialVectors<TestAverage> test; + test(uint8_t()); + test(uint16_t()); +} + +struct TestAbs { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto vp1 = Set(d, T(1)); + const auto vn1 = Set(d, T(-1)); + const auto vpm = Set(d, LimitsMax<T>()); + const auto vnm = Set(d, LimitsMin<T>()); + + HWY_ASSERT_VEC_EQ(d, v0, Abs(v0)); + HWY_ASSERT_VEC_EQ(d, vp1, Abs(vp1)); + HWY_ASSERT_VEC_EQ(d, vp1, Abs(vn1)); + HWY_ASSERT_VEC_EQ(d, vpm, Abs(vpm)); + HWY_ASSERT_VEC_EQ(d, vnm, Abs(vnm)); + } +}; + +struct TestFloatAbs { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto vp1 = Set(d, T(1)); + const auto vn1 = Set(d, T(-1)); + const auto vp2 = Set(d, T(0.01)); + const auto vn2 = Set(d, T(-0.01)); + + HWY_ASSERT_VEC_EQ(d, v0, Abs(v0)); + HWY_ASSERT_VEC_EQ(d, vp1, Abs(vp1)); + HWY_ASSERT_VEC_EQ(d, vp1, Abs(vn1)); + HWY_ASSERT_VEC_EQ(d, vp2, Abs(vp2)); + HWY_ASSERT_VEC_EQ(d, vp2, Abs(vn2)); + } +}; + +HWY_NOINLINE void TestAllAbs() { + ForSignedTypes(ForPartialVectors<TestAbs>()); + ForFloatTypes(ForPartialVectors<TestFloatAbs>()); +} + +struct TestNeg { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto vn = Set(d, T(-3)); + const 
auto vp = Set(d, T(3)); + HWY_ASSERT_VEC_EQ(d, v0, Neg(v0)); + HWY_ASSERT_VEC_EQ(d, vp, Neg(vn)); + HWY_ASSERT_VEC_EQ(d, vn, Neg(vp)); + } +}; + +struct TestNegOverflow { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto vn = Set(d, LimitsMin<T>()); + const auto vp = Set(d, LimitsMax<T>()); + HWY_ASSERT_VEC_EQ(d, Neg(vn), Neg(vn)); + HWY_ASSERT_VEC_EQ(d, Neg(vp), Neg(vp)); + } +}; + +HWY_NOINLINE void TestAllNeg() { + ForSignedTypes(ForPartialVectors<TestNeg>()); + ForFloatTypes(ForPartialVectors<TestNeg>()); + ForSignedTypes(ForPartialVectors<TestNegOverflow>()); +} + +struct TestUnsignedMinMax { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + // Leave headroom such that v1 < v2 even after wraparound. + const auto mod = And(Iota(d, 0), Set(d, LimitsMax<T>() >> 1)); + const auto v1 = Add(mod, Set(d, 1)); + const auto v2 = Add(mod, Set(d, 2)); + HWY_ASSERT_VEC_EQ(d, v1, Min(v1, v2)); + HWY_ASSERT_VEC_EQ(d, v2, Max(v1, v2)); + HWY_ASSERT_VEC_EQ(d, v0, Min(v1, v0)); + HWY_ASSERT_VEC_EQ(d, v1, Max(v1, v0)); + + const auto vmin = Set(d, LimitsMin<T>()); + const auto vmax = Set(d, LimitsMax<T>()); + + HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, vmax)); + HWY_ASSERT_VEC_EQ(d, vmin, Min(vmax, vmin)); + + HWY_ASSERT_VEC_EQ(d, vmax, Max(vmin, vmax)); + HWY_ASSERT_VEC_EQ(d, vmax, Max(vmax, vmin)); + } +}; + +struct TestSignedMinMax { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + // Leave headroom such that v1 < v2 even after wraparound. 
+ const auto mod = And(Iota(d, 0), Set(d, LimitsMax<T>() >> 1)); + const auto v1 = Add(mod, Set(d, 1)); + const auto v2 = Add(mod, Set(d, 2)); + const auto v_neg = Sub(Zero(d), v1); + HWY_ASSERT_VEC_EQ(d, v1, Min(v1, v2)); + HWY_ASSERT_VEC_EQ(d, v2, Max(v1, v2)); + HWY_ASSERT_VEC_EQ(d, v_neg, Min(v1, v_neg)); + HWY_ASSERT_VEC_EQ(d, v1, Max(v1, v_neg)); + + const auto v0 = Zero(d); + const auto vmin = Set(d, LimitsMin<T>()); + const auto vmax = Set(d, LimitsMax<T>()); + HWY_ASSERT_VEC_EQ(d, vmin, Min(v0, vmin)); + HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, v0)); + HWY_ASSERT_VEC_EQ(d, v0, Max(v0, vmin)); + HWY_ASSERT_VEC_EQ(d, v0, Max(vmin, v0)); + + HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, vmax)); + HWY_ASSERT_VEC_EQ(d, vmin, Min(vmax, vmin)); + + HWY_ASSERT_VEC_EQ(d, vmax, Max(vmin, vmax)); + HWY_ASSERT_VEC_EQ(d, vmax, Max(vmax, vmin)); + } +}; + +struct TestFloatMinMax { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v1 = Iota(d, 1); + const auto v2 = Iota(d, 2); + const auto v_neg = Iota(d, -T(Lanes(d))); + HWY_ASSERT_VEC_EQ(d, v1, Min(v1, v2)); + HWY_ASSERT_VEC_EQ(d, v2, Max(v1, v2)); + HWY_ASSERT_VEC_EQ(d, v_neg, Min(v1, v_neg)); + HWY_ASSERT_VEC_EQ(d, v1, Max(v1, v_neg)); + + const auto v0 = Zero(d); + const auto vmin = Set(d, T(-1E30)); + const auto vmax = Set(d, T(1E30)); + HWY_ASSERT_VEC_EQ(d, vmin, Min(v0, vmin)); + HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, v0)); + HWY_ASSERT_VEC_EQ(d, v0, Max(v0, vmin)); + HWY_ASSERT_VEC_EQ(d, v0, Max(vmin, v0)); + + HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, vmax)); + HWY_ASSERT_VEC_EQ(d, vmin, Min(vmax, vmin)); + + HWY_ASSERT_VEC_EQ(d, vmax, Max(vmin, vmax)); + HWY_ASSERT_VEC_EQ(d, vmax, Max(vmax, vmin)); + } +}; + +HWY_NOINLINE void TestAllMinMax() { + ForUnsignedTypes(ForPartialVectors<TestUnsignedMinMax>()); + ForSignedTypes(ForPartialVectors<TestSignedMinMax>()); + ForFloatTypes(ForPartialVectors<TestFloatMinMax>()); +} + +template <class D> +static HWY_NOINLINE Vec<D> Make128(D d, 
uint64_t hi, uint64_t lo) { + alignas(16) uint64_t in[2]; + in[0] = lo; + in[1] = hi; + return LoadDup128(d, in); +} + +struct TestMinMax128 { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using V = Vec<D>; + const size_t N = Lanes(d); + auto a_lanes = AllocateAligned<T>(N); + auto b_lanes = AllocateAligned<T>(N); + auto min_lanes = AllocateAligned<T>(N); + auto max_lanes = AllocateAligned<T>(N); + RandomState rng; + + const V v00 = Zero(d); + const V v01 = Make128(d, 0, 1); + const V v10 = Make128(d, 1, 0); + const V v11 = Add(v01, v10); + + // Same arg + HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v00)); + HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v01)); + HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v10)); + HWY_ASSERT_VEC_EQ(d, v11, Min128(d, v11, v11)); + HWY_ASSERT_VEC_EQ(d, v00, Max128(d, v00, v00)); + HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v01)); + HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v10)); + HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v11)); + + // First arg less + HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v01)); + HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v10)); + HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v11)); + HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v00, v01)); + HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v01, v10)); + HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v10, v11)); + + // Second arg less + HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v01, v00)); + HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v10, v01)); + HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v11, v10)); + HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v00)); + HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v01)); + HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v10)); + + // Also check 128-bit blocks are independent + for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { + for (size_t i = 0; i < N; ++i) { + a_lanes[i] = Random64(&rng); + b_lanes[i] = Random64(&rng); + } + const V a = Load(d, a_lanes.get()); + const V b = Load(d, b_lanes.get()); + for (size_t i = 0; i < N; i += 2) { + const bool lt = a_lanes[i + 
1] == b_lanes[i + 1] + ? (a_lanes[i] < b_lanes[i]) + : (a_lanes[i + 1] < b_lanes[i + 1]); + min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0]; + min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1]; + max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0]; + max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1]; + } + HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128(d, a, b)); + HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128(d, a, b)); + } + } +}; + +HWY_NOINLINE void TestAllMinMax128() { + ForGEVectors<128, TestMinMax128>()(uint64_t()); +} + +struct TestMinMax128Upper { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using V = Vec<D>; + const size_t N = Lanes(d); + auto a_lanes = AllocateAligned<T>(N); + auto b_lanes = AllocateAligned<T>(N); + auto min_lanes = AllocateAligned<T>(N); + auto max_lanes = AllocateAligned<T>(N); + RandomState rng; + + const V v00 = Zero(d); + const V v01 = Make128(d, 0, 1); + const V v10 = Make128(d, 1, 0); + const V v11 = Add(v01, v10); + + // Same arg + HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v00, v00)); + HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v01)); + HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v10, v10)); + HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v11, v11)); + HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v00, v00)); + HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v01, v01)); + HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v10)); + HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v11, v11)); + + // Equivalent but not equal (chooses second arg) + HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v00, v01)); + HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v10, v11)); + HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v01, v00)); + HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v11, v10)); + HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v01, v00)); + HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v11, v10)); + HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v00, v01)); + HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v10, v11)); + + // First arg less + 
HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v10)); + HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v01, v10)); + + // Second arg less + HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v10, v01)); + HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v01)); + + // Also check 128-bit blocks are independent + for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { + for (size_t i = 0; i < N; ++i) { + a_lanes[i] = Random64(&rng); + b_lanes[i] = Random64(&rng); + } + const V a = Load(d, a_lanes.get()); + const V b = Load(d, b_lanes.get()); + for (size_t i = 0; i < N; i += 2) { + const bool lt = a_lanes[i + 1] < b_lanes[i + 1]; + min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0]; + min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1]; + max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0]; + max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1]; + } + HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128Upper(d, a, b)); + HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128Upper(d, a, b)); + } + } +}; + +HWY_NOINLINE void TestAllMinMax128Upper() { + ForGEVectors<128, TestMinMax128Upper>()(uint64_t()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyArithmeticTest); +HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus); +HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic); +HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage); +HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs); +HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg); +HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax); +HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128); +HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128Upper); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/blockwise_shift_test.cc b/third_party/highway/hwy/tests/blockwise_shift_test.cc new file mode 100644 index 0000000000..4e5250841b --- 
/dev/null +++ b/third_party/highway/hwy/tests/blockwise_shift_test.cc @@ -0,0 +1,270 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stddef.h> +#include <stdint.h> +#include <string.h> // memcpy + +#include <algorithm> // std::fill + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/blockwise_shift_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestShiftBytes { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + // Scalar does not define Shift*Bytes. 
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE + const Repartition<uint8_t, D> du8; + const size_t N8 = Lanes(du8); + + // Zero remains zero + const auto v0 = Zero(d); + HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(v0)); + HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(d, v0)); + HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(d, v0)); + + // Zero after shifting out the high/low byte + auto bytes = AllocateAligned<uint8_t>(N8); + std::fill(bytes.get(), bytes.get() + N8, 0); + bytes[N8 - 1] = 0x7F; + const auto vhi = BitCast(d, Load(du8, bytes.get())); + bytes[N8 - 1] = 0; + bytes[0] = 0x7F; + const auto vlo = BitCast(d, Load(du8, bytes.get())); + HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(vhi)); + HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(d, vhi)); + HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(d, vlo)); + + // Check expected result with Iota + const size_t N = Lanes(d); + auto in = AllocateAligned<T>(N); + const uint8_t* in_bytes = reinterpret_cast<const uint8_t*>(in.get()); + const auto v = BitCast(d, Iota(du8, 1)); + Store(v, d, in.get()); + + auto expected = AllocateAligned<T>(N); + uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get()); + + const size_t block_size = HWY_MIN(N8, 16); + for (size_t block = 0; block < N8; block += block_size) { + expected_bytes[block] = 0; + memcpy(expected_bytes + block + 1, in_bytes + block, block_size - 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(v)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(d, v)); + + for (size_t block = 0; block < N8; block += block_size) { + memcpy(expected_bytes + block, in_bytes + block + 1, block_size - 1); + expected_bytes[block + block_size - 1] = 0; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightBytes<1>(d, v)); +#else + (void)d; +#endif // #if HWY_TARGET != HWY_SCALAR + } +}; + +HWY_NOINLINE void TestAllShiftBytes() { + ForIntegerTypes(ForPartialVectors<TestShiftBytes>()); +} + +struct TestShiftLeftLanes { + template <class T, class D> + HWY_NOINLINE void 
operator()(T /*unused*/, D d) { + // Scalar does not define Shift*Lanes. +#if HWY_TARGET != HWY_SCALAR || HWY_IDE + const auto v = Iota(d, T(1)); + const size_t N = Lanes(d); + if (N == 1) return; + auto expected = AllocateAligned<T>(N); + + HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(v)); + HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(d, v)); + + constexpr size_t kLanesPerBlock = 16 / sizeof(T); + + for (size_t i = 0; i < N; ++i) { + expected[i] = (i % kLanesPerBlock) == 0 ? T(0) : T(i); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(v)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(d, v)); +#else + (void)d; +#endif // #if HWY_TARGET != HWY_SCALAR + } +}; + +struct TestShiftRightLanes { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + // Scalar does not define Shift*Lanes. +#if HWY_TARGET != HWY_SCALAR || HWY_IDE + const auto v = Iota(d, T(1)); + const size_t N = Lanes(d); + if (N == 1) return; + auto expected = AllocateAligned<T>(N); + + HWY_ASSERT_VEC_EQ(d, v, ShiftRightLanes<0>(d, v)); + + constexpr size_t kLanesPerBlock = 16 / sizeof(T); + + for (size_t i = 0; i < N; ++i) { + const size_t mod = i % kLanesPerBlock; + expected[i] = mod == (kLanesPerBlock - 1) || i >= N - 1 ? T(0) : T(2 + i); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightLanes<1>(d, v)); +#else + (void)d; +#endif // #if HWY_TARGET != HWY_SCALAR + } +}; + +HWY_NOINLINE void TestAllShiftLeftLanes() { + ForAllTypes(ForPartialVectors<TestShiftLeftLanes>()); +} + +HWY_NOINLINE void TestAllShiftRightLanes() { + ForAllTypes(ForPartialVectors<TestShiftRightLanes>()); +} + +// Scalar does not define CombineShiftRightBytes. 
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE + +template <int kBytes> +struct TestCombineShiftRightBytes { + template <class T, class D> + HWY_NOINLINE void operator()(T, D d) { + constexpr size_t kBlockSize = 16; + static_assert(kBytes < kBlockSize, "Shift count is per block"); + const Repartition<uint8_t, D> d8; + const size_t N8 = Lanes(d8); + if (N8 < 16) return; + auto hi_bytes = AllocateAligned<uint8_t>(N8); + auto lo_bytes = AllocateAligned<uint8_t>(N8); + auto expected_bytes = AllocateAligned<uint8_t>(N8); + uint8_t combined[2 * kBlockSize]; + + // Random inputs in each lane + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(100); ++rep) { + for (size_t i = 0; i < N8; ++i) { + hi_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF); + lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF); + } + for (size_t i = 0; i < N8; i += kBlockSize) { + // Arguments are not the same size. + CopyBytes<kBlockSize>(&lo_bytes[i], combined); + CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize); + CopyBytes<kBlockSize>(combined + kBytes, &expected_bytes[i]); + } + + const auto hi = BitCast(d, Load(d8, hi_bytes.get())); + const auto lo = BitCast(d, Load(d8, lo_bytes.get())); + const auto expected = BitCast(d, Load(d8, expected_bytes.get())); + HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightBytes<kBytes>(d, hi, lo)); + } + } +}; + +template <int kLanes> +struct TestCombineShiftRightLanes { + template <class T, class D> + HWY_NOINLINE void operator()(T, D d) { + const Repartition<uint8_t, D> d8; + const size_t N8 = Lanes(d8); + if (N8 < 16) return; + + auto hi_bytes = AllocateAligned<uint8_t>(N8); + auto lo_bytes = AllocateAligned<uint8_t>(N8); + auto expected_bytes = AllocateAligned<uint8_t>(N8); + constexpr size_t kBlockSize = 16; + uint8_t combined[2 * kBlockSize]; + + // Random inputs in each lane + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(100); ++rep) { + for (size_t i = 0; i < N8; ++i) { + hi_bytes[i] = 
static_cast<uint8_t>(Random64(&rng) & 0xFF); + lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF); + } + for (size_t i = 0; i < N8; i += kBlockSize) { + // Arguments are not the same size. + CopyBytes<kBlockSize>(&lo_bytes[i], combined); + CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize); + CopyBytes<kBlockSize>(combined + kLanes * sizeof(T), + &expected_bytes[i]); + } + + const auto hi = BitCast(d, Load(d8, hi_bytes.get())); + const auto lo = BitCast(d, Load(d8, lo_bytes.get())); + const auto expected = BitCast(d, Load(d8, expected_bytes.get())); + HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightLanes<kLanes>(d, hi, lo)); + } + } +}; + +#endif // #if HWY_TARGET != HWY_SCALAR + +struct TestCombineShiftRight { + template <class T, class D> + HWY_NOINLINE void operator()(T t, D d) { +// Scalar does not define CombineShiftRightBytes. +#if HWY_TARGET != HWY_SCALAR || HWY_IDE + constexpr int kMaxBytes = + HWY_MIN(16, static_cast<int>(MaxLanes(d) * sizeof(T))); + constexpr int kMaxLanes = kMaxBytes / static_cast<int>(sizeof(T)); + TestCombineShiftRightBytes<kMaxBytes - 1>()(t, d); + TestCombineShiftRightBytes<HWY_MAX(kMaxBytes / 2, 1)>()(t, d); + TestCombineShiftRightBytes<1>()(t, d); + + TestCombineShiftRightLanes<kMaxLanes - 1>()(t, d); + TestCombineShiftRightLanes<HWY_MAX(kMaxLanes / 2, -1)>()(t, d); + TestCombineShiftRightLanes<1>()(t, d); +#else + (void)t; + (void)d; +#endif + } +}; + +HWY_NOINLINE void TestAllCombineShiftRight() { + // Need at least 2 lanes. 
+ ForAllTypes(ForShrinkableVectors<TestCombineShiftRight>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyBlockwiseShiftTest); +HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllShiftBytes); +HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllShiftLeftLanes); +HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllShiftRightLanes); +HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllCombineShiftRight); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/blockwise_test.cc b/third_party/highway/hwy/tests/blockwise_test.cc new file mode 100644 index 0000000000..e5ac9ab362 --- /dev/null +++ b/third_party/highway/hwy/tests/blockwise_test.cc @@ -0,0 +1,454 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include <algorithm> // std::fill + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/blockwise_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template <typename D, int kLane> +struct TestBroadcastR { + HWY_NOINLINE void operator()() const { + using T = typename D::T; + const D d; + const size_t N = Lanes(d); + if (kLane >= N) return; + auto in_lanes = AllocateAligned<T>(N); + std::fill(in_lanes.get(), in_lanes.get() + N, T(0)); + const size_t blockN = HWY_MIN(N * sizeof(T), 16) / sizeof(T); + // Need to set within each 128-bit block + for (size_t block = 0; block < N; block += blockN) { + in_lanes[block + kLane] = static_cast<T>(block + 1); + } + const auto in = Load(d, in_lanes.get()); + auto expected = AllocateAligned<T>(N); + for (size_t block = 0; block < N; block += blockN) { + for (size_t i = 0; i < blockN; ++i) { + expected[block + i] = T(block + 1); + } + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Broadcast<kLane>(in)); + + TestBroadcastR<D, kLane - 1>()(); + } +}; + +template <class D> +struct TestBroadcastR<D, -1> { + void operator()() const {} +}; + +struct TestBroadcast { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + TestBroadcastR<D, HWY_MIN(MaxLanes(d), 16 / sizeof(T)) - 1>()(); + } +}; + +HWY_NOINLINE void TestAllBroadcast() { + const ForPartialVectors<TestBroadcast> test; + // No u/i8. 
+ test(uint16_t()); + test(int16_t()); + ForUIF3264(test); +} + +template <bool kFull> +struct ChooseTableSize { + template <typename T, typename DIdx> + using type = DIdx; +}; +template <> +struct ChooseTableSize<true> { + template <typename T, typename DIdx> + using type = ScalableTag<T>; +}; + +template <bool kFull> +struct TestTableLookupBytes { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { +#if HWY_TARGET != HWY_SCALAR + RandomState rng; + + const typename ChooseTableSize<kFull>::template type<T, D> d_tbl; + const Repartition<uint8_t, decltype(d_tbl)> d_tbl8; + const size_t NT8 = Lanes(d_tbl8); + + const Repartition<uint8_t, D> d8; + const size_t N8 = Lanes(d8); + + // Random input bytes + auto in_bytes = AllocateAligned<uint8_t>(NT8); + for (size_t i = 0; i < NT8; ++i) { + in_bytes[i] = Random32(&rng) & 0xFF; + } + const auto in = BitCast(d_tbl, Load(d_tbl8, in_bytes.get())); + + // Enough test data; for larger vectors, upper lanes will be zero. + const uint8_t index_bytes_source[64] = { + // Same index as source, multiple outputs from same input, + // unused input (9), ascending/descending and nonconsecutive neighbors. + 0, 2, 1, 2, 15, 12, 13, 14, 6, 7, 8, 5, 4, 3, 10, 11, + 11, 10, 3, 4, 5, 8, 7, 6, 14, 13, 12, 15, 2, 1, 2, 0, + 4, 3, 2, 2, 5, 6, 7, 7, 15, 15, 15, 15, 15, 15, 0, 1}; + auto index_bytes = AllocateAligned<uint8_t>(N8); + const size_t max_index = HWY_MIN(NT8, 16) - 1; + for (size_t i = 0; i < N8; ++i) { + index_bytes[i] = (i < 64) ? index_bytes_source[i] : 0; + // Avoid asan error for partial vectors. 
+ index_bytes[i] = static_cast<uint8_t>(HWY_MIN(index_bytes[i], max_index)); + } + const auto indices = Load(d, reinterpret_cast<const T*>(index_bytes.get())); + + const size_t N = Lanes(d); + auto expected = AllocateAligned<T>(N); + uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get()); + + for (size_t block = 0; block < N8; block += 16) { + for (size_t i = 0; i < 16 && (block + i) < N8; ++i) { + const uint8_t index = index_bytes[block + i]; + HWY_ASSERT(index <= max_index); + // Note that block + index may exceed NT8 on RVV, which is fine because + // the operation uses the larger of the table and index vector size. + HWY_ASSERT(block + index < HWY_MAX(N8, NT8)); + // For large vectors, the lane index may wrap around due to block, + // also wrap around after 8-bit overflow. + expected_bytes[block + i] = + in_bytes[(block + index) % HWY_MIN(NT8, 256)]; + } + } + HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytes(in, indices)); + + // Individually test zeroing each byte position. + for (size_t i = 0; i < N8; ++i) { + const uint8_t prev_expected = expected_bytes[i]; + const uint8_t prev_index = index_bytes[i]; + expected_bytes[i] = 0; + + const int idx = 0x80 + (static_cast<int>(Random32(&rng) & 7) << 4); + HWY_ASSERT(0x80 <= idx && idx < 256); + index_bytes[i] = static_cast<uint8_t>(idx); + + const auto indices = + Load(d, reinterpret_cast<const T*>(index_bytes.get())); + HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytesOr0(in, indices)); + expected_bytes[i] = prev_expected; + index_bytes[i] = prev_index; + } +#else + (void)d; +#endif + } +}; + +HWY_NOINLINE void TestAllTableLookupBytesSame() { + // Partial index, same-sized table. + ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<false>>()); +} + +HWY_NOINLINE void TestAllTableLookupBytesMixed() { + // Partial index, full-size table. 
+ ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<true>>()); +} + +struct TestInterleaveLower { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TU = MakeUnsigned<T>; + const size_t N = Lanes(d); + auto even_lanes = AllocateAligned<T>(N); + auto odd_lanes = AllocateAligned<T>(N); + auto expected = AllocateAligned<T>(N); + for (size_t i = 0; i < N; ++i) { + even_lanes[i] = static_cast<T>(2 * i + 0); + odd_lanes[i] = static_cast<T>(2 * i + 1); + } + const auto even = Load(d, even_lanes.get()); + const auto odd = Load(d, odd_lanes.get()); + + const size_t blockN = HWY_MIN(16 / sizeof(T), N); + for (size_t i = 0; i < Lanes(d); ++i) { + const size_t block = i / blockN; + const size_t index = (i % blockN) + block * 2 * blockN; + expected[i] = static_cast<T>(index & LimitsMax<TU>()); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(even, odd)); + HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(d, even, odd)); + } +}; + +struct TestInterleaveUpper { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + if (N == 1) return; + auto even_lanes = AllocateAligned<T>(N); + auto odd_lanes = AllocateAligned<T>(N); + auto expected = AllocateAligned<T>(N); + for (size_t i = 0; i < N; ++i) { + even_lanes[i] = static_cast<T>(2 * i + 0); + odd_lanes[i] = static_cast<T>(2 * i + 1); + } + const auto even = Load(d, even_lanes.get()); + const auto odd = Load(d, odd_lanes.get()); + + const size_t blockN = HWY_MIN(16 / sizeof(T), N); + for (size_t i = 0; i < Lanes(d); ++i) { + const size_t block = i / blockN; + expected[i] = T((i % blockN) + block * 2 * blockN + blockN); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveUpper(d, even, odd)); + } +}; + +HWY_NOINLINE void TestAllInterleave() { + // Not DemoteVectors because this cannot be supported by HWY_SCALAR. 
+ ForAllTypes(ForShrinkableVectors<TestInterleaveLower>()); + ForAllTypes(ForShrinkableVectors<TestInterleaveUpper>()); +} + +struct TestZipLower { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using WideT = MakeWide<T>; + static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width"); + static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign"); + const size_t N = Lanes(d); + auto even_lanes = AllocateAligned<T>(N); + auto odd_lanes = AllocateAligned<T>(N); + // At least 2 lanes for HWY_SCALAR + auto zip_lanes = AllocateAligned<T>(HWY_MAX(N, 2)); + const T kMaxT = LimitsMax<T>(); + for (size_t i = 0; i < N; ++i) { + even_lanes[i] = static_cast<T>((2 * i + 0) & kMaxT); + odd_lanes[i] = static_cast<T>((2 * i + 1) & kMaxT); + } + const auto even = Load(d, even_lanes.get()); + const auto odd = Load(d, odd_lanes.get()); + + const Repartition<WideT, D> dw; +#if HWY_TARGET == HWY_SCALAR + // Safely handle big-endian + const auto expected = Set(dw, static_cast<WideT>(1ULL << (sizeof(T) * 8))); +#else + const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N); + for (size_t i = 0; i < N; i += 2) { + const size_t base = (i / blockN) * blockN; + const size_t mod = i % blockN; + zip_lanes[i + 0] = even_lanes[mod / 2 + base]; + zip_lanes[i + 1] = odd_lanes[mod / 2 + base]; + } + const auto expected = + Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get())); +#endif // HWY_TARGET == HWY_SCALAR + HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(even, odd)); + HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(dw, even, odd)); + } +}; + +HWY_NOINLINE void TestAllZipLower() { + const ForDemoteVectors<TestZipLower> lower_unsigned; + lower_unsigned(uint8_t()); + lower_unsigned(uint16_t()); +#if HWY_HAVE_INTEGER64 + lower_unsigned(uint32_t()); // generates u64 +#endif + + const ForDemoteVectors<TestZipLower> lower_signed; + lower_signed(int8_t()); + lower_signed(int16_t()); +#if HWY_HAVE_INTEGER64 + lower_signed(int32_t()); // generates i64 
+#endif + + // No float - concatenating f32 does not result in a f64 +} + +// Remove this test (so it does not show as having run) if the only target is +// HWY_SCALAR, which does not support this op. +#if HWY_TARGETS != HWY_SCALAR + +struct TestZipUpper { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { +#if HWY_TARGET == HWY_SCALAR + (void)d; +#else + using WideT = MakeWide<T>; + static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width"); + static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign"); + const size_t N = Lanes(d); + if (N < 16 / sizeof(T)) return; + auto even_lanes = AllocateAligned<T>(N); + auto odd_lanes = AllocateAligned<T>(N); + auto zip_lanes = AllocateAligned<T>(N); + const T kMaxT = LimitsMax<T>(); + for (size_t i = 0; i < N; ++i) { + even_lanes[i] = static_cast<T>((2 * i + 0) & kMaxT); + odd_lanes[i] = static_cast<T>((2 * i + 1) & kMaxT); + } + const auto even = Load(d, even_lanes.get()); + const auto odd = Load(d, odd_lanes.get()); + + const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N); + + for (size_t i = 0; i < N; i += 2) { + const size_t base = (i / blockN) * blockN + blockN / 2; + const size_t mod = i % blockN; + zip_lanes[i + 0] = even_lanes[mod / 2 + base]; + zip_lanes[i + 1] = odd_lanes[mod / 2 + base]; + } + const Repartition<WideT, D> dw; + const auto expected = + Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get())); + HWY_ASSERT_VEC_EQ(dw, expected, ZipUpper(dw, even, odd)); +#endif // HWY_TARGET == HWY_SCALAR + } +}; + +HWY_NOINLINE void TestAllZipUpper() { + const ForShrinkableVectors<TestZipUpper> upper_unsigned; + upper_unsigned(uint8_t()); + upper_unsigned(uint16_t()); +#if HWY_HAVE_INTEGER64 + upper_unsigned(uint32_t()); // generates u64 +#endif + + const ForShrinkableVectors<TestZipUpper> upper_signed; + upper_signed(int8_t()); + upper_signed(int16_t()); +#if HWY_HAVE_INTEGER64 + upper_signed(int32_t()); // generates i64 +#endif + + // No float - 
concatenating f32 does not result in a f64 +} + +#endif // HWY_TARGETS != HWY_SCALAR + +class TestSpecialShuffle32 { + public: + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v = Iota(d, 0); + VerifyLanes32(d, Shuffle2301(v), 2, 3, 0, 1, __FILE__, __LINE__); + VerifyLanes32(d, Shuffle1032(v), 1, 0, 3, 2, __FILE__, __LINE__); + VerifyLanes32(d, Shuffle0321(v), 0, 3, 2, 1, __FILE__, __LINE__); + VerifyLanes32(d, Shuffle2103(v), 2, 1, 0, 3, __FILE__, __LINE__); + VerifyLanes32(d, Shuffle0123(v), 0, 1, 2, 3, __FILE__, __LINE__); + } + + private: + // HWY_INLINE works around a Clang SVE compiler bug where all but the first + // 128 bits (the NEON register) of actual are zero. + template <class D, class V> + HWY_INLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3, + const size_t i2, const size_t i1, + const size_t i0, const char* filename, + const int line) { + using T = TFromD<D>; + constexpr size_t kBlockN = 16 / sizeof(T); + const size_t N = Lanes(d); + if (N < 4) return; + auto expected = AllocateAligned<T>(N); + for (size_t block = 0; block < N; block += kBlockN) { + expected[block + 3] = static_cast<T>(block + i3); + expected[block + 2] = static_cast<T>(block + i2); + expected[block + 1] = static_cast<T>(block + i1); + expected[block + 0] = static_cast<T>(block + i0); + } + AssertVecEqual(d, expected.get(), actual, filename, line); + } +}; + +class TestSpecialShuffle64 { + public: + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v = Iota(d, 0); + VerifyLanes64(d, Shuffle01(v), 0, 1, __FILE__, __LINE__); + } + + private: + // HWY_INLINE works around a Clang SVE compiler bug where all but the first + // 128 bits (the NEON register) of actual are zero. 
+ template <class D, class V> + HWY_INLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1, + const size_t i0, const char* filename, + const int line) { + using T = TFromD<D>; + constexpr size_t kBlockN = 16 / sizeof(T); + const size_t N = Lanes(d); + if (N < 2) return; + auto expected = AllocateAligned<T>(N); + for (size_t block = 0; block < N; block += kBlockN) { + expected[block + 1] = static_cast<T>(block + i1); + expected[block + 0] = static_cast<T>(block + i0); + } + AssertVecEqual(d, expected.get(), actual, filename, line); + } +}; + +HWY_NOINLINE void TestAllSpecialShuffles() { + const ForGEVectors<128, TestSpecialShuffle32> test32; + test32(uint32_t()); + test32(int32_t()); + test32(float()); + +#if HWY_HAVE_INTEGER64 + const ForGEVectors<128, TestSpecialShuffle64> test64; + test64(uint64_t()); + test64(int64_t()); +#endif + +#if HWY_HAVE_FLOAT64 + const ForGEVectors<128, TestSpecialShuffle64> test_d; + test_d(double()); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyBlockwiseTest); +HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllBroadcast); +HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesSame); +HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesMixed); +HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllInterleave); +HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipLower); +#if HWY_TARGETS != HWY_SCALAR +HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipUpper); +#endif +HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllSpecialShuffles); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/combine_test.cc b/third_party/highway/hwy/tests/combine_test.cc new file mode 100644 index 0000000000..e2f4cbeb00 --- /dev/null +++ b/third_party/highway/hwy/tests/combine_test.cc @@ -0,0 +1,275 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stddef.h> +#include <stdint.h> +#include <string.h> // memcpy + +#include <algorithm> // std::fill + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/combine_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestLowerHalf { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const Half<D> d2; + + const size_t N = Lanes(d); + auto lanes = AllocateAligned<T>(N); + auto lanes2 = AllocateAligned<T>(N); + std::fill(lanes.get(), lanes.get() + N, T(0)); + std::fill(lanes2.get(), lanes2.get() + N, T(0)); + const auto v = Iota(d, 1); + Store(LowerHalf(d2, v), d2, lanes.get()); + Store(LowerHalf(v), d2, lanes2.get()); // optionally without D + size_t i = 0; + for (; i < Lanes(d2); ++i) { + HWY_ASSERT_EQ(T(1 + i), lanes[i]); + HWY_ASSERT_EQ(T(1 + i), lanes2[i]); + } + // Other half remains unchanged + for (; i < N; ++i) { + HWY_ASSERT_EQ(T(0), lanes[i]); + HWY_ASSERT_EQ(T(0), lanes2[i]); + } + } +}; + +struct TestLowerQuarter { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const Half<D> d2; + const Half<decltype(d2)> d4; + + const size_t N = Lanes(d); + auto lanes = AllocateAligned<T>(N); + auto lanes2 = AllocateAligned<T>(N); + std::fill(lanes.get(), lanes.get() + N, 
T(0)); + std::fill(lanes2.get(), lanes2.get() + N, T(0)); + const auto v = Iota(d, 1); + const auto lo = LowerHalf(d4, LowerHalf(d2, v)); + const auto lo2 = LowerHalf(LowerHalf(v)); // optionally without D + Store(lo, d4, lanes.get()); + Store(lo2, d4, lanes2.get()); + size_t i = 0; + for (; i < Lanes(d4); ++i) { + HWY_ASSERT_EQ(T(i + 1), lanes[i]); + HWY_ASSERT_EQ(T(i + 1), lanes2[i]); + } + // Upper 3/4 remain unchanged + for (; i < N; ++i) { + HWY_ASSERT_EQ(T(0), lanes[i]); + HWY_ASSERT_EQ(T(0), lanes2[i]); + } + } +}; + +HWY_NOINLINE void TestAllLowerHalf() { + ForAllTypes(ForHalfVectors<TestLowerHalf>()); + + // The minimum vector size is 128 bits, so there's no guarantee we can have + // quarters of 64-bit lanes, hence test 'all' other types. + ForHalfVectors<TestLowerQuarter, 2> test_quarter; + ForUI8(test_quarter); + ForUI16(test_quarter); // exclude float16_t - cannot compare + ForUIF32(test_quarter); +} + +struct TestUpperHalf { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + // Scalar does not define UpperHalf. +#if HWY_TARGET != HWY_SCALAR + const Half<D> d2; + const size_t N2 = Lanes(d2); + HWY_ASSERT(N2 * 2 == Lanes(d)); + auto expected = AllocateAligned<T>(N2); + size_t i = 0; + for (; i < N2; ++i) { + expected[i] = static_cast<T>(N2 + 1 + i); + } + HWY_ASSERT_VEC_EQ(d2, expected.get(), UpperHalf(d2, Iota(d, 1))); +#else + (void)d; +#endif + } +}; + +HWY_NOINLINE void TestAllUpperHalf() { + ForAllTypes(ForHalfVectors<TestUpperHalf>()); +} + +struct TestZeroExtendVector { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const Twice<D> d2; + + const auto v = Iota(d, 1); + const size_t N = Lanes(d); + const size_t N2 = Lanes(d2); + // If equal, then N was already MaxLanes(d) and it's not clear what + // Combine or ZeroExtendVector should return. 
+ if (N2 == N) return; + HWY_ASSERT(N2 == 2 * N); + auto lanes = AllocateAligned<T>(N2); + Store(v, d, &lanes[0]); + Store(v, d, &lanes[N]); + + const auto ext = ZeroExtendVector(d2, v); + Store(ext, d2, lanes.get()); + + // Lower half is unchanged + HWY_ASSERT_VEC_EQ(d, v, Load(d, &lanes[0])); + // Upper half is zero + HWY_ASSERT_VEC_EQ(d, Zero(d), Load(d, &lanes[N])); + } +}; + +HWY_NOINLINE void TestAllZeroExtendVector() { + ForAllTypes(ForExtendableVectors<TestZeroExtendVector>()); +} + +struct TestCombine { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const Twice<D> d2; + const size_t N2 = Lanes(d2); + auto lanes = AllocateAligned<T>(N2); + + const auto lo = Iota(d, 1); + const auto hi = Iota(d, static_cast<T>(N2 / 2 + 1)); + const auto combined = Combine(d2, hi, lo); + Store(combined, d2, lanes.get()); + + const auto expected = Iota(d2, 1); + HWY_ASSERT_VEC_EQ(d2, expected, combined); + } +}; + +HWY_NOINLINE void TestAllCombine() { + ForAllTypes(ForExtendableVectors<TestCombine>()); +} + +struct TestConcat { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + if (N == 1) return; + const size_t half_bytes = N * sizeof(T) / 2; + + auto hi = AllocateAligned<T>(N); + auto lo = AllocateAligned<T>(N); + auto expected = AllocateAligned<T>(N); + RandomState rng; + for (size_t rep = 0; rep < 10; ++rep) { + for (size_t i = 0; i < N; ++i) { + hi[i] = static_cast<T>(Random64(&rng) & 0xFF); + lo[i] = static_cast<T>(Random64(&rng) & 0xFF); + } + + { + memcpy(&expected[N / 2], &hi[N / 2], half_bytes); + memcpy(&expected[0], &lo[0], half_bytes); + const auto vhi = Load(d, hi.get()); + const auto vlo = Load(d, lo.get()); + HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperLower(d, vhi, vlo)); + } + + { + memcpy(&expected[N / 2], &hi[N / 2], half_bytes); + memcpy(&expected[0], &lo[N / 2], half_bytes); + const auto vhi = Load(d, hi.get()); + const auto vlo = Load(d, lo.get()); + 
HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperUpper(d, vhi, vlo)); + } + + { + memcpy(&expected[N / 2], &hi[0], half_bytes); + memcpy(&expected[0], &lo[N / 2], half_bytes); + const auto vhi = Load(d, hi.get()); + const auto vlo = Load(d, lo.get()); + HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatLowerUpper(d, vhi, vlo)); + } + + { + memcpy(&expected[N / 2], &hi[0], half_bytes); + memcpy(&expected[0], &lo[0], half_bytes); + const auto vhi = Load(d, hi.get()); + const auto vlo = Load(d, lo.get()); + HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatLowerLower(d, vhi, vlo)); + } + } + } +}; + +HWY_NOINLINE void TestAllConcat() { + ForAllTypes(ForShrinkableVectors<TestConcat>()); +} + +struct TestConcatOddEven { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { +#if HWY_TARGET != HWY_SCALAR + const size_t N = Lanes(d); + const auto hi = Iota(d, static_cast<T>(N)); + const auto lo = Iota(d, 0); + const auto even = Add(Iota(d, 0), Iota(d, 0)); + const auto odd = Add(even, Set(d, 1)); + HWY_ASSERT_VEC_EQ(d, odd, ConcatOdd(d, hi, lo)); + HWY_ASSERT_VEC_EQ(d, even, ConcatEven(d, hi, lo)); + + // This test catches inadvertent saturation. 
+ const auto min = Set(d, LowestValue<T>()); + const auto max = Set(d, HighestValue<T>()); + HWY_ASSERT_VEC_EQ(d, max, ConcatOdd(d, max, max)); + HWY_ASSERT_VEC_EQ(d, max, ConcatEven(d, max, max)); + HWY_ASSERT_VEC_EQ(d, min, ConcatOdd(d, min, min)); + HWY_ASSERT_VEC_EQ(d, min, ConcatEven(d, min, min)); +#else + (void)d; +#endif + } +}; + +HWY_NOINLINE void TestAllConcatOddEven() { + ForAllTypes(ForShrinkableVectors<TestConcatOddEven>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyCombineTest); +HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllLowerHalf); +HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllUpperHalf); +HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllZeroExtendVector); +HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombine); +HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllConcat); +HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllConcatOddEven); +} // namespace hwy + +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/tests/compare_test.cc b/third_party/highway/hwy/tests/compare_test.cc new file mode 100644 index 0000000000..a96e29fc62 --- /dev/null +++ b/third_party/highway/hwy/tests/compare_test.cc @@ -0,0 +1,509 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <stddef.h> +#include <stdint.h> +#include <string.h> // memset + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/compare_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// All types. +struct TestEquality { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v2 = Iota(d, 2); + const auto v2b = Iota(d, 2); + const auto v3 = Iota(d, 3); + + const auto mask_false = MaskFalse(d); + const auto mask_true = MaskTrue(d); + + HWY_ASSERT_MASK_EQ(d, mask_false, Eq(v2, v3)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq(v3, v2)); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2)); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2b)); + + HWY_ASSERT_MASK_EQ(d, mask_true, Ne(v2, v3)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne(v3, v2)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne(v2, v2)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne(v2, v2b)); + } +}; + +HWY_NOINLINE void TestAllEquality() { + ForAllTypes(ForPartialVectors<TestEquality>()); +} + +// a > b should be true, verify that for Gt/Lt and with swapped args. 
+template <class D> +void EnsureGreater(D d, TFromD<D> a, TFromD<D> b, const char* file, int line) { + const auto mask_false = MaskFalse(d); + const auto mask_true = MaskTrue(d); + + const auto va = Set(d, a); + const auto vb = Set(d, b); + AssertMaskEqual(d, mask_true, Gt(va, vb), file, line); + AssertMaskEqual(d, mask_false, Lt(va, vb), file, line); + + // Swapped order + AssertMaskEqual(d, mask_false, Gt(vb, va), file, line); + AssertMaskEqual(d, mask_true, Lt(vb, va), file, line); + + // Also ensure irreflexive + AssertMaskEqual(d, mask_false, Gt(va, va), file, line); + AssertMaskEqual(d, mask_false, Gt(vb, vb), file, line); + AssertMaskEqual(d, mask_false, Lt(va, va), file, line); + AssertMaskEqual(d, mask_false, Lt(vb, vb), file, line); +} + +#define HWY_ENSURE_GREATER(d, a, b) EnsureGreater(d, a, b, __FILE__, __LINE__) + +struct TestStrictUnsigned { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const T max = LimitsMax<T>(); + const auto v0 = Zero(d); + const auto v2 = And(Iota(d, T(2)), Set(d, 255)); // 0..255 + + const auto mask_false = MaskFalse(d); + + // Individual values of interest + HWY_ENSURE_GREATER(d, 2, 1); + HWY_ENSURE_GREATER(d, 1, 0); + HWY_ENSURE_GREATER(d, 128, 127); + HWY_ENSURE_GREATER(d, max, max / 2); + HWY_ENSURE_GREATER(d, max, 1); + HWY_ENSURE_GREATER(d, max, 0); + + // Also use Iota to ensure lanes are independent + HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v0)); + HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v2)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0)); + HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2)); + HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2)); + } +}; + +HWY_NOINLINE void TestAllStrictUnsigned() { + ForUnsignedTypes(ForPartialVectors<TestStrictUnsigned>()); +} + +struct TestStrictInt { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const T min = LimitsMin<T>(); + const T max = LimitsMax<T>(); + 
const auto v0 = Zero(d); + const auto v2 = And(Iota(d, T(2)), Set(d, 127)); // 0..127 + const auto vn = Sub(Neg(v2), Set(d, 1)); // -1..-128 + + const auto mask_false = MaskFalse(d); + const auto mask_true = MaskTrue(d); + + // Individual values of interest + HWY_ENSURE_GREATER(d, 2, 1); + HWY_ENSURE_GREATER(d, 1, 0); + HWY_ENSURE_GREATER(d, 0, -1); + HWY_ENSURE_GREATER(d, -1, -2); + HWY_ENSURE_GREATER(d, max, max / 2); + HWY_ENSURE_GREATER(d, max, 1); + HWY_ENSURE_GREATER(d, max, 0); + HWY_ENSURE_GREATER(d, max, -1); + HWY_ENSURE_GREATER(d, max, min); + HWY_ENSURE_GREATER(d, 0, min); + HWY_ENSURE_GREATER(d, min / 2, min); + + // Also use Iota to ensure lanes are independent + HWY_ASSERT_MASK_EQ(d, mask_true, Gt(v2, vn)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt(vn, v2)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, vn)); + HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, v2)); + + HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt(vn, vn)); + HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0)); + HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2)); + HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, vn)); + } +}; + +// S-SSE3 bug (#795): same upper, differing MSB in lower +struct TestStrictInt64 { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto m0 = MaskFalse(d); + const auto m1 = MaskTrue(d); + HWY_ASSERT_MASK_EQ(d, m0, Lt(Set(d, 0x380000000LL), Set(d, 0x300000001LL))); + HWY_ASSERT_MASK_EQ(d, m1, Lt(Set(d, 0xF00000000LL), Set(d, 0xF80000000LL))); + HWY_ASSERT_MASK_EQ(d, m1, Lt(Set(d, 0xF00000000LL), Set(d, 0xF80000001LL))); + } +}; + +HWY_NOINLINE void TestAllStrictInt() { + ForSignedTypes(ForPartialVectors<TestStrictInt>()); + ForPartialVectors<TestStrictInt64>()(int64_t()); +} + +struct TestStrictFloat { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const T huge_neg = T(-1E35); + const T huge_pos = T(1E36); + const auto 
v0 = Zero(d); + const auto v2 = Iota(d, T(2)); + const auto vn = Neg(v2); + + const auto mask_false = MaskFalse(d); + const auto mask_true = MaskTrue(d); + + // Individual values of interest + HWY_ENSURE_GREATER(d, 2, 1); + HWY_ENSURE_GREATER(d, 1, 0); + HWY_ENSURE_GREATER(d, 0, -1); + HWY_ENSURE_GREATER(d, -1, -2); + HWY_ENSURE_GREATER(d, huge_pos, 1); + HWY_ENSURE_GREATER(d, huge_pos, 0); + HWY_ENSURE_GREATER(d, huge_pos, -1); + HWY_ENSURE_GREATER(d, huge_pos, huge_neg); + HWY_ENSURE_GREATER(d, 0, huge_neg); + + // Also use Iota to ensure lanes are independent + HWY_ASSERT_MASK_EQ(d, mask_true, Gt(v2, vn)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt(vn, v2)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, vn)); + HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, v2)); + + HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt(vn, vn)); + HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0)); + HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2)); + HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, vn)); + } +}; + +HWY_NOINLINE void TestAllStrictFloat() { + ForFloatTypes(ForPartialVectors<TestStrictFloat>()); +} + +struct TestWeakFloat { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v2 = Iota(d, T(2)); + const auto vn = Iota(d, -T(Lanes(d))); + + const auto mask_false = MaskFalse(d); + const auto mask_true = MaskTrue(d); + + HWY_ASSERT_MASK_EQ(d, mask_true, Ge(v2, v2)); + HWY_ASSERT_MASK_EQ(d, mask_true, Le(vn, vn)); + + HWY_ASSERT_MASK_EQ(d, mask_true, Ge(v2, vn)); + HWY_ASSERT_MASK_EQ(d, mask_true, Le(vn, v2)); + + HWY_ASSERT_MASK_EQ(d, mask_false, Le(v2, vn)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ge(vn, v2)); + } +}; + +HWY_NOINLINE void TestAllWeakFloat() { + ForFloatTypes(ForPartialVectors<TestWeakFloat>()); +} + +template <class D> +static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) { + alignas(16) uint64_t in[2]; + in[0] = lo; + in[1] = hi; + return 
LoadDup128(d, in); +} + +struct TestLt128 { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using V = Vec<D>; + const V v00 = Zero(d); + const V v01 = Make128(d, 0, 1); + const V v10 = Make128(d, 1, 0); + const V v11 = Add(v01, v10); + + const auto mask_false = MaskFalse(d); + const auto mask_true = MaskTrue(d); + + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v00, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v10)); + + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v10)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v11)); + + // Reversed order + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v11, v01)); + + // Also check 128-bit blocks are independent + const V iota = Iota(d, 1); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v01))); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v10))); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v01), iota)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v10), iota)); + + // Max value + const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>()); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v10)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v11)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v10, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v11, vm)); + } +}; + +HWY_NOINLINE void TestAllLt128() { ForGEVectors<128, TestLt128>()(uint64_t()); } + +struct TestLt128Upper { + template <typename T, class D> + HWY_NOINLINE void 
operator()(T /*unused*/, D d) { + using V = Vec<D>; + const V v00 = Zero(d); + const V v01 = Make128(d, 0, 1); + const V v10 = Make128(d, 1, 0); + const V v11 = Add(v01, v10); + + const auto mask_false = MaskFalse(d); + const auto mask_true = MaskTrue(d); + + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v00, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v01, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v10, v10)); + + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v00, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, v10)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, v11)); + + // Reversed order + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v01, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v10, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v11, v01)); + + // Also check 128-bit blocks are independent + const V iota = Iota(d, 1); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, iota, Add(iota, v01))); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, iota, Add(iota, v10))); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, Add(iota, v01), iota)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, Add(iota, v10), iota)); + + // Max value + const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>()); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v10)); + HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v11)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v00, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v10, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v11, vm)); + } +}; + +HWY_NOINLINE void TestAllLt128Upper() { + ForGEVectors<128, TestLt128Upper>()(uint64_t()); +} + +struct TestEq128 { // Also Ne128 + template <typename T, 
class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using V = Vec<D>; + const V v00 = Zero(d); + const V v01 = Make128(d, 0, 1); + const V v10 = Make128(d, 1, 0); + const V v11 = Add(v01, v10); + + const auto mask_false = MaskFalse(d); + const auto mask_true = MaskTrue(d); + + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v00, v00)); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v01, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v10, v10)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v00, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v01, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v10, v10)); + + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v00, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v10)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v11)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v00, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v10)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v11)); + + // Reversed order + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v00)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v10, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v11, v01)); + + // Also check 128-bit blocks are independent + const V iota = Iota(d, 1); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, iota, Add(iota, v01))); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, iota, Add(iota, v10))); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v01), iota)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v10), iota)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, iota, Add(iota, v01))); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, iota, Add(iota, v10))); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, Add(iota, v01), iota)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, Add(iota, v10), iota)); + + // Max value + const V vm = Make128(d, 
LimitsMax<T>(), LimitsMax<T>()); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, vm, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, vm, vm)); + + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v10)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v11)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v00, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, vm)); + + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v00)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v10)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v11)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v00, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v10, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v11, vm)); + } +}; + +HWY_NOINLINE void TestAllEq128() { ForGEVectors<128, TestEq128>()(uint64_t()); } + +struct TestEq128Upper { // Also Ne128Upper + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using V = Vec<D>; + const V v00 = Zero(d); + const V v01 = Make128(d, 0, 1); + const V v10 = Make128(d, 1, 0); + const V v11 = Add(v01, v10); + + const auto mask_false = MaskFalse(d); + const auto mask_true = MaskTrue(d); + + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v00)); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v10, v10)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v00, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v01, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v10, v10)); + + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v00, v01)); + + 
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v10)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v11)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, v10)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, v11)); + + // Reversed order + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v01, v00)); + + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v10, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v11, v01)); + + // Also check 128-bit blocks are independent + const V iota = Iota(d, 1); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, iota, Add(iota, v01))); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, iota, Add(iota, v01))); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, iota, Add(iota, v10))); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, iota, Add(iota, v10))); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, Add(iota, v01), iota)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, Add(iota, v01), iota)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, Add(iota, v10), iota)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, Add(iota, v10), iota)); + + // Max value + const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>()); + HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, vm, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, vm, vm)); + + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v00)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v01)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v10)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v11)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v00, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, vm)); + HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, vm)); + + 
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v00)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v01)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v10)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v11)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v00, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v10, vm)); + HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v11, vm)); + } +}; + +HWY_NOINLINE void TestAllEq128Upper() { + ForGEVectors<128, TestEq128Upper>()(uint64_t()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyCompareTest); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEquality); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictUnsigned); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128Upper); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEq128); +HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEq128Upper); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/compress_test.cc b/third_party/highway/hwy/tests/compress_test.cc new file mode 100644 index 0000000000..ae008b4dc4 --- /dev/null +++ b/third_party/highway/hwy/tests/compress_test.cc @@ -0,0 +1,833 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stddef.h> +#include <stdint.h> +#include <string.h> // memset + +#include <array> // IWYU pragma: keep + +#include "hwy/base.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/compress_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Regenerate tables used in the implementation, instead of testing. +#define HWY_PRINT_TABLES 0 + +#if !HWY_PRINT_TABLES || HWY_IDE + +template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>> +void CheckStored(D d, DI di, const char* op, size_t expected_pos, + size_t actual_pos, size_t num_to_check, + const AlignedFreeUniquePtr<T[]>& in, + const AlignedFreeUniquePtr<TI[]>& mask_lanes, + const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u, + int line) { + if (expected_pos != actual_pos) { + hwy::Abort(__FILE__, line, + "%s: size mismatch for %s: expected %d, actual %d\n", op, + TypeName(T(), Lanes(d)).c_str(), static_cast<int>(expected_pos), + static_cast<int>(actual_pos)); + } + // Modified from AssertVecEqual - we may not be checking all lanes. 
+ for (size_t i = 0; i < num_to_check; ++i) { + if (!IsEqual(expected[i], actual_u[i])) { + const size_t N = Lanes(d); + fprintf(stderr, "%s: mismatch at i=%d of %d, line %d:\n\n", op, + static_cast<int>(i), static_cast<int>(num_to_check), line); + Print(di, "mask", Load(di, mask_lanes.get()), 0, N); + Print(d, "in", Load(d, in.get()), 0, N); + Print(d, "expect", Load(d, expected.get()), 0, num_to_check); + Print(d, "actual", Load(d, actual_u), 0, num_to_check); + HWY_ASSERT(false); + } + } +} + +struct TestCompress { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + using TI = MakeSigned<T>; // For mask > 0 comparison + using TU = MakeUnsigned<T>; + const Rebind<TI, D> di; + const size_t N = Lanes(d); + + for (int frac : {0, 2, 3}) { + // For CompressStore + const size_t misalign = static_cast<size_t>(frac) * N / 4; + + auto in_lanes = AllocateAligned<T>(N); + auto mask_lanes = AllocateAligned<TI>(N); + auto garbage = AllocateAligned<TU>(N); + auto expected = AllocateAligned<T>(N); + auto actual_a = AllocateAligned<T>(misalign + N); + T* actual_u = actual_a.get() + misalign; + + const size_t bits_size = RoundUpTo((N + 7) / 8, 8); + auto bits = AllocateAligned<uint8_t>(bits_size); + memset(bits.get(), 0, bits_size); // for MSAN + + // Each lane should have a chance of having mask=true. + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + size_t expected_pos = 0; + for (size_t i = 0; i < N; ++i) { + const uint64_t r = Random32(&rng); + in_lanes[i] = T(); // cannot initialize float16_t directly. + CopyBytes<sizeof(T)>(&r, &in_lanes[i]); // not same size + mask_lanes[i] = (Random32(&rng) & 1024) ? 
TI(1) : TI(0); + if (mask_lanes[i] > 0) { + expected[expected_pos++] = in_lanes[i]; + } + garbage[i] = static_cast<TU>(Random64(&rng)); + } + size_t num_to_check; + if (CompressIsPartition<T>::value) { + // For non-native Compress, also check that mask=false lanes were + // moved to the back of the vector (highest indices). + size_t extra = expected_pos; + for (size_t i = 0; i < N; ++i) { + if (mask_lanes[i] == 0) { + expected[extra++] = in_lanes[i]; + } + } + HWY_ASSERT(extra == N); + num_to_check = N; + } else { + // For native Compress, only the mask=true lanes are defined. + num_to_check = expected_pos; + } + + const auto in = Load(d, in_lanes.get()); + const auto mask = + RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di))); + StoreMaskBits(d, mask, bits.get()); + + // Compress + memset(actual_u, 0, N * sizeof(T)); + StoreU(Compress(in, mask), d, actual_u); + CheckStored(d, di, "Compress", expected_pos, expected_pos, num_to_check, + in_lanes, mask_lanes, expected, actual_u, __LINE__); + + // CompressNot + memset(actual_u, 0, N * sizeof(T)); + StoreU(CompressNot(in, Not(mask)), d, actual_u); + CheckStored(d, di, "CompressNot", expected_pos, expected_pos, + num_to_check, in_lanes, mask_lanes, expected, actual_u, + __LINE__); + + // CompressStore + memset(actual_u, 0, N * sizeof(T)); + const size_t size1 = CompressStore(in, mask, d, actual_u); + // expected_pos instead of num_to_check because this op is not + // affected by CompressIsPartition. + CheckStored(d, di, "CompressStore", expected_pos, size1, expected_pos, + in_lanes, mask_lanes, expected, actual_u, __LINE__); + + // CompressBlendedStore + memcpy(actual_u, garbage.get(), N * sizeof(T)); + const size_t size2 = CompressBlendedStore(in, mask, d, actual_u); + // expected_pos instead of num_to_check because this op only writes + // the mask=true lanes. 
+ CheckStored(d, di, "CompressBlendedStore", expected_pos, size2, + expected_pos, in_lanes, mask_lanes, expected, actual_u, + __LINE__); + // Subsequent lanes are untouched. + for (size_t i = size2; i < N; ++i) { +#if HWY_COMPILER_MSVC && HWY_TARGET == HWY_AVX2 + // TODO(eustas): re-enable when compiler is fixed +#else + HWY_ASSERT_EQ(garbage[i], reinterpret_cast<TU*>(actual_u)[i]); +#endif + } + + // CompressBits + memset(actual_u, 0, N * sizeof(T)); + StoreU(CompressBits(in, bits.get()), d, actual_u); + CheckStored(d, di, "CompressBits", expected_pos, expected_pos, + num_to_check, in_lanes, mask_lanes, expected, actual_u, + __LINE__); + + // CompressBitsStore + memset(actual_u, 0, N * sizeof(T)); + const size_t size3 = CompressBitsStore(in, bits.get(), d, actual_u); + // expected_pos instead of num_to_check because this op is not + // affected by CompressIsPartition. + CheckStored(d, di, "CompressBitsStore", expected_pos, size3, + expected_pos, in_lanes, mask_lanes, expected, actual_u, + __LINE__); + } // rep + } // frac + } // operator() +}; + +HWY_NOINLINE void TestAllCompress() { + ForAllTypes(ForPartialVectors<TestCompress>()); +} + +struct TestCompressBlocks { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { +#if HWY_TARGET == HWY_SCALAR + (void)d; +#else + static_assert(sizeof(T) == 8 && !IsSigned<T>(), "Should be u64"); + RandomState rng; + + using TI = MakeSigned<T>; // For mask > 0 comparison + const Rebind<TI, D> di; + const size_t N = Lanes(d); + + auto in_lanes = AllocateAligned<T>(N); + auto mask_lanes = AllocateAligned<TI>(N); + auto expected = AllocateAligned<T>(N); + auto actual = AllocateAligned<T>(N); + + // Each lane should have a chance of having mask=true. + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + size_t expected_pos = 0; + for (size_t i = 0; i < N; i += 2) { + const uint64_t bits = Random32(&rng); + in_lanes[i + 1] = in_lanes[i] = T(); // cannot set float16_t directly. 
+ CopyBytes<sizeof(T)>(&bits, &in_lanes[i]); // not same size + CopyBytes<sizeof(T)>(&bits, &in_lanes[i + 1]); // not same size + mask_lanes[i + 1] = mask_lanes[i] = TI{(Random32(&rng) & 8) ? 1 : 0}; + if (mask_lanes[i] > 0) { + expected[expected_pos++] = in_lanes[i]; + expected[expected_pos++] = in_lanes[i + 1]; + } + } + size_t num_to_check; + if (CompressIsPartition<T>::value) { + // For non-native Compress, also check that mask=false lanes were + // moved to the back of the vector (highest indices). + size_t extra = expected_pos; + for (size_t i = 0; i < N; ++i) { + if (mask_lanes[i] == 0) { + expected[extra++] = in_lanes[i]; + } + } + HWY_ASSERT(extra == N); + num_to_check = N; + } else { + // For native Compress, only the mask=true lanes are defined. + num_to_check = expected_pos; + } + + const auto in = Load(d, in_lanes.get()); + const auto mask = RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di))); + + // CompressBlocksNot + memset(actual.get(), 0, N * sizeof(T)); + StoreU(CompressBlocksNot(in, Not(mask)), d, actual.get()); + CheckStored(d, di, "CompressBlocksNot", expected_pos, expected_pos, + num_to_check, in_lanes, mask_lanes, expected, actual.get(), + __LINE__); + } // rep +#endif // HWY_TARGET == HWY_SCALAR + } // operator() +}; + +HWY_NOINLINE void TestAllCompressBlocks() { + ForGE128Vectors<TestCompressBlocks>()(uint64_t()); +} + +#endif // !HWY_PRINT_TABLES + +#if HWY_PRINT_TABLES || HWY_IDE +namespace detail { // for code folding + +void PrintCompress8x8Tables() { + printf("======================================= 8x8\n"); + constexpr size_t N = 8; + for (uint64_t code = 0; code < (1ull << N); ++code) { + std::array<uint8_t, N> indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + for 
(size_t i = 0; i < N; ++i) { + printf("%d,", indices[i]); + } + printf(code & 1 ? "//\n" : "/**/"); + } + printf("\n"); +} + +void PrintCompress16x8Tables() { + printf("======================================= 16x8\n"); + constexpr size_t N = 8; // 128-bit SIMD + for (uint64_t code = 0; code < (1ull << N); ++code) { + std::array<uint8_t, N> indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Doubled (for converting lane to byte indices) + for (size_t i = 0; i < N; ++i) { + printf("%d,", 2 * indices[i]); + } + printf(code & 1 ? "//\n" : "/**/"); + } + printf("\n"); +} + +void PrintCompressNot16x8Tables() { + printf("======================================= Not 16x8\n"); + constexpr size_t N = 8; // 128-bit SIMD + for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { + const uint64_t code = ~not_code; + std::array<uint8_t, N> indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Doubled (for converting lane to byte indices) + for (size_t i = 0; i < N; ++i) { + printf("%d,", 2 * indices[i]); + } + printf(not_code & 1 ? "//\n" : "/**/"); + } + printf("\n"); +} + +// Compressed to nibbles, unpacked via variable right shift. Also includes +// FirstN bits in the nibble MSB. 
+void PrintCompress32x8Tables() { + printf("======================================= 32/64x8\n"); + constexpr size_t N = 8; // AVX2 or 64-bit AVX3 + for (uint64_t code = 0; code < (1ull << N); ++code) { + const size_t count = PopCount(code); + std::array<uint32_t, N> indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Convert to nibbles + uint64_t packed = 0; + for (size_t i = 0; i < N; ++i) { + HWY_ASSERT(indices[i] < N); + if (i < count) { + indices[i] |= N; + HWY_ASSERT(indices[i] < 0x10); + } + packed += indices[i] << (i * 4); + } + + HWY_ASSERT(packed < (1ull << (N * 4))); + printf("0x%08x,", static_cast<uint32_t>(packed)); + } + printf("\n"); +} + +void PrintCompressNot32x8Tables() { + printf("======================================= Not 32/64x8\n"); + constexpr size_t N = 8; // AVX2 or 64-bit AVX3 + for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { + const uint64_t code = ~not_code; + const size_t count = PopCount(code); + std::array<uint32_t, N> indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Convert to nibbles + uint64_t packed = 0; + for (size_t i = 0; i < N; ++i) { + HWY_ASSERT(indices[i] < N); + if (i < count) { + indices[i] |= N; + HWY_ASSERT(indices[i] < 0x10); + } + packed += indices[i] << (i * 4); + } + + HWY_ASSERT(packed < (1ull << (N * 4))); + printf("0x%08x,", static_cast<uint32_t>(packed)); + } + printf("\n"); +} + +// Compressed to nibbles (for AVX3 64x4) +void PrintCompress64x4NibbleTables() { + 
printf("======================================= 64x4Nibble\n"); + constexpr size_t N = 4; // AVX2 + for (uint64_t code = 0; code < (1ull << N); ++code) { + std::array<uint32_t, N> indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Convert to nibbles + uint64_t packed = 0; + for (size_t i = 0; i < N; ++i) { + HWY_ASSERT(indices[i] < N); + packed += indices[i] << (i * 4); + } + + HWY_ASSERT(packed < (1ull << (N * 4))); + printf("0x%08x,", static_cast<uint32_t>(packed)); + } + printf("\n"); +} + +void PrintCompressNot64x4NibbleTables() { + printf("======================================= Not 64x4Nibble\n"); + constexpr size_t N = 4; // AVX2 + for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { + const uint64_t code = ~not_code; + std::array<uint32_t, N> indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Convert to nibbles + uint64_t packed = 0; + for (size_t i = 0; i < N; ++i) { + HWY_ASSERT(indices[i] < N); + packed += indices[i] << (i * 4); + } + + HWY_ASSERT(packed < (1ull << (N * 4))); + printf("0x%08x,", static_cast<uint32_t>(packed)); + } + printf("\n"); +} + +void PrintCompressNot64x2NibbleTables() { + printf("======================================= Not 64x2Nibble\n"); + constexpr size_t N = 2; // 128-bit + for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { + const uint64_t code = ~not_code; + std::array<uint32_t, N> indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) 
{ + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Convert to nibbles + uint64_t packed = 0; + for (size_t i = 0; i < N; ++i) { + HWY_ASSERT(indices[i] < N); + packed += indices[i] << (i * 4); + } + + HWY_ASSERT(packed < (1ull << (N * 4))); + printf("0x%08x,", static_cast<uint32_t>(packed)); + } + printf("\n"); +} + +void PrintCompress64x4Tables() { + printf("======================================= 64x4 uncompressed\n"); + constexpr size_t N = 4; // SVE_256 + for (uint64_t code = 0; code < (1ull << N); ++code) { + std::array<size_t, N> indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Store uncompressed indices because SVE TBL returns 0 if an index is out + // of bounds. On AVX3 we simply variable-shift because permute indices are + // interpreted modulo N. Compression is not worth the extra shift+AND + // because the table is anyway only 512 bytes. 
+ for (size_t i = 0; i < N; ++i) { + printf("%d,", static_cast<int>(indices[i])); + } + } + printf("\n"); +} + +void PrintCompressNot64x4Tables() { + printf("======================================= Not 64x4 uncompressed\n"); + constexpr size_t N = 4; // SVE_256 + for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { + const uint64_t code = ~not_code; + std::array<size_t, N> indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Store uncompressed indices because SVE TBL returns 0 if an index is out + // of bounds. On AVX3 we simply variable-shift because permute indices are + // interpreted modulo N. Compression is not worth the extra shift+AND + // because the table is anyway only 512 bytes. + for (size_t i = 0; i < N; ++i) { + printf("%d,", static_cast<int>(indices[i])); + } + } + printf("\n"); +} + +// Same as above, but prints pairs of u32 indices (for AVX2). Also includes +// FirstN bits in the nibble MSB. +void PrintCompress64x4PairTables() { + printf("======================================= 64x4 u32 index\n"); + constexpr size_t N = 4; // AVX2 + for (uint64_t code = 0; code < (1ull << N); ++code) { + const size_t count = PopCount(code); + std::array<size_t, N> indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Store uncompressed indices because SVE TBL returns 0 if an index is out + // of bounds. On AVX3 we simply variable-shift because permute indices are + // interpreted modulo N. 
Compression is not worth the extra shift+AND + // because the table is anyway only 512 bytes. + for (size_t i = 0; i < N; ++i) { + const int first_n_bit = i < count ? 8 : 0; + const int low = static_cast<int>(2 * indices[i]) + first_n_bit; + HWY_ASSERT(low < 0x10); + printf("%d, %d, ", low, low + 1); + } + } + printf("\n"); +} + +void PrintCompressNot64x4PairTables() { + printf("======================================= Not 64x4 u32 index\n"); + constexpr size_t N = 4; // AVX2 + for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { + const uint64_t code = ~not_code; + const size_t count = PopCount(code); + std::array<size_t, N> indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + // Store uncompressed indices because SVE TBL returns 0 if an index is out + // of bounds. On AVX3 we simply variable-shift because permute indices are + // interpreted modulo N. Compression is not worth the extra shift+AND + // because the table is anyway only 512 bytes. + for (size_t i = 0; i < N; ++i) { + const int first_n_bit = i < count ? 
8 : 0; + const int low = static_cast<int>(2 * indices[i]) + first_n_bit; + HWY_ASSERT(low < 0x10); + printf("%d, %d, ", low, low + 1); + } + } + printf("\n"); +} + +// 4-tuple of byte indices +void PrintCompress32x4Tables() { + printf("======================================= 32x4\n"); + using T = uint32_t; + constexpr size_t N = 4; // SSE4 + for (uint64_t code = 0; code < (1ull << N); ++code) { + std::array<uint32_t, N> indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + for (size_t i = 0; i < N; ++i) { + for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { + printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte)); + } + } + } + printf("\n"); +} + +void PrintCompressNot32x4Tables() { + printf("======================================= Not 32x4\n"); + using T = uint32_t; + constexpr size_t N = 4; // SSE4 + for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { + const uint64_t code = ~not_code; + std::array<uint32_t, N> indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + for (size_t i = 0; i < N; ++i) { + for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { + printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte)); + } + } + } + printf("\n"); +} + +// 8-tuple of byte indices +void PrintCompress64x2Tables() { + printf("======================================= 64x2\n"); + using T = uint64_t; + constexpr size_t N = 2; // SSE4 + for (uint64_t code = 0; code < (1ull << N); ++code) { + std::array<uint32_t, N> 
indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + for (size_t i = 0; i < N; ++i) { + for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { + printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte)); + } + } + } + printf("\n"); +} + +void PrintCompressNot64x2Tables() { + printf("======================================= Not 64x2\n"); + using T = uint64_t; + constexpr size_t N = 2; // SSE4 + for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { + const uint64_t code = ~not_code; + std::array<uint32_t, N> indices{0}; + size_t pos = 0; + // All lanes where mask = true + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + // All lanes where mask = false + for (size_t i = 0; i < N; ++i) { + if (!(code & (1ull << i))) { + indices[pos++] = i; + } + } + HWY_ASSERT(pos == N); + + for (size_t i = 0; i < N; ++i) { + for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { + printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte)); + } + } + } + printf("\n"); +} + +} // namespace detail + +HWY_NOINLINE void PrintTables() { + // Only print once. 
+#if HWY_TARGET == HWY_STATIC_TARGET + detail::PrintCompress32x8Tables(); + detail::PrintCompressNot32x8Tables(); + detail::PrintCompress64x4NibbleTables(); + detail::PrintCompressNot64x4NibbleTables(); + detail::PrintCompressNot64x2NibbleTables(); + detail::PrintCompress64x4Tables(); + detail::PrintCompressNot64x4Tables(); + detail::PrintCompress32x4Tables(); + detail::PrintCompressNot32x4Tables(); + detail::PrintCompress64x2Tables(); + detail::PrintCompressNot64x2Tables(); + detail::PrintCompress64x4PairTables(); + detail::PrintCompressNot64x4PairTables(); + detail::PrintCompress16x8Tables(); + detail::PrintCompress8x8Tables(); + detail::PrintCompressNot16x8Tables(); +#endif +} + +#endif // HWY_PRINT_TABLES + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyCompressTest); +#if HWY_PRINT_TABLES +// Only print instead of running tests; this will be visible in the log. +HWY_EXPORT_AND_TEST_P(HwyCompressTest, PrintTables); +#else +HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompress); +HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompressBlocks); +#endif +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/convert_test.cc b/third_party/highway/hwy/tests/convert_test.cc new file mode 100644 index 0000000000..a7aea5fe9e --- /dev/null +++ b/third_party/highway/hwy/tests/convert_test.cc @@ -0,0 +1,643 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include <cmath> // std::isfinite + +#include "hwy/base.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/convert_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Cast and ensure bytes are the same. Called directly from TestAllBitCast or +// via TestBitCastFrom. +template <typename ToT> +struct TestBitCast { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const Repartition<ToT, D> dto; + const size_t N = Lanes(d); + const size_t Nto = Lanes(dto); + if (N == 0 || Nto == 0) return; + HWY_ASSERT_EQ(N * sizeof(T), Nto * sizeof(ToT)); + const auto vf = Iota(d, 1); + const auto vt = BitCast(dto, vf); + // Must return the same bits + auto from_lanes = AllocateAligned<T>(Lanes(d)); + auto to_lanes = AllocateAligned<ToT>(Lanes(dto)); + Store(vf, d, from_lanes.get()); + Store(vt, dto, to_lanes.get()); + HWY_ASSERT( + BytesEqual(from_lanes.get(), to_lanes.get(), Lanes(d) * sizeof(T))); + } +}; + +// From D to all types. 
+struct TestBitCastFrom { + template <typename T, class D> + HWY_NOINLINE void operator()(T t, D d) { + TestBitCast<uint8_t>()(t, d); + TestBitCast<uint16_t>()(t, d); + TestBitCast<uint32_t>()(t, d); +#if HWY_HAVE_INTEGER64 + TestBitCast<uint64_t>()(t, d); +#endif + TestBitCast<int8_t>()(t, d); + TestBitCast<int16_t>()(t, d); + TestBitCast<int32_t>()(t, d); +#if HWY_HAVE_INTEGER64 + TestBitCast<int64_t>()(t, d); +#endif + TestBitCast<float>()(t, d); +#if HWY_HAVE_FLOAT64 + TestBitCast<double>()(t, d); +#endif + } +}; + +HWY_NOINLINE void TestAllBitCast() { + // For HWY_SCALAR and partial vectors, we can only cast to same-sized types: + // the former can't partition its single lane, and the latter can be smaller + // than a destination type. + const ForPartialVectors<TestBitCast<uint8_t>> to_u8; + to_u8(uint8_t()); + to_u8(int8_t()); + + const ForPartialVectors<TestBitCast<int8_t>> to_i8; + to_i8(uint8_t()); + to_i8(int8_t()); + + const ForPartialVectors<TestBitCast<uint16_t>> to_u16; + to_u16(uint16_t()); + to_u16(int16_t()); + + const ForPartialVectors<TestBitCast<int16_t>> to_i16; + to_i16(uint16_t()); + to_i16(int16_t()); + + const ForPartialVectors<TestBitCast<uint32_t>> to_u32; + to_u32(uint32_t()); + to_u32(int32_t()); + to_u32(float()); + + const ForPartialVectors<TestBitCast<int32_t>> to_i32; + to_i32(uint32_t()); + to_i32(int32_t()); + to_i32(float()); + +#if HWY_HAVE_INTEGER64 + const ForPartialVectors<TestBitCast<uint64_t>> to_u64; + to_u64(uint64_t()); + to_u64(int64_t()); +#if HWY_HAVE_FLOAT64 + to_u64(double()); +#endif + + const ForPartialVectors<TestBitCast<int64_t>> to_i64; + to_i64(uint64_t()); + to_i64(int64_t()); +#if HWY_HAVE_FLOAT64 + to_i64(double()); +#endif +#endif // HWY_HAVE_INTEGER64 + + const ForPartialVectors<TestBitCast<float>> to_float; + to_float(uint32_t()); + to_float(int32_t()); + to_float(float()); + +#if HWY_HAVE_FLOAT64 + const ForPartialVectors<TestBitCast<double>> to_double; + to_double(double()); +#if HWY_HAVE_INTEGER64 + 
to_double(uint64_t()); + to_double(int64_t()); +#endif // HWY_HAVE_INTEGER64 +#endif // HWY_HAVE_FLOAT64 + +#if HWY_TARGET != HWY_SCALAR + // For non-scalar vectors, we can cast all types to all. + ForAllTypes(ForGEVectors<64, TestBitCastFrom>()); +#endif +} + +template <typename ToT> +struct TestPromoteTo { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D from_d) { + static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower"); + const Rebind<ToT, D> to_d; + + const size_t N = Lanes(from_d); + auto from = AllocateAligned<T>(N); + auto expected = AllocateAligned<ToT>(N); + + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + const uint64_t bits = rng(); + CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size + expected[i] = from[i]; + } + + HWY_ASSERT_VEC_EQ(to_d, expected.get(), + PromoteTo(to_d, Load(from_d, from.get()))); + } + } +}; + +HWY_NOINLINE void TestAllPromoteTo() { + const ForPromoteVectors<TestPromoteTo<uint16_t>, 1> to_u16div2; + to_u16div2(uint8_t()); + + const ForPromoteVectors<TestPromoteTo<uint32_t>, 2> to_u32div4; + to_u32div4(uint8_t()); + + const ForPromoteVectors<TestPromoteTo<uint32_t>, 1> to_u32div2; + to_u32div2(uint16_t()); + + const ForPromoteVectors<TestPromoteTo<int16_t>, 1> to_i16div2; + to_i16div2(uint8_t()); + to_i16div2(int8_t()); + + const ForPromoteVectors<TestPromoteTo<int32_t>, 1> to_i32div2; + to_i32div2(uint16_t()); + to_i32div2(int16_t()); + + const ForPromoteVectors<TestPromoteTo<int32_t>, 2> to_i32div4; + to_i32div4(uint8_t()); + to_i32div4(int8_t()); + + // Must test f16/bf16 separately because we can only load/store/convert them. 
+ +#if HWY_HAVE_INTEGER64 + const ForPromoteVectors<TestPromoteTo<uint64_t>, 1> to_u64div2; + to_u64div2(uint32_t()); + + const ForPromoteVectors<TestPromoteTo<int64_t>, 1> to_i64div2; + to_i64div2(int32_t()); +#endif + +#if HWY_HAVE_FLOAT64 + const ForPromoteVectors<TestPromoteTo<double>, 1> to_f64div2; + to_f64div2(int32_t()); + to_f64div2(float()); +#endif +} + +template <typename T, HWY_IF_FLOAT(T)> +bool IsFinite(T t) { + return std::isfinite(t); +} +// Wrapper avoids calling std::isfinite for integer types (ambiguous). +template <typename T, HWY_IF_NOT_FLOAT(T)> +bool IsFinite(T /*unused*/) { + return true; +} + +template <class D> +AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) { + const float test_cases[] = { + // +/- 1 + 1.0f, -1.0f, + // +/- 0 + 0.0f, -0.0f, + // near 0 + 0.25f, -0.25f, + // +/- integer + 4.0f, -32.0f, + // positive near limit + 65472.0f, 65504.0f, + // negative near limit + -65472.0f, -65504.0f, + // positive +/- delta + 2.00390625f, 3.99609375f, + // negative +/- delta + -2.00390625f, -3.99609375f, + // No infinity/NaN - implementation-defined due to ARM. 
+ }; + constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]); + const size_t N = Lanes(d); + HWY_ASSERT(N != 0); + padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors + auto in = AllocateAligned<float>(padded); + auto expected = AllocateAligned<float>(padded); + size_t i = 0; + for (; i < kNumTestCases; ++i) { + in[i] = test_cases[i]; + } + for (; i < padded; ++i) { + in[i] = 0.0f; + } + return in; +} + +struct TestF16 { + template <typename TF32, class DF32> + HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) { +#if HWY_HAVE_FLOAT16 + size_t padded; + const size_t N = Lanes(d32); // same count for f16 + HWY_ASSERT(N != 0); + auto in = F16TestCases(d32, padded); + using TF16 = float16_t; + const Rebind<TF16, DF32> d16; + auto temp16 = AllocateAligned<TF16>(N); + + for (size_t i = 0; i < padded; i += N) { + const auto loaded = Load(d32, &in[i]); + Store(DemoteTo(d16, loaded), d16, temp16.get()); + HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, Load(d16, temp16.get()))); + } +#else + (void)d32; +#endif + } +}; + +HWY_NOINLINE void TestAllF16() { ForDemoteVectors<TestF16>()(float()); } + +template <class D> +AlignedFreeUniquePtr<float[]> BF16TestCases(D d, size_t& padded) { + const float test_cases[] = { + // +/- 1 + 1.0f, -1.0f, + // +/- 0 + 0.0f, -0.0f, + // near 0 + 0.25f, -0.25f, + // +/- integer + 4.0f, -32.0f, + // positive near limit + 3.389531389251535E38f, 1.99384199368e+38f, + // negative near limit + -3.389531389251535E38f, -1.99384199368e+38f, + // positive +/- delta + 2.015625f, 3.984375f, + // negative +/- delta + -2.015625f, -3.984375f, + }; + constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]); + const size_t N = Lanes(d); + HWY_ASSERT(N != 0); + padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors + auto in = AllocateAligned<float>(padded); + auto expected = AllocateAligned<float>(padded); + size_t i = 0; + for (; i < kNumTestCases; ++i) { + in[i] = test_cases[i]; + } + for 
(; i < padded; ++i) { + in[i] = 0.0f; + } + return in; +} + +struct TestBF16 { + template <typename TF32, class DF32> + HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) { +#if !defined(HWY_EMULATE_SVE) + size_t padded; + auto in = BF16TestCases(d32, padded); + using TBF16 = bfloat16_t; +#if HWY_TARGET == HWY_SCALAR + const Rebind<TBF16, DF32> dbf16; // avoid 4/2 = 2 lanes +#else + const Repartition<TBF16, DF32> dbf16; +#endif + const Half<decltype(dbf16)> dbf16_half; + const size_t N = Lanes(d32); + HWY_ASSERT(Lanes(dbf16_half) <= N); + auto temp16 = AllocateAligned<TBF16>(N); + + for (size_t i = 0; i < padded; i += N) { + const auto loaded = Load(d32, &in[i]); + const auto v16 = DemoteTo(dbf16_half, loaded); + Store(v16, dbf16_half, temp16.get()); + const auto v16_loaded = Load(dbf16_half, temp16.get()); + HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, v16_loaded)); + } +#else + (void)d32; +#endif + } +}; + +HWY_NOINLINE void TestAllBF16() { ForShrinkableVectors<TestBF16>()(float()); } + +struct TestConvertU8 { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, const D du32) { + const Rebind<uint8_t, D> du8; + const auto wrap = Set(du32, 0xFF); + HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0), U8FromU32(And(Iota(du32, 0), wrap))); + HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0x7F), + U8FromU32(And(Iota(du32, 0x7F), wrap))); + } +}; + +HWY_NOINLINE void TestAllConvertU8() { + ForDemoteVectors<TestConvertU8, 2>()(uint32_t()); +} + +template <typename From, typename To, class D> +constexpr bool IsSupportedTruncation() { + return (sizeof(To) < sizeof(From)) && + (Pow2(Rebind<To, D>()) + 3 >= static_cast<int>(CeilLog2(sizeof(To)))); +} + +struct TestTruncateTo { + template <typename From, typename To, class D, + hwy::EnableIf<!IsSupportedTruncation<From, To, D>()>* = nullptr> + HWY_NOINLINE void testTo(From, To, const D) { + // do nothing + } + + template <typename From, typename To, class D, + hwy::EnableIf<IsSupportedTruncation<From, To, D>()>* = nullptr> + 
HWY_NOINLINE void testTo(From, To, const D d) { + constexpr uint32_t base = 0xFA578D00; + const Rebind<To, D> dTo; + const auto src = Iota(d, static_cast<From>(base)); + const auto expected = Iota(dTo, static_cast<To>(base)); + const VFromD<decltype(dTo)> actual = TruncateTo(dTo, src); + HWY_ASSERT_VEC_EQ(dTo, expected, actual); + } + + template <typename T, class D> + HWY_NOINLINE void operator()(T from, const D d) { + testTo<T, uint8_t, D>(from, uint8_t(), d); + testTo<T, uint16_t, D>(from, uint16_t(), d); + testTo<T, uint32_t, D>(from, uint32_t(), d); + } +}; + +HWY_NOINLINE void TestAllTruncate() { + ForUnsignedTypes(ForPartialVectors<TestTruncateTo>()); +} + +// Separate function to attempt to work around a compiler bug on ARM: when this +// is merged with TestIntFromFloat, outputs match a previous Iota(-(N+1)) input. +struct TestIntFromFloatHuge { + template <typename TF, class DF> + HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { + // The ARMv7 manual says that float->int saturates, i.e. chooses the + // nearest representable value. This works correctly on armhf with GCC, but + // not with clang. For reasons unknown, MSVC also runs into an out-of-memory + // error here. +#if HWY_COMPILER_CLANG || HWY_COMPILER_MSVC + (void)df; +#else + using TI = MakeSigned<TF>; + const Rebind<TI, DF> di; + + // Workaround for incorrect 32-bit GCC codegen for SSSE3 - Print-ing + // the expected lvalue also seems to prevent the issue. 
+ const size_t N = Lanes(df); + auto expected = AllocateAligned<TI>(N); + + // Huge positive + Store(Set(di, LimitsMax<TI>()), di, expected.get()); + HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(1E20)))); + + // Huge negative + Store(Set(di, LimitsMin<TI>()), di, expected.get()); + HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(-1E20)))); +#endif + } +}; + +class TestIntFromFloat { + template <typename TF, class DF> + static HWY_NOINLINE void TestPowers(TF /*unused*/, const DF df) { + using TI = MakeSigned<TF>; + const Rebind<TI, DF> di; + constexpr size_t kBits = sizeof(TF) * 8; + + // Powers of two, plus offsets to set some mantissa bits. + const int64_t ofs_table[3] = {0LL, 3LL << (kBits / 2), 1LL << (kBits - 15)}; + for (int sign = 0; sign < 2; ++sign) { + for (size_t shift = 0; shift < kBits - 1; ++shift) { + for (int64_t ofs : ofs_table) { + const int64_t mag = (int64_t{1} << shift) + ofs; + const int64_t val = sign ? mag : -mag; + HWY_ASSERT_VEC_EQ(di, Set(di, static_cast<TI>(val)), + ConvertTo(di, Set(df, static_cast<TF>(val)))); + } + } + } + } + + template <typename TF, class DF> + static HWY_NOINLINE void TestRandom(TF /*unused*/, const DF df) { + using TI = MakeSigned<TF>; + const Rebind<TI, DF> di; + const size_t N = Lanes(df); + + // TF does not have enough precision to represent TI. + const double min = static_cast<double>(LimitsMin<TI>()); + const double max = static_cast<double>(LimitsMax<TI>()); + + // Also check random values. 
+ auto from = AllocateAligned<TF>(N); + auto expected = AllocateAligned<TI>(N); + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { + for (size_t i = 0; i < N; ++i) { + do { + const uint64_t bits = rng(); + CopyBytes<sizeof(TF)>(&bits, &from[i]); // not same size + } while (!std::isfinite(from[i])); + if (from[i] >= max) { + expected[i] = LimitsMax<TI>(); + } else if (from[i] <= min) { + expected[i] = LimitsMin<TI>(); + } else { + expected[i] = static_cast<TI>(from[i]); + } + } + + HWY_ASSERT_VEC_EQ(di, expected.get(), + ConvertTo(di, Load(df, from.get()))); + } + } + + public: + template <typename TF, class DF> + HWY_NOINLINE void operator()(TF tf, const DF df) { + using TI = MakeSigned<TF>; + const Rebind<TI, DF> di; + const size_t N = Lanes(df); + + // Integer positive + HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), ConvertTo(di, Iota(df, TF(4.0)))); + + // Integer negative + HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), ConvertTo(di, Iota(df, -TF(N)))); + + // Above positive + HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), ConvertTo(di, Iota(df, TF(2.001)))); + + // Below positive + HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), ConvertTo(di, Iota(df, TF(3.9999)))); + + const TF eps = static_cast<TF>(0.0001); + // Above negative + HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), + ConvertTo(di, Iota(df, -TF(N + 1) + eps))); + + // Below negative + HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)), + ConvertTo(di, Iota(df, -TF(N + 1) - eps))); + + TestPowers(tf, df); + TestRandom(tf, df); + } +}; + +HWY_NOINLINE void TestAllIntFromFloat() { + ForFloatTypes(ForPartialVectors<TestIntFromFloatHuge>()); + ForFloatTypes(ForPartialVectors<TestIntFromFloat>()); +} + +struct TestFloatFromInt { + template <typename TF, class DF> + HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { + using TI = MakeSigned<TF>; + const RebindToSigned<DF> di; + const size_t N = Lanes(df); + + // Integer positive + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), ConvertTo(df, Iota(di, TI(4)))); + + // Integer 
negative + HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), ConvertTo(df, Iota(di, -TI(N)))); + + // Max positive + HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TI>())), + ConvertTo(df, Set(di, LimitsMax<TI>()))); + + // Min negative + HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())), + ConvertTo(df, Set(di, LimitsMin<TI>()))); + } +}; + +HWY_NOINLINE void TestAllFloatFromInt() { + ForFloatTypes(ForPartialVectors<TestFloatFromInt>()); +} + +struct TestFloatFromUint { + template <typename TF, class DF> + HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { + using TU = MakeUnsigned<TF>; + const RebindToUnsigned<DF> du; + + // Integer positive + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), ConvertTo(df, Iota(du, TU(4)))); + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(65535.0)), + ConvertTo(df, Iota(du, 65535))); // 2^16-1 + if (sizeof(TF) > 4) { + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4294967295.0)), + ConvertTo(df, Iota(du, 4294967295ULL))); // 2^32-1 + } + + // Max positive + HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TU>())), + ConvertTo(df, Set(du, LimitsMax<TU>()))); + + // Zero + HWY_ASSERT_VEC_EQ(df, Zero(df), ConvertTo(df, Zero(du))); + } +}; + +HWY_NOINLINE void TestAllFloatFromUint() { + ForFloatTypes(ForPartialVectors<TestFloatFromUint>()); +} + +struct TestI32F64 { + template <typename TF, class DF> + HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { + using TI = int32_t; + const Rebind<TI, DF> di; + const size_t N = Lanes(df); + + // Integer positive + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4)))); + + // Integer negative + HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), PromoteTo(df, Iota(di, -TI(N)))); + + // Above positive + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(2.0)), PromoteTo(df, Iota(di, TI(2)))); + + // Below positive + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4)))); + + // Above negative + HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-4.0)), PromoteTo(df, Iota(di, TI(-4)))); + + // Below negative + HWY_ASSERT_VEC_EQ(df, 
Iota(df, TF(-2.0)), PromoteTo(df, Iota(di, TI(-2)))); + + // Max positive int + HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TI>())), + PromoteTo(df, Set(di, LimitsMax<TI>()))); + + // Min negative int + HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())), + PromoteTo(df, Set(di, LimitsMin<TI>()))); + } +}; + +HWY_NOINLINE void TestAllI32F64() { +#if HWY_HAVE_FLOAT64 + ForDemoteVectors<TestI32F64>()(double()); +#endif +} + + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyConvertTest); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBF16); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllTruncate); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromUint); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/crypto_test.cc b/third_party/highway/hwy/tests/crypto_test.cc new file mode 100644 index 0000000000..b7dfb198a3 --- /dev/null +++ b/third_party/highway/hwy/tests/crypto_test.cc @@ -0,0 +1,553 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stddef.h> +#include <stdint.h> +#include <string.h> // memcpy + +#include "hwy/aligned_allocator.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/crypto_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +#define HWY_PRINT_CLMUL_GOLDEN 0 + +#if HWY_TARGET != HWY_SCALAR + +class TestAES { + template <typename T, class D> + HWY_NOINLINE void TestSBox(T /*unused*/, D d) { + // The generic implementation of the S-box is difficult to verify by + // inspection, so we add a white-box test that verifies it using enumeration + // (outputs for 0..255 vs. https://en.wikipedia.org/wiki/Rijndael_S-box). + const uint8_t sbox[256] = { + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, + 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, + 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, + 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, + 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, + 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, + 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, + 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, + 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, + 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, + 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, + 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 
0x79, 0xe7, 0xc8, 0x37, 0x6d, + 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, + 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, + 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11, + 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, + 0xb0, 0x54, 0xbb, 0x16}; + + // Ensure it's safe to load an entire vector by padding. + const size_t N = Lanes(d); + const size_t padded = RoundUpTo(256, N); + auto expected = AllocateAligned<T>(padded); + // Must wrap around to match the input (Iota). + for (size_t pos = 0; pos < padded;) { + const size_t remaining = HWY_MIN(padded - pos, size_t(256)); + memcpy(expected.get() + pos, sbox, remaining); + pos += remaining; + } + + for (size_t i = 0; i < 256; i += N) { + const auto in = Iota(d, static_cast<T>(i)); + HWY_ASSERT_VEC_EQ(d, expected.get() + i, detail::SubBytes(in)); + } + } + + public: + template <typename T, class D> + HWY_NOINLINE void operator()(T t, D d) { + // Test vector (after first KeyAddition) from + // https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/AES_Core128.pdf + alignas(16) constexpr uint8_t test_lanes[16] = { + 0x40, 0xBF, 0xAB, 0xF4, 0x06, 0xEE, 0x4D, 0x30, + 0x42, 0xCA, 0x6B, 0x99, 0x7A, 0x5C, 0x58, 0x16}; + const auto test = LoadDup128(d, test_lanes); + + // = ShiftRow result + alignas(16) constexpr uint8_t expected_sr_lanes[16] = { + 0x09, 0x28, 0x7F, 0x47, 0x6F, 0x74, 0x6A, 0xBF, + 0x2C, 0x4A, 0x62, 0x04, 0xDA, 0x08, 0xE3, 0xEE}; + const auto expected_sr = LoadDup128(d, expected_sr_lanes); + + // = MixColumn result + alignas(16) constexpr uint8_t expected_mc_lanes[16] = { + 0x52, 0x9F, 0x16, 0xC2, 0x97, 0x86, 0x15, 0xCA, + 0xE0, 0x1A, 0xAE, 0x54, 0xBA, 0x1A, 0x26, 0x59}; + const auto expected_mc = LoadDup128(d, 
expected_mc_lanes); + + // = KeyAddition result + alignas(16) constexpr uint8_t expected_lanes[16] = { + 0xF2, 0x65, 0xE8, 0xD5, 0x1F, 0xD2, 0x39, 0x7B, + 0xC3, 0xB9, 0x97, 0x6D, 0x90, 0x76, 0x50, 0x5C}; + const auto expected = LoadDup128(d, expected_lanes); + + alignas(16) uint8_t key_lanes[16]; + for (size_t i = 0; i < 16; ++i) { + key_lanes[i] = expected_mc_lanes[i] ^ expected_lanes[i]; + } + const auto round_key = LoadDup128(d, key_lanes); + + HWY_ASSERT_VEC_EQ(d, expected_mc, AESRound(test, Zero(d))); + HWY_ASSERT_VEC_EQ(d, expected, AESRound(test, round_key)); + HWY_ASSERT_VEC_EQ(d, expected_sr, AESLastRound(test, Zero(d))); + HWY_ASSERT_VEC_EQ(d, Xor(expected_sr, round_key), + AESLastRound(test, round_key)); + + TestSBox(t, d); + } +}; +HWY_NOINLINE void TestAllAES() { ForGEVectors<128, TestAES>()(uint8_t()); } + +#else +HWY_NOINLINE void TestAllAES() {} +#endif // HWY_TARGET != HWY_SCALAR + +struct TestCLMul { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + // needs 64 bit lanes and 128-bit result +#if HWY_TARGET != HWY_SCALAR && HWY_HAVE_INTEGER64 + const size_t N = Lanes(d); + if (N == 1) return; + + auto in1 = AllocateAligned<T>(N); + auto in2 = AllocateAligned<T>(N); + + constexpr size_t kCLMulNum = 512; + // Depends on rng! 
+ static constexpr uint64_t kCLMulLower[kCLMulNum] = { + 0x24511d4ce34d6350ULL, 0x4ca582edde1236bbULL, 0x537e58f72dac25a8ULL, + 0x4e942d5e130b9225ULL, 0x75a906c519257a68ULL, 0x1df9f85126d96c5eULL, + 0x464e7c13f4ad286aULL, 0x138535ee35dabc40ULL, 0xb2f7477b892664ecULL, + 0x01557b077167c25dULL, 0xf32682490ee49624ULL, 0x0025bac603b9e140ULL, + 0xcaa86aca3e3daf40ULL, 0x1fbcfe4af73eb6c4ULL, 0x8ee8064dd0aae5dcULL, + 0x1248cb547858c213ULL, 0x37a55ee5b10fb34cULL, 0x6eb5c97b958f86e2ULL, + 0x4b1ab3eb655ea7cdULL, 0x1d66645a85627520ULL, 0xf8728e96daa36748ULL, + 0x38621043e6ff5e3bULL, 0xd1d28b5da5ffefb4ULL, 0x0a5cd65931546df7ULL, + 0x2a0639be3d844150ULL, 0x0e2d0f18c8d6f045ULL, 0xfacc770b963326c1ULL, + 0x19611b31ca2ef141ULL, 0xabea29510dd87518ULL, 0x18a7dc4b205f2768ULL, + 0x9d3975ea5612dc86ULL, 0x06319c139e374773ULL, 0x6641710400b4c390ULL, + 0x356c29b6001c3670ULL, 0xe9e04d851e040a00ULL, 0x21febe561222d79aULL, + 0xc071eaae6e148090ULL, 0x0eed351a0af94f5bULL, 0x04324eedb3c03688ULL, + 0x39e89b136e0d6ccdULL, 0x07d0fd2777a31600ULL, 0x44b8573827209822ULL, + 0x6d690229ea177d78ULL, 0x1b9749d960ba9f18ULL, 0x190945271c0fbb94ULL, + 0x189aea0e07d2c88eULL, 0xf18eab6b65a6beb2ULL, 0x57744b21c13d0d84ULL, + 0xf63050a613e95c2eULL, 0x12cd20d25f97102fULL, 0x5a5df0678dbcba60ULL, + 0x0b08fb80948bfafcULL, 0x44cf1cbe7c6fc3c8ULL, 0x166a470ef25da288ULL, + 0x2c498a609204e48cULL, 0x261b0a22585697ecULL, 0x737750574af7dde4ULL, + 0x4079959c60b01e0cULL, 0x06ed8aac13f782d6ULL, 0x019d454ba9b5ef20ULL, + 0xea1edbf96d49e858ULL, 0x17c2f3ebde9ac469ULL, 0x5cf72706e3d6f5e4ULL, + 0x16e856aa3c841516ULL, 0x256f7e3cef83368eULL, 0x47e17c8eb2774e77ULL, + 0x9b48ac150a804821ULL, 0x584523f61ccfdf22ULL, 0xedcb6a2a75d9e7f2ULL, + 0x1fe3d1838e537aa7ULL, 0x778872e9f64549caULL, 0x2f1cea6f0d3faf92ULL, + 0x0e8c4b6a9343f326ULL, 0x01902d1ba3048954ULL, 0xc5c1fd5269e91dc0ULL, + 0x0ef8a4707817eb9cULL, 0x1f696f09a5354ca4ULL, 0x369cd9de808b818cULL, + 0xf6917d1dd43fd784ULL, 0x7f4b76bf40dc166fULL, 0x4ce67698724ace12ULL, + 0x02c3bf60e6e9cd92ULL, 
0xb8229e45b21458e8ULL, 0x415efd41e91adf49ULL, + 0x5edfcd516bb921cdULL, 0x5ff2c29429fd187eULL, 0x0af666b17103b3e0ULL, + 0x1f5e4ff8f54c9a5bULL, 0x429253d8a5544ba6ULL, 0x19de2fdf9f4d9dcaULL, + 0x29bf3d37ddc19a40ULL, 0x04d4513a879552baULL, 0x5cc7476cf71ee155ULL, + 0x40011f8c238784a5ULL, 0x1a3ae50b0fd2ee2bULL, 0x7db22f432ba462baULL, + 0x417290b0bee2284aULL, 0x055a6bd5bb853db2ULL, 0xaa667daeed8c2a34ULL, + 0x0d6b316bda7f3577ULL, 0x72d35598468e3d5dULL, 0x375b594804bfd33aULL, + 0x16ed3a319b540ae8ULL, 0x093bace4b4695afdULL, 0xc7118754ec2737ceULL, + 0x0fff361f0505c81aULL, 0x996e9e7291321af0ULL, 0x496b1d9b0b89ba8cULL, + 0x65a98b2e9181da9cULL, 0x70759c8dd45575dfULL, 0x3446fe727f5e2cbbULL, + 0x1121ae609d195e74ULL, 0x5ff5d68ce8a21018ULL, 0x0e27eca3825b60d6ULL, + 0x82f628bceca3d1daULL, 0x2756a0914e344047ULL, 0xa460406c1c708d50ULL, + 0x63ce32a0c083e491ULL, 0xc883e5a685c480e0ULL, 0x602c951891e600f9ULL, + 0x02ecb2e3911ca5f8ULL, 0x0d8675f4bb70781aULL, 0x43545cc3c78ea496ULL, + 0x04164b01d6b011c2ULL, 0x3acbb323dcab2c9bULL, 0x31c5ba4e22793082ULL, + 0x5a6484af5f7c2d10ULL, 0x1a929b16194e8078ULL, 0x7a6a75d03b313924ULL, + 0x0553c73a35b1d525ULL, 0xf18628c51142be34ULL, 0x1b51cf80d7efd8f5ULL, + 0x52e0ca4df63ee258ULL, 0x0e977099160650c9ULL, 0x6be1524e92024f70ULL, + 0x0ee2152625438b9dULL, 0xfa32af436f6d8eb4ULL, 0x5ecf49c2154287e5ULL, + 0x6b72f4ae3590569dULL, 0x086c5ee6e87bfb68ULL, 0x737a4f0dc04b6187ULL, + 0x08c3439280edea41ULL, 0x9547944f01636c5cULL, 0x6acfbfc2571cd71fULL, + 0x85d7842972449637ULL, 0x252ea5e5a7fad86aULL, 0x4e41468f99ba1632ULL, + 0x095e0c3ae63b25a2ULL, 0xb005ce88fd1c9425ULL, 0x748e668abbe09f03ULL, + 0xb2cfdf466b187d18ULL, 0x60b11e633d8fe845ULL, 0x07144c4d246db604ULL, + 0x139bcaac55e96125ULL, 0x118679b5a6176327ULL, 0x1cebe90fa4d9f83fULL, + 0x22244f52f0d312acULL, 0x669d4e17c9bfb713ULL, 0x96390e0b834bb0d0ULL, + 0x01f7f0e82ba08071ULL, 0x2dffeee31ca6d284ULL, 0x1f4738745ef039feULL, + 0x4ce0dd2b603b6420ULL, 0x0035fc905910a4d5ULL, 0x07df2b533df6fb04ULL, + 0x1cee2735c9b910ddULL, 
0x2bc4af565f7809eaULL, 0x2f876c1f5cb1076cULL, + 0x33e079524099d056ULL, 0x169e0405d2f9efbaULL, 0x018643ab548a358cULL, + 0x1bb6fc4331cffe92ULL, 0x05111d3a04e92faaULL, 0x23c27ecf0d638b73ULL, + 0x1b79071dc1685d68ULL, 0x0662d20aba8e1e0cULL, 0xe7f6440277144c6fULL, + 0x4ca38b64c22196c0ULL, 0x43c05f6d1936fbeeULL, 0x0654199d4d1faf0fULL, + 0xf2014054e71c2d04ULL, 0x0a103e47e96b4c84ULL, 0x7986e691dd35b040ULL, + 0x4e1ebb53c306a341ULL, 0x2775bb3d75d65ba6ULL, 0x0562ab0adeff0f15ULL, + 0x3c2746ad5eba3eacULL, 0x1facdb5765680c60ULL, 0xb802a60027d81d00ULL, + 0x1191d0f6366ae3a9ULL, 0x81a97b5ae0ea5d14ULL, 0x06bee05b6178a770ULL, + 0xc7baeb2fe1d6aeb3ULL, 0x594cb5b867d04fdfULL, 0xf515a80138a4e350ULL, + 0x646417ad8073cf38ULL, 0x4a229a43373fb8d4ULL, 0x10fa6eafff1ca453ULL, + 0x9f060700895cc731ULL, 0x00521133d11d11f4ULL, 0xb940a2bb912a7a5cULL, + 0x3fab180670ad2a3cULL, 0x45a5f0e5b6fdb95dULL, 0x27c1baad6f946b15ULL, + 0x336c6bdbe527cf58ULL, 0x3b83aa602a5baea3ULL, 0xdf749153f9bcc376ULL, + 0x1a05513a6c0b4a90ULL, 0xb81e0b570a075c47ULL, 0x471fabb40bdc27ceULL, + 0x9dec9472f6853f60ULL, 0x361f71b88114193bULL, 0x3b550a8c4feeff00ULL, + 0x0f6cde5a68bc9bc0ULL, 0x3f50121a925703e0ULL, 0x6967ff66d6d343a9ULL, + 0xff6b5bd2ce7bc3ccULL, 0x05474cea08bf6cd8ULL, 0xf76eabbfaf108eb0ULL, + 0x067529be4fc6d981ULL, 0x4d766b137cf8a988ULL, 0x2f09c7395c5cfbbdULL, + 0x388793712da06228ULL, 0x02c9ff342c8f339aULL, 0x152c734139a860a3ULL, + 0x35776eb2b270c04dULL, 0x0f8d8b41f11c4608ULL, 0x0c2071665be6b288ULL, + 0xc034e212b3f71d88ULL, 0x071d961ef3276f99ULL, 0xf98598ee75b60773ULL, + 0x062062c58c6724e4ULL, 0xd156438e2125572cULL, 0x38552d59a7f0f7c8ULL, + 0x1a402178206e413cULL, 0x1f1f996c68293b26ULL, 0x8bce3cafe1730f7eULL, + 0x2d0480a0828f6bf5ULL, 0x6c99cffa171f92f6ULL, 0x0087f842bb0ac681ULL, + 0x11d7ed06e1e7fd3eULL, 0x07cb1186f2385dc6ULL, 0x5d7763ebff1e170fULL, + 0x2dacc870231ac292ULL, 0x8486317a9ffb390cULL, 0x1c3a6dd20c959ac6ULL, + 0x90dc96e3992e06b8ULL, 0x70d60bfa33e72b67ULL, 0x70c9bddd0985ee63ULL, + 0x012c9767b3673093ULL, 
0xfcd3bc5580f6a88aULL, 0x0ac80017ef6308c3ULL, + 0xdb67d709ef4bba09ULL, 0x4c63e324f0e247ccULL, 0xa15481d3fe219d60ULL, + 0x094c4279cdccb501ULL, 0x965a28c72575cb82ULL, 0x022869db25e391ebULL, + 0x37f528c146023910ULL, 0x0c1290636917deceULL, 0x9aee25e96251ca9cULL, + 0x728ac5ba853b69c2ULL, 0x9f272c93c4be20c8ULL, 0x06c1aa6319d28124ULL, + 0x4324496b1ca8a4f7ULL, 0x0096ecfe7dfc0189ULL, 0x9e06131b19ae0020ULL, + 0x15278b15902f4597ULL, 0x2a9fece8c13842d8ULL, 0x1d4e6781f0e1355eULL, + 0x6855b712d3dbf7c0ULL, 0x06a07fad99be6f46ULL, 0x3ed9d7957e4d1d7cULL, + 0x0c326f7cbc248bb2ULL, 0xe6363ad2c537cf51ULL, 0x0e12eb1c40723f13ULL, + 0xf5c6ac850afba803ULL, 0x0322a79d615fa9f0ULL, 0x6116696ed97bd5f8ULL, + 0x0d438080fbbdc9f1ULL, 0x2e4dc42c38f1e243ULL, 0x64948e9104f3a5bfULL, + 0x9fd622371bdb5f00ULL, 0x0f12bf082b2a1b6eULL, 0x4b1f8d867d78031cULL, + 0x134392ea9f5ef832ULL, 0xf3d70472321bc23eULL, 0x05fcbe5e9eea268eULL, + 0x136dede7175a22cfULL, 0x1308f8baac2cbcccULL, 0xd691026f0915eb64ULL, + 0x0e49a668345c3a38ULL, 0x24ddbbe8bc96f331ULL, 0x4d2ec9479b640578ULL, + 0x450f0697327b359cULL, 0x32b45360f4488ee0ULL, 0x4f6d9ecec46a105aULL, + 0x5500c63401ae8e80ULL, 0x47dea495cf6f98baULL, 0x13dc9a2dfca80babULL, + 0xe6f8a93f7b24ca92ULL, 0x073f57a6d900a87fULL, 0x9ddb935fd3aa695aULL, + 0x101e98d24b39e8aaULL, 0x6b8d0eb95a507ddcULL, 0x45a908b3903d209bULL, + 0x6c96a3e119e617d4ULL, 0x2442787543d3be48ULL, 0xd3bc055c7544b364ULL, + 0x7693bb042ca8653eULL, 0xb95e3a4ea5d0101eULL, 0x116f0d459bb94a73ULL, + 0x841244b72cdc5e90ULL, 0x1271acced6cb34d3ULL, 0x07d289106524d638ULL, + 0x537c9cf49c01b5bbULL, 0x8a8e16706bb7a5daULL, 0x12e50a9c499dc3a9ULL, + 0x1cade520db2ba830ULL, 0x1add52f000d7db70ULL, 0x12cf15db2ce78e30ULL, + 0x0657eaf606bfc866ULL, 0x4026816d3b05b1d0ULL, 0x1ba0ebdf90128e4aULL, + 0xdfd649375996dd6eULL, 0x0f416e906c23d9aeULL, 0x384273cad0582a24ULL, + 0x2ff27b0378a46189ULL, 0xc4ecd18a2d7a7616ULL, 0x35cef0b5cd51d640ULL, + 0x7d582363643f48b7ULL, 0x0984ad746ad0ab7cULL, 0x2990a999835f9688ULL, + 0x2d4df66a97b19e05ULL, 
0x592c79720af99aa2ULL, 0x052863c230602cd3ULL, + 0x5f5e2b15edcf2840ULL, 0x01dff1b694b978b0ULL, 0x14345a48b622025eULL, + 0x028fab3b6407f715ULL, 0x3455d188e6feca50ULL, 0x1d0d40288fb1b5fdULL, + 0x4685c5c2b6a1e5aeULL, 0x3a2077b1e5fe5adeULL, 0x1bc55d611445a0d8ULL, + 0x05480ae95f3f83feULL, 0xbbb59cfcf7e17fb6ULL, 0x13f7f10970bbb990ULL, + 0x6d00ac169425a352ULL, 0x7da0db397ef2d5d3ULL, 0x5b512a247f8d2479ULL, + 0x637eaa6a977c3c32ULL, 0x3720f0ae37cba89cULL, 0x443df6e6aa7f525bULL, + 0x28664c287dcef321ULL, 0x03c267c00cf35e49ULL, 0x690185572d4021deULL, + 0x2707ff2596e321c2ULL, 0xd865f5af7722c380ULL, 0x1ea285658e33aafbULL, + 0xc257c5e88755bef4ULL, 0x066f67275cfcc31eULL, 0xb09931945cc0fed0ULL, + 0x58c1dc38d6e3a03fULL, 0xf99489678fc94ee8ULL, 0x75045bb99be5758aULL, + 0x6c163bc34b40feefULL, 0x0420063ce7bdd3b4ULL, 0xf86ef10582bf2e28ULL, + 0x162c3449ca14858cULL, 0x94106aa61dfe3280ULL, 0x4073ae7a4e7e4941ULL, + 0x32b13fd179c250b4ULL, 0x0178fbb216a7e744ULL, 0xf840ae2f1cf92669ULL, + 0x18fc709acc80243dULL, 0x20ac2ebd69f4d558ULL, 0x6e580ad9c73ad46aULL, + 0x76d2b535b541c19dULL, 0x6c7a3fb9dd0ce0afULL, 0xc3481689b9754f28ULL, + 0x156e813b6557abdbULL, 0x6ee372e31276eb10ULL, 0x19cf37c038c8d381ULL, + 0x00d4d906c9ae3072ULL, 0x09f03cbb6dfbfd40ULL, 0x461ba31c4125f3cfULL, + 0x25b29fc63ad9f05bULL, 0x6808c95c2dddede9ULL, 0x0564224337066d9bULL, + 0xc87eb5f4a4d966f2ULL, 0x66fc66e1701f5847ULL, 0xc553a3559f74da28ULL, + 0x1dfd841be574df43ULL, 0x3ee2f100c3ebc082ULL, 0x1a2c4f9517b56e89ULL, + 0x502f65c4b535c8ffULL, 0x1da5663ab6f96ec0ULL, 0xba1f80b73988152cULL, + 0x364ff12182ac8dc1ULL, 0xe3457a3c4871db31ULL, 0x6ae9cadf92fd7e84ULL, + 0x9621ba3d6ca15186ULL, 0x00ff5af878c144ceULL, 0x918464dc130101a4ULL, + 0x036511e6b187efa6ULL, 0x06667d66550ff260ULL, 0x7fd18913f9b51bc1ULL, + 0x3740e6b27af77aa8ULL, 0x1f546c2fd358ff8aULL, 0x42f1424e3115c891ULL, + 0x03767db4e3a1bb33ULL, 0xa171a1c564345060ULL, 0x0afcf632fd7b1324ULL, + 0xb59508d933ffb7d0ULL, 0x57d766c42071be83ULL, 0x659f0447546114a2ULL, + 0x4070364481c460aeULL, 
0xa2b9752280644d52ULL, 0x04ab884bea5771bdULL, + 0x87cd135602a232b4ULL, 0x15e54cd9a8155313ULL, 0x1e8005efaa3e1047ULL, + 0x696b93f4ab15d39fULL, 0x0855a8e540de863aULL, 0x0bb11799e79f9426ULL, + 0xeffa61e5c1b579baULL, 0x1e060a1d11808219ULL, 0x10e219205667c599ULL, + 0x2f7b206091c49498ULL, 0xb48854c820064860ULL, 0x21c4aaa3bfbe4a38ULL, + 0x8f4a032a3fa67e9cULL, 0x3146b3823401e2acULL, 0x3afee26f19d88400ULL, + 0x167087c485791d38ULL, 0xb67a1ed945b0fb4bULL, 0x02436eb17e27f1c0ULL, + 0xe05afce2ce2d2790ULL, 0x49c536fc6224cfebULL, 0x178865b3b862b856ULL, + 0x1ce530de26acde5bULL, 0x87312c0b30a06f38ULL, 0x03e653b578558d76ULL, + 0x4d3663c21d8b3accULL, 0x038003c23626914aULL, 0xd9d5a2c052a09451ULL, + 0x39b5acfe08a49384ULL, 0x40f349956d5800e4ULL, 0x0968b6950b1bd8feULL, + 0xd60b2ca030f3779cULL, 0x7c8bc11a23ce18edULL, 0xcc23374e27630bc2ULL, + 0x2e38fc2a8bb33210ULL, 0xe421357814ee5c44ULL, 0x315fb65ea71ec671ULL, + 0xfb1b0223f70ed290ULL, 0x30556c9f983eaf07ULL, 0x8dd438c3d0cd625aULL, + 0x05a8fd0c7ffde71bULL, 0x764d1313b5aeec7aULL, 0x2036af5de9622f47ULL, + 0x508a5bfadda292feULL, 0x3f77f04ba2830e90ULL, 0x9047cd9c66ca66d2ULL, + 0x1168b5318a54eb21ULL, 0xc93462d221da2e15ULL, 0x4c2c7cc54abc066eULL, + 0x767a56fec478240eULL, 0x095de72546595bd3ULL, 0xc9da535865158558ULL, + 0x1baccf36f33e73fbULL, 0xf3d7dbe64df77f18ULL, 0x1f8ebbb7be4850b8ULL, + 0x043c5ed77bce25a1ULL, 0x07d401041b2a178aULL, 0x9181ebb8bd8d5618ULL, + 0x078b935dc3e4034aULL, 0x7b59c08954214300ULL, 0x03570dc2a4f84421ULL, + 0xdd8715b82f6b4078ULL, 0x2bb49c8bb544163bULL, 0xc9eb125564d59686ULL, + 0x5fdc7a38f80b810aULL, 0x3a4a6d8fff686544ULL, 0x28360e2418627d3aULL, + 0x60874244c95ed992ULL, 0x2115cc1dd9c34ed3ULL, 0xfaa3ef61f55e9efcULL, + 0x27ac9b1ef1adc7e6ULL, 0x95ea00478fec3f54ULL, 0x5aea808b2d99ab43ULL, + 0xc8f79e51fe43a580ULL, 0x5dbccd714236ce25ULL, 0x783fa76ed0753458ULL, + 0x48cb290f19d84655ULL, 0xc86a832f7696099aULL, 0x52f30c6fec0e71d3ULL, + 0x77d4e91e8cdeb886ULL, 0x7169a703c6a79ccdULL, 0x98208145b9596f74ULL, + 0x0945695c761c0796ULL, 
0x0be897830d17bae0ULL, 0x033ad3924caeeeb4ULL, + 0xedecb6cfa2d303a8ULL, 0x3f86b074818642e7ULL, 0xeefa7c878a8b03f4ULL, + 0x093c101b80922551ULL, 0xfb3b4e6c26ac0034ULL, 0x162bf87999b94f5eULL, + 0xeaedae76e975b17cULL, 0x1852aa090effe18eULL}; + + static constexpr uint64_t kCLMulUpper[kCLMulNum] = { + 0xbb41199b1d587c69ULL, 0x514d94d55894ee29ULL, 0xebc6cd4d2efd5d16ULL, + 0x042044ad2de477fdULL, 0xb865c8b0fcdf4b15ULL, 0x0724d7e551cc40f3ULL, + 0xb15a16f39edb0bccULL, 0x37d64419ede7a171ULL, 0x2aa01bb80c753401ULL, + 0x06ff3f8a95fdaf4dULL, 0x79898cc0838546deULL, 0x776acbd1b237c60aULL, + 0x4c1753be4f4e0064ULL, 0x0ba9243601206ed3ULL, 0xd567c3b1bf3ec557ULL, + 0x043fac7bcff61fb3ULL, 0x49356232b159fb2fULL, 0x3910c82038102d4dULL, + 0x30592fef753eb300ULL, 0x7b2660e0c92a9e9aULL, 0x8246c9248d671ef0ULL, + 0x5a0dcd95147af5faULL, 0x43fde953909cc0eaULL, 0x06147b972cb96e1bULL, + 0xd84193a6b2411d80ULL, 0x00cd7711b950196fULL, 0x1088f9f4ade7fa64ULL, + 0x05a13096ec113cfbULL, 0x958d816d53b00edcULL, 0x3846154a7cdba9cbULL, + 0x8af516db6b27d1e6ULL, 0x1a1d462ab8a33b13ULL, 0x4040b0ac1b2c754cULL, + 0x05127fe9af2fe1d6ULL, 0x9f96e79374321fa6ULL, 0x06ff64a4d9c326f3ULL, + 0x28709566e158ac15ULL, 0x301701d7111ca51cULL, 0x31e0445d1b9d9544ULL, + 0x0a95aff69bf1d03eULL, 0x7c298c8414ecb879ULL, 0x00801499b4143195ULL, + 0x91521a00dd676a5cULL, 0x2777526a14c2f723ULL, 0xfa26aac6a6357dddULL, + 0x1d265889b0187a4bULL, 0xcd6e70fa8ed283e4ULL, 0x18a815aa50ea92caULL, + 0xc01e082694a263c6ULL, 0x4b40163ba53daf25ULL, 0xbc658caff6501673ULL, + 0x3ba35359586b9652ULL, 0x74f96acc97a4936cULL, 0x3989dfdb0cf1d2cfULL, + 0x358a01eaa50dda32ULL, 0x01109a5ed8f0802bULL, 0x55b84922e63c2958ULL, + 0x55b14843d87551d5ULL, 0x1db8ec61b1b578d8ULL, 0x79a2d49ef8c3658fULL, + 0xa304516816b3fbe0ULL, 0x163ecc09cc7b82f9ULL, 0xab91e8d22aabef00ULL, + 0x0ed6b09262de8354ULL, 0xcfd47d34cf73f6f2ULL, 0x7dbd1db2390bc6c3ULL, + 0x5ae789d3875e7b00ULL, 0x1d60fd0e70fe8fa4ULL, 0x690bc15d5ae4f6f5ULL, + 0x121ef5565104fb44ULL, 0x6e98e89297353b54ULL, 0x42554949249d62edULL, + 
0xd6d6d16b12df78d2ULL, 0x320b33549b74975dULL, 0xd2a0618763d22e00ULL, + 0x0808deb93cba2017ULL, 0x01bd3b2302a2cc70ULL, 0x0b7b8dd4d71c8dd6ULL, + 0x34d60a3382a0756cULL, 0x40984584c8219629ULL, 0xf1152cba10093a66ULL, + 0x068001c6b2159ccbULL, 0x3d70f13c6cda0800ULL, 0x0e6b6746a322b956ULL, + 0x83a494319d8c770bULL, 0x0faecf64a8553e9aULL, 0xa34919222c39b1bcULL, + 0x0c63850d89e71c6fULL, 0x585f0bee92e53dc8ULL, 0x10f222b13b4fa5deULL, + 0x61573114f94252f2ULL, 0x09d59c311fba6c27ULL, 0x014effa7da49ed4eULL, + 0x4a400a1bc1c31d26ULL, 0xc9091c047b484972ULL, 0x3989f341ec2230ccULL, + 0xdcb03a98b3aee41eULL, 0x4a54a676a33a95e1ULL, 0xe499b7753951ef7cULL, + 0x2f43b1d1061d8b48ULL, 0xc3313bdc68ceb146ULL, 0x5159f6bc0e99227fULL, + 0x98128e6d9c05efcaULL, 0x15ea32b27f77815bULL, 0xe882c054e2654eecULL, + 0x003d2cdb8faee8c6ULL, 0xb416dd333a9fe1dfULL, 0x73f6746aefcfc98bULL, + 0x93dc114c10a38d70ULL, 0x05055941657845eaULL, 0x2ed7351347349334ULL, + 0x26fb1ee2c69ae690ULL, 0xa4575d10dc5b28e0ULL, 0x3395b11295e485ebULL, + 0xe840f198a224551cULL, 0x78e6e5a431d941d4ULL, 0xa1fee3ceab27f391ULL, + 0x07d35b3c5698d0dcULL, 0x983c67fca9174a29ULL, 0x2bb6bbae72b5144aULL, + 0xa7730b8d13ce58efULL, 0x51b5272883de1998ULL, 0xb334e128bb55e260ULL, + 0x1cacf5fbbe1b9974ULL, 0x71a9df4bb743de60ULL, 0x5176fe545c2d0d7aULL, + 0xbe592ecf1a16d672ULL, 0x27aa8a30c3efe460ULL, 0x4c78a32f47991e06ULL, + 0x383459294312f26aULL, 0x97ba789127f1490cULL, 0x51c9aa8a3abd1ef1ULL, + 0xcc7355188121e50fULL, 0x0ecb3a178ae334c1ULL, 0x84879a5e574b7160ULL, + 0x0765298f6389e8f3ULL, 0x5c6750435539bb22ULL, 0x11a05cf056c937b5ULL, + 0xb5dc2172dbfb7662ULL, 0x3ffc17915d9f40e8ULL, 0xbc7904daf3b431b0ULL, + 0x71f2088490930a7cULL, 0xa89505fd9efb53c4ULL, 0x02e194afd61c5671ULL, + 0x99a97f4abf35fcecULL, 0x26830aad30fae96fULL, 0x4b2abc16b25cf0b0ULL, + 0x07ec6fffa1cafbdbULL, 0xf38188fde97a280cULL, 0x121335701afff64dULL, + 0xea5ef38b4e672a64ULL, 0x477edbcae3eabf03ULL, 0xa32813cc0e0d244dULL, + 0x13346d2af4972eefULL, 0xcbc18357af1cfa9aULL, 0x561b630316e73fa6ULL, + 
0xe9dfb53249249305ULL, 0x5d2b9dd1479312eeULL, 0x3458008119b56d04ULL, + 0x50e6790b49801385ULL, 0x5bb9febe2349492bULL, 0x0c2813954299098fULL, + 0xf747b0c890a071d5ULL, 0x417e8f82cc028d77ULL, 0xa134fee611d804f8ULL, + 0x24c99ee9a0408761ULL, 0x3ebb224e727137f3ULL, 0x0686022073ceb846ULL, + 0xa05e901fb82ad7daULL, 0x0ece7dc43ab470fcULL, 0x2d334ecc58f7d6a3ULL, + 0x23166fadacc54e40ULL, 0x9c3a4472f839556eULL, 0x071717ab5267a4adULL, + 0xb6600ac351ba3ea0ULL, 0x30ec748313bb63d4ULL, 0xb5374e39287b23ccULL, + 0x074d75e784238aebULL, 0x77315879243914a4ULL, 0x3bbb1971490865f1ULL, + 0xa355c21f4fbe02d3ULL, 0x0027f4bb38c8f402ULL, 0xeef8708e652bc5f0ULL, + 0x7b9aa56cf9440050ULL, 0x113ac03c16cfc924ULL, 0x395db36d3e4bef9fULL, + 0x5d826fabcaa597aeULL, 0x2a77d3c58786d7e0ULL, 0x85996859a3ba19d4ULL, + 0x01e7e3c904c2d97fULL, 0x34f90b9b98d51fd0ULL, 0x243aa97fd2e99bb7ULL, + 0x40a0cebc4f65c1e8ULL, 0x46d3922ed4a5503eULL, 0x446e7ecaf1f9c0a4ULL, + 0x49dc11558bc2e6aeULL, 0xe7a9f20881793af8ULL, 0x5771cc4bc98103f1ULL, + 0x2446ea6e718fce90ULL, 0x25d14aca7f7da198ULL, 0x4347af186f9af964ULL, + 0x10cb44fc9146363aULL, 0x8a35587afce476b4ULL, 0x575144662fee3d3aULL, + 0x69f41177a6bc7a05ULL, 0x02ff8c38d6b3c898ULL, 0x57c73589a226ca40ULL, + 0x732f6b5baae66683ULL, 0x00c008bbedd4bb34ULL, 0x7412ff09524d6cadULL, + 0xb8fd0b5ad8c145a8ULL, 0x74bd9f94b6cdc7dfULL, 0x68233b317ca6c19cULL, + 0x314b9c2c08b15c54ULL, 0x5bd1ad72072ebd08ULL, 0x6610e6a6c07030e4ULL, + 0xa4fc38e885ead7ceULL, 0x36975d1ca439e034ULL, 0xa358f0fe358ffb1aULL, + 0x38e247ad663acf7dULL, 0x77daed3643b5deb8ULL, 0x5507c2aeae1ec3d0ULL, + 0xfdec226c73acf775ULL, 0x1b87ff5f5033492dULL, 0xa832dee545d9033fULL, + 0x1cee43a61e41783bULL, 0xdff82b2e2d822f69ULL, 0x2bbc9a376cb38cf2ULL, + 0x117b1cdaf765dc02ULL, 0x26a407f5682be270ULL, 0x8eb664cf5634af28ULL, + 0x17cb4513bec68551ULL, 0xb0df6527900cbfd0ULL, 0x335a2dc79c5afdfcULL, + 0xa2f0ca4cd38dca88ULL, 0x1c370713b81a2de1ULL, 0x849d5df654d1adfcULL, + 0x2fd1f7675ae14e44ULL, 0x4ff64dfc02247f7bULL, 0x3a2bcf40e395a48dULL, + 
0x436248c821b187c1ULL, 0x29f4337b1c7104c0ULL, 0xfc317c46e6630ec4ULL, + 0x2774bccc4e3264c7ULL, 0x2d03218d9d5bee23ULL, 0x36a0ed04d659058aULL, + 0x452484461573cab6ULL, 0x0708edf87ed6272bULL, 0xf07960a1587446cbULL, + 0x3660167b067d84e0ULL, 0x65990a6993ddf8c4ULL, 0x0b197cd3d0b40b3fULL, + 0x1dcec4ab619f3a05ULL, 0x722ab223a84f9182ULL, 0x0822d61a81e7c38fULL, + 0x3d22ad75da563201ULL, 0x93cef6979fd35e0fULL, 0x05c3c25ae598b14cULL, + 0x1338df97dd496377ULL, 0x15bc324dc9c20acfULL, 0x96397c6127e6e8cfULL, + 0x004d01069ef2050fULL, 0x2fcf2e27893fdcbcULL, 0x072f77c3e44f4a5cULL, + 0x5eb1d80b3fe44918ULL, 0x1f59e7c28cc21f22ULL, 0x3390ce5df055c1f8ULL, + 0x4c0ef11df92cb6bfULL, 0x50f82f9e0848c900ULL, 0x08d0fde3ffc0ae38ULL, + 0xbd8d0089a3fbfb73ULL, 0x118ba5b0f311ef59ULL, 0x9be9a8407b926a61ULL, + 0x4ea04fbb21318f63ULL, 0xa1c8e7bb07b871ffULL, 0x1253a7262d5d3b02ULL, + 0x13e997a0512e5b29ULL, 0x54318460ce9055baULL, 0x4e1d8a4db0054798ULL, + 0x0b235226e2cade32ULL, 0x2588732c1476b315ULL, 0x16a378750ba8ac68ULL, + 0xba0b116c04448731ULL, 0x4dd02bd47694c2f1ULL, 0x16d6797b218b6b25ULL, + 0x769eb3709cfbf936ULL, 0x197746a0ce396f38ULL, 0x7d17ad8465961d6eULL, + 0xfe58f4998ae19bb4ULL, 0x36df24305233ce69ULL, 0xb88a4eb008f4ee72ULL, + 0x302b2eb923334787ULL, 0x15a4e3edbe13d448ULL, 0x39a4bf64dd7730ceULL, + 0xedf25421b31090c4ULL, 0x4d547fc131be3b69ULL, 0x2b316e120ca3b90eULL, + 0x0faf2357bf18a169ULL, 0x71f34b54ee2c1d62ULL, 0x18eaf6e5c93a3824ULL, + 0x7e168ba03c1b4c18ULL, 0x1a534dd586d9e871ULL, 0xa2cccd307f5f8c38ULL, + 0x2999a6fb4dce30f6ULL, 0x8f6d3b02c1d549a6ULL, 0x5cf7f90d817aac5aULL, + 0xd2a4ceefe66c8170ULL, 0x11560edc4ca959feULL, 0x89e517e6f0dc464dULL, + 0x75bb8972dddd2085ULL, 0x13859ed1e459d65aULL, 0x057114653326fa84ULL, + 0xe2e6f465173cc86cULL, 0x0ada4076497d7de4ULL, 0xa856fa10ec6dbf8aULL, + 0x41505d9a7c25d875ULL, 0x3091b6278382eccdULL, 0x055737185b2c3f13ULL, + 0x2f4df8ecd6f9c632ULL, 0x0633e89c33552d98ULL, 0xf7673724d16db440ULL, + 0x7331bd08e636c391ULL, 0x0252f29672fee426ULL, 0x1fc384946b6b9ddeULL, + 
0x03460c12c901443aULL, 0x003a0792e10abcdaULL, 0x8dbec31f624e37d0ULL, + 0x667420d5bfe4dcbeULL, 0xfbfa30e874ed7641ULL, 0x46d1ae14db7ecef6ULL, + 0x216bd7e8f5448768ULL, 0x32bcd40d3d69cc88ULL, 0x2e991dbc39b65abeULL, + 0x0e8fb123a502f553ULL, 0x3d2d486b2c7560c0ULL, 0x09aba1db3079fe03ULL, + 0xcb540c59398c9bceULL, 0x363970e5339ed600ULL, 0x2caee457c28af00eULL, + 0x005e7d7ee47f41a0ULL, 0x69fad3eb10f44100ULL, 0x048109388c75beb3ULL, + 0x253dddf96c7a6fb8ULL, 0x4c47f705b9d47d09ULL, 0x6cec894228b5e978ULL, + 0x04044bb9f8ff45c2ULL, 0x079e75704d775caeULL, 0x073bd54d2a9e2c33ULL, + 0xcec7289270a364fbULL, 0x19e7486f19cd9e4eULL, 0xb50ac15b86b76608ULL, + 0x0620cf81f165c812ULL, 0x63eaaf13be7b11d4ULL, 0x0e0cf831948248c2ULL, + 0xf0412df8f46e7957ULL, 0x671c1fe752517e3fULL, 0x8841bfb04dd3f540ULL, + 0x122de4142249f353ULL, 0x40a4959fb0e76870ULL, 0x25cfd3d4b4bbc459ULL, + 0x78a07c82930c60d0ULL, 0x12c2de24d4cbc969ULL, 0x85d44866096ad7f4ULL, + 0x1fd917ca66b2007bULL, 0x01fbbb0751764764ULL, 0x3d2a4953c6fe0fdcULL, + 0xcc1489c5737afd94ULL, 0x1817c5b6a5346f41ULL, 0xe605a6a7e9985644ULL, + 0x3c50412328ff1946ULL, 0xd8c7fd65817f1291ULL, 0x0bd66975ab66339bULL, + 0x2baf8fa1c7d10fa9ULL, 0x24abdf06ddef848dULL, 0x14df0c9b2ea4f6c2ULL, + 0x2be950edfd2cb1f7ULL, 0x21911e21094178b6ULL, 0x0fa54d518a93b379ULL, + 0xb52508e0ac01ab42ULL, 0x0e035b5fd8cb79beULL, 0x1c1c6d1a3b3c8648ULL, + 0x286037b42ea9871cULL, 0xfe67bf311e48a340ULL, 0x02324131e932a472ULL, + 0x2486dc2dd919e2deULL, 0x008aec7f1da1d2ebULL, 0x63269ba0e8d3eb3aULL, + 0x23c0f11154adb62fULL, 0xc6052393ecd4c018ULL, 0x523585b7d2f5b9fcULL, + 0xf7e6f8c1e87564c9ULL, 0x09eb9fe5dd32c1a3ULL, 0x4d4f86886e055472ULL, + 0x67ea17b58a37966bULL, 0x3d3ce8c23b1ed1a8ULL, 0x0df97c5ac48857ceULL, + 0x9b6992623759eb12ULL, 0x275aa9551ae091f2ULL, 0x08855e19ac5e62e5ULL, + 0x1155fffe0ae083ccULL, 0xbc9c78db7c570240ULL, 0x074560c447dd2418ULL, + 0x3bf78d330bcf1e70ULL, 0x49867cd4b7ed134bULL, 0x8e6eee0cb4470accULL, + 0x1dabafdf59233dd6ULL, 0xea3a50d844fc3fb8ULL, 0x4f03f4454764cb87ULL, + 
0x1f2f41cc36c9e6ecULL, 0x53cba4df42963441ULL, 0x10883b70a88d91fbULL, + 0x62b1fc77d4eb9481ULL, 0x893d8f2604b362e1ULL, 0x0933b7855368b440ULL, + 0x9351b545703b2fceULL, 0x59c1d489b9bdd3b4ULL, 0xe72a9c4311417b18ULL, + 0x5355df77e88eb226ULL, 0xe802c37aa963d7e1ULL, 0x381c3747bd6c3bc3ULL, + 0x378565573444258cULL, 0x37848b1e52b43c18ULL, 0x5da2cd32bdce12b6ULL, + 0x13166c5da615f6fdULL, 0xa51ef95efcc66ac8ULL, 0x640c95e473f1e541ULL, + 0x6ec68def1f217500ULL, 0x49ce3543c76a4079ULL, 0x5fc6fd3cddc706b5ULL, + 0x05c3c0f0f6a1fb0dULL, 0xe7820c0996ad1bddULL, 0x21f0d752a088f35cULL, + 0x755405b51d6fc4a0ULL, 0x7ec7649ca4b0e351ULL, 0x3d2b6a46a251f790ULL, + 0x23e1176b19f418adULL, 0x06056575efe8ac05ULL, 0x0f75981b6966e477ULL, + 0x06e87ec41ad437e4ULL, 0x43f6c255d5e1cb84ULL, 0xe4e67d1120ceb580ULL, + 0x2cd67b9e12c26d7bULL, 0xcd00b5ff7fd187f1ULL, 0x3f6cd40accdc4106ULL, + 0x3e895c835459b330ULL, 0x0814d53a217c0850ULL, 0xc9111fe78bc3a62dULL, + 0x719967e351473204ULL, 0xe757707d24282aa4ULL, 0x7226b7f5607f98e6ULL, + 0x7b268ffae3c08d96ULL, 0x16d3917c8b86020eULL, 0x5128bca51c49ea64ULL, + 0x345ffea02bb1698dULL, 0x9460f5111fe4fbc8ULL, 0x60dd1aa5762852cbULL, + 0xbb7440ed3c81667cULL, 0x0a4b12affa7f6f5cULL, 0x95cbcb0ae03861b6ULL, + 0x07ab3b0591db6070ULL, 0xc6476a4c3de78982ULL, 0x204e82e8623ad725ULL, + 0x569a5b4e8ac2a5ccULL, 0x425a1d77d72ebae2ULL, 0xcdaad5551ab33830ULL, + 0x0b7c68fd8422939eULL, 0x46d9a01f53ec3020ULL, 0x102871edbb29e852ULL, + 0x7a8e8084039075a5ULL, 0x40eaede8615e376aULL, 0x4dc67d757a1c751fULL, + 0x1176ef33063f9145ULL, 0x4ea230285b1c8156ULL, 0x6b2aa46ce0027392ULL, + 0x32b13230fba1b068ULL, 0x0e69796851bb984fULL, 0xb749f4542db698c0ULL, + 0x19ad0241ffffd49cULL, 0x2f41e92ef6caff52ULL, 0x4d0b068576747439ULL, + 0x14d607aef7463e00ULL, 0x1443d00d85fb440eULL, 0x529b43bf68688780ULL, + 0x21133a6bc3a3e378ULL, 0x865b6436dae0e7e5ULL, 0x6b4fe83dc1d6defcULL, + 0x03a5858a0ca0be46ULL, 0x1e841b187e67f312ULL, 0x61ee22ef40a66940ULL, + 0x0494bd2e9e741ef8ULL, 0x4eb59e323010e72cULL, 0x19f2abcfb749810eULL, + 
0xb30f1e4f994ef9bcULL, 0x53cf6cdd51bd2d96ULL, 0x263943036497a514ULL, + 0x0d4b52170aa2edbaULL, 0x0c4758a1c7b4f758ULL, 0x178dadb1b502b51aULL, + 0x1ddbb20a602eb57aULL, 0x1fc2e2564a9f27fdULL, 0xd5f8c50a0e3d6f90ULL, + 0x0081da3bbe72ac09ULL, 0xcf140d002ccdb200ULL, 0x0ae8389f09b017feULL, + 0x17cc9ffdc03f4440ULL, 0x04eb921d704bcdddULL, 0x139a0ce4cdc521abULL, + 0x0bfce00c145cb0f0ULL, 0x99925ff132eff707ULL, 0x063f6e5da50c3d35ULL, + 0xa0c25dea3f0e6e29ULL, 0x0c7a9048cc8e040fULL, + }; + + const size_t padded = RoundUpTo(kCLMulNum, N); + auto expected_lower = AllocateAligned<T>(padded); + auto expected_upper = AllocateAligned<T>(padded); + CopyBytes<kCLMulNum * sizeof(T)>(kCLMulLower, expected_lower.get()); + CopyBytes<kCLMulNum * sizeof(T)>(kCLMulUpper, expected_upper.get()); + const size_t padding_size = (padded - kCLMulNum) * sizeof(T); + memset(expected_lower.get() + kCLMulNum, 0, padding_size); + memset(expected_upper.get() + kCLMulNum, 0, padding_size); + + // Random inputs in each lane + RandomState rng; + for (size_t rep = 0; rep < kCLMulNum / N; ++rep) { + for (size_t i = 0; i < N; ++i) { + in1[i] = Random64(&rng); + in2[i] = Random64(&rng); + } + + const auto a = Load(d, in1.get()); + const auto b = Load(d, in2.get()); +#if HWY_PRINT_CLMUL_GOLDEN + Store(CLMulLower(a, b), d, expected_lower.get() + rep * N); + Store(CLMulUpper(a, b), d, expected_upper.get() + rep * N); +#else + HWY_ASSERT_VEC_EQ(d, expected_lower.get() + rep * N, CLMulLower(a, b)); + HWY_ASSERT_VEC_EQ(d, expected_upper.get() + rep * N, CLMulUpper(a, b)); +#endif + } + +#if HWY_PRINT_CLMUL_GOLDEN + // RVV lacks PRIu64, so print 32-bit halves. 
+ for (size_t i = 0; i < kCLMulNum; ++i) { + printf("0x%08x%08xULL,", static_cast<uint32_t>(expected_lower[i] >> 32), + static_cast<uint32_t>(expected_lower[i] & 0xFFFFFFFFU)); + } + printf("\n"); + for (size_t i = 0; i < kCLMulNum; ++i) { + printf("0x%08x%08xULL,", static_cast<uint32_t>(expected_upper[i] >> 32), + static_cast<uint32_t>(expected_upper[i] & 0xFFFFFFFFU)); + } +#endif // HWY_PRINT_CLMUL_GOLDEN +#else + (void)d; +#endif + } +}; + +HWY_NOINLINE void TestAllCLMul() { ForGEVectors<128, TestCLMul>()(uint64_t()); } + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyCryptoTest); +HWY_EXPORT_AND_TEST_P(HwyCryptoTest, TestAllAES); +HWY_EXPORT_AND_TEST_P(HwyCryptoTest, TestAllCLMul); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/demote_test.cc b/third_party/highway/hwy/tests/demote_test.cc new file mode 100644 index 0000000000..22469113d5 --- /dev/null +++ b/third_party/highway/hwy/tests/demote_test.cc @@ -0,0 +1,328 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <stddef.h> +#include <stdint.h> + +#include <cmath> // std::isfinite + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/demote_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +// Causes build timeout. +#if !HWY_IS_MSAN + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template <typename T, HWY_IF_FLOAT(T)> +bool IsFiniteT(T t) { + return std::isfinite(t); +} +// Wrapper avoids calling std::isfinite for integer types (ambiguous). +template <typename T, HWY_IF_NOT_FLOAT(T)> +bool IsFiniteT(T /*unused*/) { + return true; +} + +template <typename ToT> +struct TestDemoteTo { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D from_d) { + static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output"); + static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider"); + const Rebind<ToT, D> to_d; + + const size_t N = Lanes(from_d); + auto from = AllocateAligned<T>(N); + auto expected = AllocateAligned<ToT>(N); + + // Narrower range in the wider type, for clamping before we cast + const T min = LimitsMin<ToT>(); + const T max = LimitsMax<ToT>(); + + const auto value_ok = [&](T& value) { + if (!IsFiniteT(value)) return false; + return true; + }; + + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { + for (size_t i = 0; i < N; ++i) { + do { + const uint64_t bits = rng(); + CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size + } while (!value_ok(from[i])); + expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max)); + } + + const auto in = Load(from_d, from.get()); + HWY_ASSERT_VEC_EQ(to_d, expected.get(), DemoteTo(to_d, in)); + } + } +}; + +HWY_NOINLINE void TestAllDemoteToInt() { + ForDemoteVectors<TestDemoteTo<uint8_t>>()(int16_t()); + ForDemoteVectors<TestDemoteTo<uint8_t>, 2>()(int32_t()); + + ForDemoteVectors<TestDemoteTo<int8_t>>()(int16_t()); + 
ForDemoteVectors<TestDemoteTo<int8_t>, 2>()(int32_t()); + + const ForDemoteVectors<TestDemoteTo<uint16_t>> to_u16; + to_u16(int32_t()); + + const ForDemoteVectors<TestDemoteTo<int16_t>> to_i16; + to_i16(int32_t()); +} + +HWY_NOINLINE void TestAllDemoteToMixed() { +#if HWY_HAVE_FLOAT64 + const ForDemoteVectors<TestDemoteTo<int32_t>> to_i32; + to_i32(double()); +#endif +} + +template <typename ToT> +struct TestDemoteToFloat { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D from_d) { + // For floats, we clamp differently and cannot call LimitsMin. + static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output"); + static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider"); + const Rebind<ToT, D> to_d; + + const size_t N = Lanes(from_d); + auto from = AllocateAligned<T>(N); + auto expected = AllocateAligned<ToT>(N); + + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { + for (size_t i = 0; i < N; ++i) { + do { + const uint64_t bits = rng(); + CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size + } while (!IsFiniteT(from[i])); + const T magn = std::abs(from[i]); + const T max_abs = HighestValue<ToT>(); + // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see + // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html + const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]); + expected[i] = static_cast<ToT>(clipped); + } + + HWY_ASSERT_VEC_EQ(to_d, expected.get(), + DemoteTo(to_d, Load(from_d, from.get()))); + } + } +}; + +HWY_NOINLINE void TestAllDemoteToFloat() { + // Must test f16 separately because we can only load/store/convert them. 
+ +#if HWY_HAVE_FLOAT64 + const ForDemoteVectors<TestDemoteToFloat<float>, 1> to_float; + to_float(double()); +#endif +} + +template <class D> +AlignedFreeUniquePtr<float[]> ReorderBF16TestCases(D d, size_t& padded) { + const float test_cases[] = { + // Same as BF16TestCases: + // +/- 1 + 1.0f, + -1.0f, + // +/- 0 + 0.0f, + -0.0f, + // near 0 + 0.25f, + -0.25f, + // +/- integer + 4.0f, + -32.0f, + // positive +/- delta + 2.015625f, + 3.984375f, + // negative +/- delta + -2.015625f, + -3.984375f, + + // No huge values - would interfere with sum. But add more to fill 2 * N: + -2.0f, + -10.0f, + 0.03125f, + 1.03125f, + 1.5f, + 2.0f, + 4.0f, + 5.0f, + 6.0f, + 8.0f, + 10.0f, + 256.0f, + 448.0f, + 2080.0f, + }; + const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]); + const size_t N = Lanes(d); + padded = RoundUpTo(kNumTestCases, 2 * N); // allow loading pairs of vectors + auto in = AllocateAligned<float>(padded); + auto expected = AllocateAligned<float>(padded); + std::copy(test_cases, test_cases + kNumTestCases, in.get()); + std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f); + return in; +} + +class TestReorderDemote2To { + // In-place N^2 selection sort to avoid dependencies + void Sort(float* p, size_t count) { + for (size_t i = 0; i < count - 1; ++i) { + // Find min_element + size_t idx_min = i; + for (size_t j = i + 1; j < count; j++) { + if (p[j] < p[idx_min]) { + idx_min = j; + } + } + + // Swap with current + const float tmp = p[i]; + p[i] = p[idx_min]; + p[idx_min] = tmp; + } + } + + public: + template <typename TF32, class DF32> + HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) { +#if HWY_TARGET != HWY_SCALAR + size_t padded; + auto in = ReorderBF16TestCases(d32, padded); + + using TBF16 = bfloat16_t; + const Repartition<TBF16, DF32> dbf16; + const Half<decltype(dbf16)> dbf16_half; + const size_t N = Lanes(d32); + auto temp16 = AllocateAligned<TBF16>(2 * N); + auto expected = AllocateAligned<float>(2 * N); + auto actual = 
AllocateAligned<float>(2 * N); + + for (size_t i = 0; i < padded; i += 2 * N) { + const auto f0 = Load(d32, &in[i + 0]); + const auto f1 = Load(d32, &in[i + N]); + const auto v16 = ReorderDemote2To(dbf16, f0, f1); + Store(v16, dbf16, temp16.get()); + const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0)); + const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N)); + + // Smoke test: sum should be same (with tolerance for non-associativity) + const auto sum_expected = GetLane(SumOfLanes(d32, Add(f0, f1))); + const auto sum_actual = + GetLane(SumOfLanes(d32, Add(promoted0, promoted1))); + + HWY_ASSERT(sum_expected - 1E-4 <= sum_actual && + sum_actual <= sum_expected + 1E-4); + + // Ensure values are the same after sorting to undo the Reorder + Store(f0, d32, expected.get() + 0); + Store(f1, d32, expected.get() + N); + Store(promoted0, d32, actual.get() + 0); + Store(promoted1, d32, actual.get() + N); + Sort(expected.get(), 2 * N); + Sort(actual.get(), 2 * N); + HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0)); + HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N)); + } +#else // HWY_SCALAR + (void)d32; +#endif + } +}; + +HWY_NOINLINE void TestAllReorderDemote2To() { + ForShrinkableVectors<TestReorderDemote2To>()(float()); +} + +struct TestI32F64 { + template <typename TF, class DF> + HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { + using TI = int32_t; + const Rebind<TI, DF> di; + const size_t N = Lanes(df); + + // Integer positive + HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0)))); + + // Integer negative + HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N)))); + + // Above positive + HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001)))); + + // Below positive + HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999)))); + + const TF eps = static_cast<TF>(0.0001); + // Above negative + 
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), + DemoteTo(di, Iota(df, -TF(N + 1) + eps))); + + // Below negative + HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)), + DemoteTo(di, Iota(df, -TF(N + 1) - eps))); + + // Huge positive float + HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()), + DemoteTo(di, Set(df, TF(1E12)))); + + // Huge negative float + HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()), + DemoteTo(di, Set(df, TF(-1E12)))); + } +}; + +HWY_NOINLINE void TestAllI32F64() { +#if HWY_HAVE_FLOAT64 + ForDemoteVectors<TestI32F64>()(double()); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // !HWY_IS_MSAN + +#if HWY_ONCE + +namespace hwy { +#if !HWY_IS_MSAN +HWY_BEFORE_TEST(HwyDemoteTest); +HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToInt); +HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToMixed); +HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToFloat); +HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllReorderDemote2To); +HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllI32F64); +#endif // !HWY_IS_MSAN +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/float_test.cc b/third_party/highway/hwy/tests/float_test.cc new file mode 100644 index 0000000000..bc6d9020e6 --- /dev/null +++ b/third_party/highway/hwy/tests/float_test.cc @@ -0,0 +1,350 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Tests some ops specific to floating-point types (Div, Round etc.) + +#include <stddef.h> +#include <stdint.h> + +#include <algorithm> // std::copy, std::fill +#include <limits> +#include <cmath> // std::abs, std::isnan, std::isinf, std::ceil, std::floor + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/float_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestDiv { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v = Iota(d, T(-2)); + const auto v1 = Set(d, T(1)); + + // Unchanged after division by 1. + HWY_ASSERT_VEC_EQ(d, v, Div(v, v1)); + + const size_t N = Lanes(d); + auto expected = AllocateAligned<T>(N); + for (size_t i = 0; i < N; ++i) { + expected[i] = (T(i) - 2) / T(2); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Div(v, Set(d, T(2)))); + } +}; + +HWY_NOINLINE void TestAllDiv() { ForFloatTypes(ForPartialVectors<TestDiv>()); } + +struct TestApproximateReciprocal { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v = Iota(d, T(-2)); + const auto nonzero = IfThenElse(Eq(v, Zero(d)), Set(d, T(1)), v); + const size_t N = Lanes(d); + auto input = AllocateAligned<T>(N); + Store(nonzero, d, input.get()); + + auto actual = AllocateAligned<T>(N); + Store(ApproximateReciprocal(nonzero), d, actual.get()); + + double max_l1 = 0.0; + double worst_expected = 0.0; + double worst_actual = 0.0; + for (size_t i = 0; i < N; ++i) { + const double expected = 1.0 / input[i]; + const double l1 = std::abs(expected - actual[i]); + if (l1 > max_l1) { + max_l1 = l1; + worst_expected = expected; + worst_actual = actual[i]; + } + } + const double abs_worst_expected = std::abs(worst_expected); + if 
(abs_worst_expected > 1E-5) { + const double max_rel = max_l1 / abs_worst_expected; + fprintf(stderr, "max l1 %f rel %f (%f vs %f)\n", max_l1, max_rel, + worst_expected, worst_actual); + HWY_ASSERT(max_rel < 0.004); + } + } +}; + +HWY_NOINLINE void TestAllApproximateReciprocal() { + ForPartialVectors<TestApproximateReciprocal>()(float()); +} + +struct TestSquareRoot { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto vi = Iota(d, 0); + HWY_ASSERT_VEC_EQ(d, vi, Sqrt(Mul(vi, vi))); + } +}; + +HWY_NOINLINE void TestAllSquareRoot() { + ForFloatTypes(ForPartialVectors<TestSquareRoot>()); +} + +struct TestReciprocalSquareRoot { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v = Set(d, 123.0f); + const size_t N = Lanes(d); + auto lanes = AllocateAligned<T>(N); + Store(ApproximateReciprocalSqrt(v), d, lanes.get()); + for (size_t i = 0; i < N; ++i) { + float err = lanes[i] - 0.090166f; + if (err < 0.0f) err = -err; + if (err >= 4E-4f) { + HWY_ABORT("Lane %d (%d): actual %f err %f\n", static_cast<int>(i), + static_cast<int>(N), lanes[i], err); + } + } + } +}; + +HWY_NOINLINE void TestAllReciprocalSquareRoot() { + ForPartialVectors<TestReciprocalSquareRoot>()(float()); +} + +template <typename T, class D> +AlignedFreeUniquePtr<T[]> RoundTestCases(T /*unused*/, D d, size_t& padded) { + const T eps = std::numeric_limits<T>::epsilon(); + const T test_cases[] = { + // +/- 1 + T(1), + T(-1), + // +/- 0 + T(0), + T(-0), + // near 0 + T(0.4), + T(-0.4), + // +/- integer + T(4), + T(-32), + // positive near limit + MantissaEnd<T>() - T(1.5), + MantissaEnd<T>() + T(1.5), + // negative near limit + -MantissaEnd<T>() - T(1.5), + -MantissaEnd<T>() + T(1.5), + // positive tiebreak + T(1.5), + T(2.5), + // negative tiebreak + T(-1.5), + T(-2.5), + // positive +/- delta + T(2.0001), + T(3.9999), + // negative +/- delta + T(-999.9999), + T(-998.0001), + // positive +/- epsilon + T(1) + eps, + 
T(1) - eps, + // negative +/- epsilon + T(-1) + eps, + T(-1) - eps, + // +/- huge (but still fits in float) + T(1E34), + T(-1E35), + // +/- infinity + std::numeric_limits<T>::infinity(), + -std::numeric_limits<T>::infinity(), + // qNaN + GetLane(NaN(d)) + }; + const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]); + const size_t N = Lanes(d); + padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors + auto in = AllocateAligned<T>(padded); + auto expected = AllocateAligned<T>(padded); + std::copy(test_cases, test_cases + kNumTestCases, in.get()); + std::fill(in.get() + kNumTestCases, in.get() + padded, T(0)); + return in; +} + +struct TestRound { + template <typename T, class D> + HWY_NOINLINE void operator()(T t, D d) { + size_t padded; + auto in = RoundTestCases(t, d, padded); + auto expected = AllocateAligned<T>(padded); + + for (size_t i = 0; i < padded; ++i) { + // Avoid [std::]round, which does not round to nearest *even*. + // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see + // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html + expected[i] = static_cast<T>(nearbyint(in[i])); + } + for (size_t i = 0; i < padded; i += Lanes(d)) { + HWY_ASSERT_VEC_EQ(d, &expected[i], Round(Load(d, &in[i]))); + } + } +}; + +HWY_NOINLINE void TestAllRound() { + ForFloatTypes(ForPartialVectors<TestRound>()); +} + +struct TestNearestInt { + template <typename TF, class DF> + HWY_NOINLINE void operator()(TF tf, const DF df) { + using TI = MakeSigned<TF>; + const RebindToSigned<DF> di; + + size_t padded; + auto in = RoundTestCases(tf, df, padded); + auto expected = AllocateAligned<TI>(padded); + + constexpr double max = static_cast<double>(LimitsMax<TI>()); + for (size_t i = 0; i < padded; ++i) { + if (std::isnan(in[i])) { + // We replace NaN with 0 below (no_nan) + expected[i] = 0; + } else if (std::isinf(in[i]) || double{std::abs(in[i])} >= max) { + // Avoid undefined result for lrintf + expected[i] = 
std::signbit(in[i]) ? LimitsMin<TI>() : LimitsMax<TI>(); + } else { + expected[i] = static_cast<TI>(lrintf(in[i])); + } + } + for (size_t i = 0; i < padded; i += Lanes(df)) { + const auto v = Load(df, &in[i]); + const auto no_nan = IfThenElse(Eq(v, v), v, Zero(df)); + HWY_ASSERT_VEC_EQ(di, &expected[i], NearestInt(no_nan)); + } + } +}; + +HWY_NOINLINE void TestAllNearestInt() { + ForPartialVectors<TestNearestInt>()(float()); +} + +struct TestTrunc { + template <typename T, class D> + HWY_NOINLINE void operator()(T t, D d) { + size_t padded; + auto in = RoundTestCases(t, d, padded); + auto expected = AllocateAligned<T>(padded); + + for (size_t i = 0; i < padded; ++i) { + // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see + // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html + expected[i] = static_cast<T>(trunc(in[i])); + } + for (size_t i = 0; i < padded; i += Lanes(d)) { + HWY_ASSERT_VEC_EQ(d, &expected[i], Trunc(Load(d, &in[i]))); + } + } +}; + +HWY_NOINLINE void TestAllTrunc() { + ForFloatTypes(ForPartialVectors<TestTrunc>()); +} + +struct TestCeil { + template <typename T, class D> + HWY_NOINLINE void operator()(T t, D d) { + size_t padded; + auto in = RoundTestCases(t, d, padded); + auto expected = AllocateAligned<T>(padded); + + for (size_t i = 0; i < padded; ++i) { + expected[i] = std::ceil(in[i]); + } + for (size_t i = 0; i < padded; i += Lanes(d)) { + HWY_ASSERT_VEC_EQ(d, &expected[i], Ceil(Load(d, &in[i]))); + } + } +}; + +HWY_NOINLINE void TestAllCeil() { + ForFloatTypes(ForPartialVectors<TestCeil>()); +} + +struct TestFloor { + template <typename T, class D> + HWY_NOINLINE void operator()(T t, D d) { + size_t padded; + auto in = RoundTestCases(t, d, padded); + auto expected = AllocateAligned<T>(padded); + + for (size_t i = 0; i < padded; ++i) { + expected[i] = std::floor(in[i]); + } + for (size_t i = 0; i < padded; i += Lanes(d)) { + HWY_ASSERT_VEC_EQ(d, &expected[i], Floor(Load(d, &in[i]))); + } + } +}; + 
+HWY_NOINLINE void TestAllFloor() { + ForFloatTypes(ForPartialVectors<TestFloor>()); +} + +struct TestAbsDiff { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto in_lanes_a = AllocateAligned<T>(N); + auto in_lanes_b = AllocateAligned<T>(N); + auto out_lanes = AllocateAligned<T>(N); + for (size_t i = 0; i < N; ++i) { + in_lanes_a[i] = static_cast<T>((i ^ 1u) << i); + in_lanes_b[i] = static_cast<T>(i << i); + out_lanes[i] = std::abs(in_lanes_a[i] - in_lanes_b[i]); + } + const auto a = Load(d, in_lanes_a.get()); + const auto b = Load(d, in_lanes_b.get()); + const auto expected = Load(d, out_lanes.get()); + HWY_ASSERT_VEC_EQ(d, expected, AbsDiff(a, b)); + HWY_ASSERT_VEC_EQ(d, expected, AbsDiff(b, a)); + } +}; + +HWY_NOINLINE void TestAllAbsDiff() { + ForPartialVectors<TestAbsDiff>()(float()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyFloatTest); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllDiv); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllApproximateReciprocal); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllSquareRoot); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllReciprocalSquareRoot); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllRound); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllNearestInt); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllTrunc); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllCeil); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllFloor); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllAbsDiff); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/hwy_gtest.h b/third_party/highway/hwy/tests/hwy_gtest.h new file mode 100644 index 0000000000..a4c21cd171 --- /dev/null +++ b/third_party/highway/hwy/tests/hwy_gtest.h @@ -0,0 +1,157 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache 
License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HWY_TESTS_HWY_GTEST_H_ +#define HWY_TESTS_HWY_GTEST_H_ + +// Adapters for GUnit to run tests for all targets. + +#include <stddef.h> +#include <stdint.h> + +#include <string> +#include <tuple> + +#include "gtest/gtest.h" +#include "hwy/highway.h" + +namespace hwy { + +// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead +// used INSTANTIATE_TEST_CASE_P which is now deprecated. +#ifdef INSTANTIATE_TEST_SUITE_P +#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P +#else +#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P +#endif + +// Helper class to run parametric tests using the hwy target as parameter. To +// use this define the following in your test: +// class MyTestSuite : public TestWithParamTarget { +// ... +// }; +// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite); +// TEST_P(MyTestSuite, MyTest) { ... } +class TestWithParamTarget : public testing::TestWithParam<int64_t> { + protected: + void SetUp() override { SetSupportedTargetsForTest(GetParam()); } + + void TearDown() override { + // Check that the parametric test calls SupportedTargets() when the source + // was compiled with more than one target. In the single-target case only + // static dispatch will be used anyway. 
+#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0 + EXPECT_TRUE(GetChosenTarget().IsInitialized()) + << "This hwy target parametric test doesn't use dynamic-dispatch and " + "doesn't need to be parametric."; +#endif + SetSupportedTargetsForTest(0); + } +}; + +// Function to convert the test parameter of a TestWithParamTarget for +// displaying it in the gtest test name. +static inline std::string TestParamTargetName( + const testing::TestParamInfo<int64_t>& info) { + return TargetName(info.param); +} + +#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite) \ + HWY_GTEST_INSTANTIATE_TEST_SUITE_P( \ + suite##Group, suite, \ + testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \ + ::hwy::TestParamTargetName) + +// Helper class similar to TestWithParamTarget to run parametric tests that +// depend on the target and another parametric test. If you need to use multiple +// extra parameters use a std::tuple<> of them and ::testing::Generate(...) as +// the generator. To use this class define the following in your test: +// class MyTestSuite : public TestWithParamTargetT<int> { +// ... +// }; +// HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(MyTestSuite, ::testing::Range(0, 9)); +// TEST_P(MyTestSuite, MyTest) { ... GetParam() .... } +template <typename T> +class TestWithParamTargetAndT + : public ::testing::TestWithParam<std::tuple<int64_t, T>> { + public: + // Expose the parametric type here so it can be used by the + // HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T macro. + using HwyParamType = T; + + protected: + void SetUp() override { + SetSupportedTargetsForTest(std::get<0>( + ::testing::TestWithParam<std::tuple<int64_t, T>>::GetParam())); + } + + void TearDown() override { + // Check that the parametric test calls SupportedTargets() when the source + // was compiled with more than one target. In the single-target case only + // static dispatch will be used anyway. 
+#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0 + EXPECT_TRUE(GetChosenTarget().IsInitialized()) + << "This hwy target parametric test doesn't use dynamic-dispatch and " + "doesn't need to be parametric."; +#endif + SetSupportedTargetsForTest(0); + } + + T GetParam() { + return std::get<1>( + ::testing::TestWithParam<std::tuple<int64_t, T>>::GetParam()); + } +}; + +template <typename T> +std::string TestParamTargetNameAndT( + const testing::TestParamInfo<std::tuple<int64_t, T>>& info) { + return std::string(TargetName(std::get<0>(info.param))) + "_" + + ::testing::PrintToString(std::get<1>(info.param)); +} + +#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(suite, generator) \ + HWY_GTEST_INSTANTIATE_TEST_SUITE_P( \ + suite##Group, suite, \ + ::testing::Combine( \ + testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \ + generator), \ + ::hwy::TestParamTargetNameAndT<suite::HwyParamType>) + +// Helper macro to export a function and define a test that tests it. This is +// equivalent to do a HWY_EXPORT of a void(void) function and run it in a test: +// class MyTestSuite : public TestWithParamTarget { +// ... 
+// }; +// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite); +// HWY_EXPORT_AND_TEST_P(MyTestSuite, MyTest); +#define HWY_EXPORT_AND_TEST_P(suite, func_name) \ + HWY_EXPORT(func_name); \ + TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(); } \ + static_assert(true, "For requiring trailing semicolon") + +#define HWY_EXPORT_AND_TEST_P_T(suite, func_name) \ + HWY_EXPORT(func_name); \ + TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(GetParam()); } \ + static_assert(true, "For requiring trailing semicolon") + +#define HWY_BEFORE_TEST(suite) \ + class suite : public hwy::TestWithParamTarget {}; \ + HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite); \ + static_assert(true, "For requiring trailing semicolon") + +} // namespace hwy + +#endif // HWY_TESTS_HWY_GTEST_H_ diff --git a/third_party/highway/hwy/tests/if_test.cc b/third_party/highway/hwy/tests/if_test.cc new file mode 100644 index 0000000000..e44a878a0c --- /dev/null +++ b/third_party/highway/hwy/tests/if_test.cc @@ -0,0 +1,175 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <stddef.h> +#include <stdint.h> + +#include "hwy/aligned_allocator.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/if_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestIfThenElse { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + using TI = MakeSigned<T>; // For mask > 0 comparison + const Rebind<TI, D> di; + const size_t N = Lanes(d); + auto in1 = AllocateAligned<T>(N); + auto in2 = AllocateAligned<T>(N); + auto bool_lanes = AllocateAligned<TI>(N); + auto expected = AllocateAligned<T>(N); + + // Each lane should have a chance of having mask=true. + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + in1[i] = static_cast<T>(Random32(&rng)); + in2[i] = static_cast<T>(Random32(&rng)); + bool_lanes[i] = (Random32(&rng) & 16) ? TI(1) : TI(0); + } + + const auto v1 = Load(d, in1.get()); + const auto v2 = Load(d, in2.get()); + const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + + for (size_t i = 0; i < N; ++i) { + expected[i] = bool_lanes[i] ? in1[i] : in2[i]; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElse(mask, v1, v2)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = bool_lanes[i] ? in1[i] : T(0); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElseZero(mask, v1)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = bool_lanes[i] ? 
T(0) : in2[i]; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenZeroElse(mask, v2)); + } + } +}; + +HWY_NOINLINE void TestAllIfThenElse() { + ForAllTypes(ForPartialVectors<TestIfThenElse>()); +} + +struct TestIfVecThenElse { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + using TU = MakeUnsigned<T>; // For all-one mask + const Rebind<TU, D> du; + const size_t N = Lanes(d); + auto in1 = AllocateAligned<T>(N); + auto in2 = AllocateAligned<T>(N); + auto vec_lanes = AllocateAligned<TU>(N); + auto expected = AllocateAligned<T>(N); + + // Each lane should have a chance of having mask=true. + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + in1[i] = static_cast<T>(Random32(&rng)); + in2[i] = static_cast<T>(Random32(&rng)); + vec_lanes[i] = (Random32(&rng) & 16) ? static_cast<TU>(~TU(0)) : TU(0); + } + + const auto v1 = Load(d, in1.get()); + const auto v2 = Load(d, in2.get()); + const auto vec = BitCast(d, Load(du, vec_lanes.get())); + + for (size_t i = 0; i < N; ++i) { + expected[i] = vec_lanes[i] ? 
in1[i] : in2[i]; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), IfVecThenElse(vec, v1, v2)); + } + } +}; + +HWY_NOINLINE void TestAllIfVecThenElse() { + ForAllTypes(ForPartialVectors<TestIfVecThenElse>()); +} + +struct TestZeroIfNegative { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto vp = Iota(d, 1); + const auto vn = Iota(d, T(-1E5)); // assumes N < 10^5 + + // Zero and positive remain unchanged + HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(v0)); + HWY_ASSERT_VEC_EQ(d, vp, ZeroIfNegative(vp)); + + // Negative are all replaced with zero + HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(vn)); + } +}; + +HWY_NOINLINE void TestAllZeroIfNegative() { + ForFloatTypes(ForPartialVectors<TestZeroIfNegative>()); +} + +struct TestIfNegative { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto vp = Iota(d, 1); + const auto vn = Or(vp, SignBit(d)); + + // Zero and positive remain unchanged + HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(v0, vn, v0)); + HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(v0, v0, vn)); + HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vp, vn, vp)); + HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vp, vp, vn)); + + // Negative are replaced with 2nd arg + HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(vn, v0, vp)); + HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vn, vn, v0)); + HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vn, vp, vn)); + } +}; + +HWY_NOINLINE void TestAllIfNegative() { + ForFloatTypes(ForPartialVectors<TestIfNegative>()); + ForSignedTypes(ForPartialVectors<TestIfNegative>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyIfTest); +HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfThenElse); +HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfVecThenElse); +HWY_EXPORT_AND_TEST_P(HwyIfTest, 
TestAllZeroIfNegative); +HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfNegative); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/interleaved_test.cc b/third_party/highway/hwy/tests/interleaved_test.cc new file mode 100644 index 0000000000..4d1fbd5ac5 --- /dev/null +++ b/third_party/highway/hwy/tests/interleaved_test.cc @@ -0,0 +1,256 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stddef.h> +#include <stdint.h> + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/interleaved_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestLoadStoreInterleaved2 { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + + RandomState rng; + + // Data to be interleaved + auto bytes = AllocateAligned<T>(2 * N); + for (size_t i = 0; i < 2 * N; ++i) { + bytes[i] = static_cast<T>(Random32(&rng) & 0xFF); + } + const auto in0 = Load(d, &bytes[0 * N]); + const auto in1 = Load(d, &bytes[1 * N]); + + // Interleave here, ensure vector results match scalar + auto expected = AllocateAligned<T>(3 * N); + auto actual_aligned = AllocateAligned<T>(3 * N + 1); + T* actual = actual_aligned.get() + 1; + + for (size_t rep = 0; rep < 100; ++rep) { + for (size_t i = 
0; i < N; ++i) { + expected[2 * i + 0] = bytes[0 * N + i]; + expected[2 * i + 1] = bytes[1 * N + i]; + // Ensure we do not write more than 2*N bytes + expected[2 * N + i] = actual[2 * N + i] = 0; + } + StoreInterleaved2(in0, in1, d, actual); + size_t pos = 0; + if (!BytesEqual(expected.get(), actual, 3 * N * sizeof(T), &pos)) { + Print(d, "in0", in0, pos / 4); + Print(d, "in1", in1, pos / 4); + const size_t i = pos; + fprintf(stderr, "interleaved i=%d %f %f %f %f %f %f %f %f\n", + static_cast<int>(i), static_cast<double>(actual[i]), + static_cast<double>(actual[i + 1]), + static_cast<double>(actual[i + 2]), + static_cast<double>(actual[i + 3]), + static_cast<double>(actual[i + 4]), + static_cast<double>(actual[i + 5]), + static_cast<double>(actual[i + 6]), + static_cast<double>(actual[i + 7])); + HWY_ASSERT(false); + } + + Vec<D> out0, out1; + LoadInterleaved2(d, actual, out0, out1); + HWY_ASSERT_VEC_EQ(d, in0, out0); + HWY_ASSERT_VEC_EQ(d, in1, out1); + } + } +}; + +HWY_NOINLINE void TestAllLoadStoreInterleaved2() { +#if HWY_TARGET == HWY_RVV + // Segments are limited to 8 registers, so we can only go up to LMUL=2. 
+ const ForExtendableVectors<TestLoadStoreInterleaved2, 2> test; +#else + const ForPartialVectors<TestLoadStoreInterleaved2> test; +#endif + ForAllTypes(test); +} + +// Workaround for build timeout on GCC 12 aarch64, see #776 +#if HWY_COMPILER_GCC_ACTUAL >= 1200 && HWY_ARCH_ARM_A64 +#define HWY_BROKEN_LOAD34 1 +#else +#define HWY_BROKEN_LOAD34 0 +#endif + +#if !HWY_BROKEN_LOAD34 + +struct TestLoadStoreInterleaved3 { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + + RandomState rng; + + // Data to be interleaved + auto bytes = AllocateAligned<T>(3 * N); + for (size_t i = 0; i < 3 * N; ++i) { + bytes[i] = static_cast<T>(Random32(&rng) & 0xFF); + } + const auto in0 = Load(d, &bytes[0 * N]); + const auto in1 = Load(d, &bytes[1 * N]); + const auto in2 = Load(d, &bytes[2 * N]); + + // Interleave here, ensure vector results match scalar + auto expected = AllocateAligned<T>(4 * N); + auto actual_aligned = AllocateAligned<T>(4 * N + 1); + T* actual = actual_aligned.get() + 1; + + for (size_t rep = 0; rep < 100; ++rep) { + for (size_t i = 0; i < N; ++i) { + expected[3 * i + 0] = bytes[0 * N + i]; + expected[3 * i + 1] = bytes[1 * N + i]; + expected[3 * i + 2] = bytes[2 * N + i]; + // Ensure we do not write more than 3*N bytes + expected[3 * N + i] = actual[3 * N + i] = 0; + } + StoreInterleaved3(in0, in1, in2, d, actual); + size_t pos = 0; + if (!BytesEqual(expected.get(), actual, 4 * N * sizeof(T), &pos)) { + Print(d, "in0", in0, pos / 3, N); + Print(d, "in1", in1, pos / 3, N); + Print(d, "in2", in2, pos / 3, N); + const size_t i = pos; + fprintf(stderr, "interleaved i=%d %f %f %f %f %f %f\n", + static_cast<int>(i), static_cast<double>(actual[i]), + static_cast<double>(actual[i + 1]), + static_cast<double>(actual[i + 2]), + static_cast<double>(actual[i + 3]), + static_cast<double>(actual[i + 4]), + static_cast<double>(actual[i + 5])); + HWY_ASSERT(false); + } + + Vec<D> out0, out1, out2; + LoadInterleaved3(d, 
actual, out0, out1, out2); + HWY_ASSERT_VEC_EQ(d, in0, out0); + HWY_ASSERT_VEC_EQ(d, in1, out1); + HWY_ASSERT_VEC_EQ(d, in2, out2); + } + } +}; + +HWY_NOINLINE void TestAllLoadStoreInterleaved3() { +#if HWY_TARGET == HWY_RVV + // Segments are limited to 8 registers, so we can only go up to LMUL=2. + const ForExtendableVectors<TestLoadStoreInterleaved3, 2> test; +#else + const ForPartialVectors<TestLoadStoreInterleaved3> test; +#endif + ForAllTypes(test); +} + +struct TestLoadStoreInterleaved4 { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + + RandomState rng; + + // Data to be interleaved + auto bytes = AllocateAligned<T>(4 * N); + + for (size_t i = 0; i < 4 * N; ++i) { + bytes[i] = static_cast<T>(Random32(&rng) & 0xFF); + } + const auto in0 = Load(d, &bytes[0 * N]); + const auto in1 = Load(d, &bytes[1 * N]); + const auto in2 = Load(d, &bytes[2 * N]); + const auto in3 = Load(d, &bytes[3 * N]); + + // Interleave here, ensure vector results match scalar + auto expected = AllocateAligned<T>(5 * N); + auto actual_aligned = AllocateAligned<T>(5 * N + 1); + T* actual = actual_aligned.get() + 1; + + for (size_t rep = 0; rep < 100; ++rep) { + for (size_t i = 0; i < N; ++i) { + expected[4 * i + 0] = bytes[0 * N + i]; + expected[4 * i + 1] = bytes[1 * N + i]; + expected[4 * i + 2] = bytes[2 * N + i]; + expected[4 * i + 3] = bytes[3 * N + i]; + // Ensure we do not write more than 4*N bytes + expected[4 * N + i] = actual[4 * N + i] = 0; + } + StoreInterleaved4(in0, in1, in2, in3, d, actual); + size_t pos = 0; + if (!BytesEqual(expected.get(), actual, 5 * N * sizeof(T), &pos)) { + Print(d, "in0", in0, pos / 4); + Print(d, "in1", in1, pos / 4); + Print(d, "in2", in2, pos / 4); + Print(d, "in3", in3, pos / 4); + const size_t i = pos; + fprintf(stderr, "interleaved i=%d %f %f %f %f %f %f %f %f\n", + static_cast<int>(i), static_cast<double>(actual[i]), + static_cast<double>(actual[i + 1]), + 
static_cast<double>(actual[i + 2]), + static_cast<double>(actual[i + 3]), + static_cast<double>(actual[i + 4]), + static_cast<double>(actual[i + 5]), + static_cast<double>(actual[i + 6]), + static_cast<double>(actual[i + 7])); + HWY_ASSERT(false); + } + + Vec<D> out0, out1, out2, out3; + LoadInterleaved4(d, actual, out0, out1, out2, out3); + HWY_ASSERT_VEC_EQ(d, in0, out0); + HWY_ASSERT_VEC_EQ(d, in1, out1); + HWY_ASSERT_VEC_EQ(d, in2, out2); + HWY_ASSERT_VEC_EQ(d, in3, out3); + } + } +}; + +HWY_NOINLINE void TestAllLoadStoreInterleaved4() { +#if HWY_TARGET == HWY_RVV + // Segments are limited to 8 registers, so we can only go up to LMUL=2. + const ForExtendableVectors<TestLoadStoreInterleaved4, 2> test; +#else + const ForPartialVectors<TestLoadStoreInterleaved4> test; +#endif + ForAllTypes(test); +} + +#endif // !HWY_BROKEN_LOAD34 + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyInterleavedTest); +HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved2); +#if !HWY_BROKEN_LOAD34 +HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved3); +HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved4); +#endif +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/list_targets.cc b/third_party/highway/hwy/tests/list_targets.cc new file mode 100644 index 0000000000..d09ee4fe86 --- /dev/null +++ b/third_party/highway/hwy/tests/list_targets.cc @@ -0,0 +1,71 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Simple tool to print the list of targets that were compiled in when building +// this tool. + +#include <stdio.h> + +#include "hwy/highway.h" + +void PrintTargets(const char* msg, int64_t targets) { + fprintf(stderr, "%s", msg); + // For each bit: + for (int64_t x = targets; x != 0; x = x & (x - 1)) { + // Extract value of least-significant bit. + fprintf(stderr, " %s", hwy::TargetName(x & (~x + 1))); + } + fprintf(stderr, "\n"); +} + +int main() { +#ifdef HWY_COMPILE_ONLY_EMU128 + const int only_emu128 = 1; +#else + const int only_emu128 = 0; +#endif +#ifdef HWY_COMPILE_ONLY_SCALAR + const int only_scalar = 1; +#else + const int only_scalar = 0; +#endif +#ifdef HWY_COMPILE_ONLY_STATIC + const int only_static = 1; +#else + const int only_static = 0; +#endif +#ifdef HWY_COMPILE_ALL_ATTAINABLE + const int all_attain = 1; +#else + const int all_attain = 0; +#endif +#ifdef HWY_IS_TEST + const int is_test = 1; +#else + const int is_test = 0; +#endif + + fprintf(stderr, + "Config: emu128:%d scalar:%d static:%d all_attain:%d is_test:%d\n", + only_emu128, only_scalar, only_static, all_attain, is_test); + PrintTargets("Compiled HWY_TARGETS: ", HWY_TARGETS); + PrintTargets("HWY_ATTAINABLE_TARGETS:", HWY_ATTAINABLE_TARGETS); + PrintTargets("HWY_BASELINE_TARGETS: ", HWY_BASELINE_TARGETS); + PrintTargets("HWY_STATIC_TARGET: ", HWY_STATIC_TARGET); + PrintTargets("HWY_BROKEN_TARGETS: ", HWY_BROKEN_TARGETS); + PrintTargets("HWY_DISABLED_TARGETS: ", HWY_DISABLED_TARGETS); + PrintTargets("Current CPU supports: ", hwy::SupportedTargets()); + return 0; +} diff 
--git a/third_party/highway/hwy/tests/logical_test.cc b/third_party/highway/hwy/tests/logical_test.cc new file mode 100644 index 0000000000..b646f5ff4b --- /dev/null +++ b/third_party/highway/hwy/tests/logical_test.cc @@ -0,0 +1,246 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stddef.h> +#include <stdint.h> +#include <string.h> // memcmp + +#include "hwy/aligned_allocator.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/logical_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestNot { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto ones = VecFromMask(d, Eq(v0, v0)); + const auto v1 = Set(d, 1); + const auto vnot1 = Set(d, T(~T(1))); + + HWY_ASSERT_VEC_EQ(d, v0, Not(ones)); + HWY_ASSERT_VEC_EQ(d, ones, Not(v0)); + HWY_ASSERT_VEC_EQ(d, v1, Not(vnot1)); + HWY_ASSERT_VEC_EQ(d, vnot1, Not(v1)); + } +}; + +HWY_NOINLINE void TestAllNot() { + ForIntegerTypes(ForPartialVectors<TestNot>()); +} + +struct TestLogical { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto vi = Iota(d, 0); + + auto v = vi; + v = And(v, vi); + HWY_ASSERT_VEC_EQ(d, vi, v); + v = And(v, 
v0); + HWY_ASSERT_VEC_EQ(d, v0, v); + + v = Or(v, vi); + HWY_ASSERT_VEC_EQ(d, vi, v); + v = Or(v, v0); + HWY_ASSERT_VEC_EQ(d, vi, v); + + v = Xor(v, vi); + HWY_ASSERT_VEC_EQ(d, v0, v); + v = Xor(v, v0); + HWY_ASSERT_VEC_EQ(d, v0, v); + + HWY_ASSERT_VEC_EQ(d, v0, And(v0, vi)); + HWY_ASSERT_VEC_EQ(d, v0, And(vi, v0)); + HWY_ASSERT_VEC_EQ(d, vi, And(vi, vi)); + + HWY_ASSERT_VEC_EQ(d, vi, Or(v0, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Or(vi, v0)); + HWY_ASSERT_VEC_EQ(d, vi, Or(vi, vi)); + + HWY_ASSERT_VEC_EQ(d, vi, Xor(v0, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Xor(vi, v0)); + HWY_ASSERT_VEC_EQ(d, v0, Xor(vi, vi)); + + HWY_ASSERT_VEC_EQ(d, vi, AndNot(v0, vi)); + HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0)); + HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi)); + + HWY_ASSERT_VEC_EQ(d, v0, Or3(v0, v0, v0)); + HWY_ASSERT_VEC_EQ(d, vi, Or3(v0, vi, v0)); + HWY_ASSERT_VEC_EQ(d, vi, Or3(v0, v0, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Or3(v0, vi, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, v0, v0)); + HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, vi, v0)); + HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, v0, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, vi, vi)); + + HWY_ASSERT_VEC_EQ(d, v0, Xor3(v0, v0, v0)); + HWY_ASSERT_VEC_EQ(d, vi, Xor3(v0, vi, v0)); + HWY_ASSERT_VEC_EQ(d, vi, Xor3(v0, v0, vi)); + HWY_ASSERT_VEC_EQ(d, v0, Xor3(v0, vi, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Xor3(vi, v0, v0)); + HWY_ASSERT_VEC_EQ(d, v0, Xor3(vi, vi, v0)); + HWY_ASSERT_VEC_EQ(d, v0, Xor3(vi, v0, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Xor3(vi, vi, vi)); + + HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, v0)); + HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, vi, v0)); + HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, vi)); + HWY_ASSERT_VEC_EQ(d, vi, OrAnd(v0, vi, vi)); + HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, v0)); + HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, v0)); + HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, vi)); + HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, vi)); + } +}; + +HWY_NOINLINE void TestAllLogical() { + ForAllTypes(ForPartialVectors<TestLogical>()); +} + +struct TestCopySign { + template <class T, 
class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto vp = Iota(d, 1); + const auto vn = Iota(d, T(-1E5)); // assumes N < 10^5 + + // Zero remains zero regardless of sign + HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, v0)); + HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, vp)); + HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, vn)); + HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, v0)); + HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, vp)); + HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, vn)); + + // Positive input, positive sign => unchanged + HWY_ASSERT_VEC_EQ(d, vp, CopySign(vp, vp)); + HWY_ASSERT_VEC_EQ(d, vp, CopySignToAbs(vp, vp)); + + // Positive input, negative sign => negated + HWY_ASSERT_VEC_EQ(d, Neg(vp), CopySign(vp, vn)); + HWY_ASSERT_VEC_EQ(d, Neg(vp), CopySignToAbs(vp, vn)); + + // Negative input, negative sign => unchanged + HWY_ASSERT_VEC_EQ(d, vn, CopySign(vn, vn)); + + // Negative input, positive sign => negated + HWY_ASSERT_VEC_EQ(d, Neg(vn), CopySign(vn, vp)); + } +}; + +HWY_NOINLINE void TestAllCopySign() { + ForFloatTypes(ForPartialVectors<TestCopySign>()); +} + +// Verifies BroadcastSignBit on signed lanes: non-negative inputs must yield +// all-zero lanes and negative inputs must yield all-one lanes (-1). +struct TestBroadcastSignBit { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto s0 = Zero(d); + const auto s1 = Set(d, -1); // all bits set + // And-ing with LimitsMax<T>() keeps every lane of vpos non-negative. + const auto vpos = And(Iota(d, 0), Set(d, LimitsMax<T>())); + // -1 - vpos lies in [LimitsMin, -1], so every lane of vneg is negative. + const auto vneg = Sub(s1, vpos); + + HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(vpos)); + HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(Set(d, LimitsMax<T>()))); + + HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(vneg)); + HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(Set(d, LimitsMin<T>()))); + HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(Set(d, LimitsMin<T>() / 2))); + } +}; + +HWY_NOINLINE void TestAllBroadcastSignBit() { + ForSignedTypes(ForPartialVectors<TestBroadcastSignBit>()); +} + +struct TestTestBit { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t kNumBits = sizeof(T) * 8; + for (size_t i 
= 0; i < kNumBits; ++i) { + const auto bit1 = Set(d, T(1ull << i)); + const auto bit2 = Set(d, T(1ull << ((i + 1) % kNumBits))); + const auto bit3 = Set(d, T(1ull << ((i + 2) % kNumBits))); + const auto bits12 = Or(bit1, bit2); + const auto bits23 = Or(bit2, bit3); + HWY_ASSERT(AllTrue(d, TestBit(bit1, bit1))); + HWY_ASSERT(AllTrue(d, TestBit(bits12, bit1))); + HWY_ASSERT(AllTrue(d, TestBit(bits12, bit2))); + + HWY_ASSERT(AllFalse(d, TestBit(bits12, bit3))); + HWY_ASSERT(AllFalse(d, TestBit(bits23, bit1))); + HWY_ASSERT(AllFalse(d, TestBit(bit1, bit2))); + HWY_ASSERT(AllFalse(d, TestBit(bit2, bit1))); + HWY_ASSERT(AllFalse(d, TestBit(bit1, bit3))); + HWY_ASSERT(AllFalse(d, TestBit(bit3, bit1))); + HWY_ASSERT(AllFalse(d, TestBit(bit2, bit3))); + HWY_ASSERT(AllFalse(d, TestBit(bit3, bit2))); + } + } +}; + +HWY_NOINLINE void TestAllTestBit() { + ForIntegerTypes(ForPartialVectors<TestTestBit>()); +} + +// Compares the vector PopulationCount against the scalar PopCount on random +// lane values, one full vector per repetition. +struct TestPopulationCount { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + const size_t N = Lanes(d); + auto data = AllocateAligned<T>(N); + auto popcnt = AllocateAligned<T>(N); + // `rep` counts repetitions; it is deliberately distinct from the lane + // index `i` below so the inner loop no longer shadows the outer counter. + for (size_t rep = 0; rep < AdjustedReps(1 << 18) / N; ++rep) { + for (size_t i = 0; i < N; ++i) { + data[i] = static_cast<T>(rng()); + popcnt[i] = static_cast<T>(PopCount(data[i])); + } + HWY_ASSERT_VEC_EQ(d, popcnt.get(), PopulationCount(Load(d, data.get()))); + } + } +}; + +HWY_NOINLINE void TestAllPopulationCount() { + ForUnsignedTypes(ForPartialVectors<TestPopulationCount>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyLogicalTest); +HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllNot); +HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogical); +HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign); +HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit); +HWY_EXPORT_AND_TEST_P(HwyLogicalTest, 
TestAllTestBit); +HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllPopulationCount); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/mask_mem_test.cc b/third_party/highway/hwy/tests/mask_mem_test.cc new file mode 100644 index 0000000000..c44119dcd7 --- /dev/null +++ b/third_party/highway/hwy/tests/mask_mem_test.cc @@ -0,0 +1,197 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS // before inttypes.h +#endif +#include <inttypes.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> // memcmp + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/mask_mem_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestMaskedLoad { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + using TI = MakeSigned<T>; // For mask > 0 comparison + const Rebind<TI, D> di; + const size_t N = Lanes(d); + auto bool_lanes = AllocateAligned<TI>(N); + + auto lanes = AllocateAligned<T>(N); + Store(Iota(d, T{1}), d, lanes.get()); + + // Each lane should have a chance of having mask=true. 
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); + } + + const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + const auto expected = IfThenElseZero(mask, Load(d, lanes.get())); + const auto actual = MaskedLoad(mask, d, lanes.get()); + HWY_ASSERT_VEC_EQ(d, expected, actual); + } + } +}; + +HWY_NOINLINE void TestAllMaskedLoad() { + ForAllTypes(ForPartialVectors<TestMaskedLoad>()); +} + +struct TestBlendedStore { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + using TI = MakeSigned<T>; // For mask > 0 comparison + const Rebind<TI, D> di; + const size_t N = Lanes(d); + auto bool_lanes = AllocateAligned<TI>(N); + + const Vec<D> v = Iota(d, T{1}); + auto actual = AllocateAligned<T>(N); + auto expected = AllocateAligned<T>(N); + + // Each lane should have a chance of having mask=true. + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); + // Re-initialize to something distinct from v[i]. + actual[i] = static_cast<T>(127 - (i & 127)); + expected[i] = bool_lanes[i] ? 
static_cast<T>(i + 1) : actual[i]; + } + + const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + BlendedStore(v, mask, d, actual.get()); + HWY_ASSERT_VEC_EQ(d, expected.get(), Load(d, actual.get())); + } + } +}; + +HWY_NOINLINE void TestAllBlendedStore() { + ForAllTypes(ForPartialVectors<TestBlendedStore>()); +} + +class TestStoreMaskBits { + public: + template <class T, class D> + HWY_NOINLINE void operator()(T /*t*/, D /*d*/) { + RandomState rng; + using TI = MakeSigned<T>; // For mask > 0 comparison + const Rebind<TI, D> di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned<TI>(N); + + const ScalableTag<uint8_t, -3> d_bits; + const size_t expected_num_bytes = (N + 7) / 8; + auto expected = AllocateAligned<uint8_t>(expected_num_bytes); + auto actual = AllocateAligned<uint8_t>(HWY_MAX(8, expected_num_bytes)); + + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + // Generate random mask pattern. + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = static_cast<TI>((rng() & 1024) ? 1 : 0); + } + const auto bools = Load(di, bool_lanes.get()); + const auto mask = Gt(bools, Zero(di)); + + // Requires at least 8 bytes, ensured above. + const size_t bytes_written = StoreMaskBits(di, mask, actual.get()); + if (bytes_written != expected_num_bytes) { + fprintf(stderr, "%s expected %" PRIu64 " bytes, actual %" PRIu64 "\n", + TypeName(T(), N).c_str(), + static_cast<uint64_t>(expected_num_bytes), + static_cast<uint64_t>(bytes_written)); + + HWY_ASSERT(false); + } + + // Requires at least 8 bytes, ensured above. + const auto mask2 = LoadMaskBits(di, actual.get()); + HWY_ASSERT_MASK_EQ(di, mask, mask2); + + memset(expected.get(), 0, expected_num_bytes); + for (size_t i = 0; i < N; ++i) { + expected[i / 8] = + static_cast<uint8_t>(expected[i / 8] | (bool_lanes[i] << (i % 8))); + } + + size_t i = 0; + // Stored bits must match original mask + for (; i < N; ++i) { + const TI is_set = (actual[i / 8] & (1 << (i % 8))) ? 
1 : 0; + if (is_set != bool_lanes[i]) { + fprintf(stderr, "%s lane %" PRIu64 ": expected %d, actual %d\n", + TypeName(T(), N).c_str(), static_cast<uint64_t>(i), + static_cast<int>(bool_lanes[i]), static_cast<int>(is_set)); + Print(di, "bools", bools, 0, N); + Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0, + expected_num_bytes); + Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0, + expected_num_bytes); + + HWY_ASSERT(false); + } + } + // Any partial bits in the last byte must be zero + for (; i < 8 * bytes_written; ++i) { + const int bit = (actual[i / 8] & (1 << (i % 8))); + if (bit != 0) { + fprintf(stderr, "%s: bit #%" PRIu64 " should be zero\n", + TypeName(T(), N).c_str(), static_cast<uint64_t>(i)); + Print(di, "bools", bools, 0, N); + Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0, + expected_num_bytes); + Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0, + expected_num_bytes); + + HWY_ASSERT(false); + } + } + } + } +}; + +HWY_NOINLINE void TestAllStoreMaskBits() { + ForAllTypes(ForPartialVectors<TestStoreMaskBits>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyMaskTest); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskedLoad); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllBlendedStore); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllStoreMaskBits); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/mask_test.cc b/third_party/highway/hwy/tests/mask_test.cc new file mode 100644 index 0000000000..cf0d2d4ee8 --- /dev/null +++ b/third_party/highway/hwy/tests/mask_test.cc @@ -0,0 +1,295 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stddef.h> +#include <stdint.h> +#include <string.h> // memcmp + +#include <algorithm> // std::fill + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/mask_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// All types. +struct TestFromVec { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto lanes = AllocateAligned<T>(N); + + memset(lanes.get(), 0, N * sizeof(T)); + const auto actual_false = MaskFromVec(Load(d, lanes.get())); + HWY_ASSERT_MASK_EQ(d, MaskFalse(d), actual_false); + + memset(lanes.get(), 0xFF, N * sizeof(T)); + const auto actual_true = MaskFromVec(Load(d, lanes.get())); + HWY_ASSERT_MASK_EQ(d, MaskTrue(d), actual_true); + } +}; + +HWY_NOINLINE void TestAllFromVec() { + ForAllTypes(ForPartialVectors<TestFromVec>()); +} + +struct TestFirstN { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto bool_lanes = AllocateAligned<T>(N); + + using TN = SignedFromSize<HWY_MIN(sizeof(size_t), sizeof(T))>; + const size_t max_len = static_cast<size_t>(LimitsMax<TN>()); + + const size_t max_lanes = HWY_MIN(2 * N, AdjustedReps(512)); + for (size_t len = 0; len <= HWY_MIN(max_lanes, max_len); ++len) { + // Loop instead of Iota+Lt to avoid wraparound for 8-bit T. + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = (i < len) ? 
T{1} : 0; + } + const auto expected = Eq(Load(d, bool_lanes.get()), Set(d, T{1})); + HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, len)); + } + + // Also ensure huge values yield all-true (unless the vector is actually + // larger than max_len). + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = (i < max_len) ? T{1} : 0; + } + const auto expected = Eq(Load(d, bool_lanes.get()), Set(d, T{1})); + HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, max_len)); + } +}; + +HWY_NOINLINE void TestAllFirstN() { + ForAllTypes(ForPartialVectors<TestFirstN>()); +} + +struct TestMaskVec { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + using TI = MakeSigned<T>; // For mask > 0 comparison + const Rebind<TI, D> di; + const size_t N = Lanes(d); + auto bool_lanes = AllocateAligned<TI>(N); + + // Each lane should have a chance of having mask=true. + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); + } + + const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask))); + } + } +}; + +HWY_NOINLINE void TestAllMaskVec() { + const ForPartialVectors<TestMaskVec> test; + + test(uint16_t()); + test(int16_t()); + // TODO(janwas): float16_t - cannot compare yet + + ForUIF3264(test); +} + +struct TestAllTrueFalse { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto zero = Zero(d); + auto v = zero; + + const size_t N = Lanes(d); + auto lanes = AllocateAligned<T>(N); + std::fill(lanes.get(), lanes.get() + N, T(0)); + + HWY_ASSERT(AllTrue(d, Eq(v, zero))); + HWY_ASSERT(!AllFalse(d, Eq(v, zero))); + + // Single lane implies AllFalse = !AllTrue. Otherwise, there are multiple + // lanes and one is nonzero. 
+ const bool expected_all_false = (N != 1); + + // Set each lane to nonzero and back to zero + for (size_t i = 0; i < N; ++i) { + lanes[i] = T(1); + v = Load(d, lanes.get()); + + HWY_ASSERT(!AllTrue(d, Eq(v, zero))); + + HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero))); + + lanes[i] = T(-1); + v = Load(d, lanes.get()); + HWY_ASSERT(!AllTrue(d, Eq(v, zero))); + HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero))); + + // Reset to all zero + lanes[i] = T(0); + v = Load(d, lanes.get()); + HWY_ASSERT(AllTrue(d, Eq(v, zero))); + HWY_ASSERT(!AllFalse(d, Eq(v, zero))); + } + } +}; + +HWY_NOINLINE void TestAllAllTrueFalse() { + ForAllTypes(ForPartialVectors<TestAllTrueFalse>()); +} + +struct TestCountTrue { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned<T>; // For mask > 0 comparison + const Rebind<TI, D> di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned<TI>(N); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = HWY_MIN(N, size_t(10)); + + for (size_t code = 0; code < (1ull << max_lanes); ++code) { + // Number of zeros written = number of mask lanes that are true. + size_t expected = 0; + for (size_t i = 0; i < max_lanes; ++i) { + const bool is_true = (code & (1ull << i)) != 0; + bool_lanes[i] = is_true ? 
TI(1) : TI(0); + expected += is_true; + } + + const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + const size_t actual = CountTrue(d, mask); + HWY_ASSERT_EQ(expected, actual); + } + } +}; + +HWY_NOINLINE void TestAllCountTrue() { + ForAllTypes(ForPartialVectors<TestCountTrue>()); +} + +struct TestFindFirstTrue { // Also FindKnownFirstTrue + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned<T>; // For mask > 0 comparison + const Rebind<TI, D> di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned<TI>(N); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(9))); + + HWY_ASSERT_EQ(intptr_t(-1), FindFirstTrue(d, MaskFalse(d))); + HWY_ASSERT_EQ(intptr_t(0), FindFirstTrue(d, MaskTrue(d))); + HWY_ASSERT_EQ(size_t(0), FindKnownFirstTrue(d, MaskTrue(d))); + + for (size_t code = 1; code < (1ull << max_lanes); ++code) { + for (size_t i = 0; i < max_lanes; ++i) { + bool_lanes[i] = (code & (1ull << i)) ? 
TI(1) : TI(0); + } + + const size_t expected = + Num0BitsBelowLS1Bit_Nonzero32(static_cast<uint32_t>(code)); + const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + HWY_ASSERT_EQ(static_cast<intptr_t>(expected), FindFirstTrue(d, mask)); + HWY_ASSERT_EQ(expected, FindKnownFirstTrue(d, mask)); + } + } +}; + +HWY_NOINLINE void TestAllFindFirstTrue() { + ForAllTypes(ForPartialVectors<TestFindFirstTrue>()); +} + +struct TestLogicalMask { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto m0 = MaskFalse(d); + const auto m_all = MaskTrue(d); + + using TI = MakeSigned<T>; // For mask > 0 comparison + const Rebind<TI, D> di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned<TI>(N); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + + HWY_ASSERT_MASK_EQ(d, m0, Not(m_all)); + HWY_ASSERT_MASK_EQ(d, m_all, Not(m0)); + + Print(d, ".", VecFromMask(d, ExclusiveNeither(m0, m0))); + HWY_ASSERT_MASK_EQ(d, m_all, ExclusiveNeither(m0, m0)); + HWY_ASSERT_MASK_EQ(d, m0, ExclusiveNeither(m_all, m0)); + HWY_ASSERT_MASK_EQ(d, m0, ExclusiveNeither(m0, m_all)); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); + for (size_t code = 0; code < (1ull << max_lanes); ++code) { + for (size_t i = 0; i < max_lanes; ++i) { + bool_lanes[i] = (code & (1ull << i)) ? 
TI(1) : TI(0); + } + + const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + + HWY_ASSERT_MASK_EQ(d, m0, Xor(m, m)); + HWY_ASSERT_MASK_EQ(d, m0, AndNot(m, m)); + HWY_ASSERT_MASK_EQ(d, m0, AndNot(m_all, m)); + + HWY_ASSERT_MASK_EQ(d, m, Or(m, m)); + HWY_ASSERT_MASK_EQ(d, m, Or(m0, m)); + HWY_ASSERT_MASK_EQ(d, m, Or(m, m0)); + HWY_ASSERT_MASK_EQ(d, m, Xor(m0, m)); + HWY_ASSERT_MASK_EQ(d, m, Xor(m, m0)); + HWY_ASSERT_MASK_EQ(d, m, And(m, m)); + HWY_ASSERT_MASK_EQ(d, m, And(m_all, m)); + HWY_ASSERT_MASK_EQ(d, m, And(m, m_all)); + HWY_ASSERT_MASK_EQ(d, m, AndNot(m0, m)); + } + } +}; + +HWY_NOINLINE void TestAllLogicalMask() { + ForAllTypes(ForPartialVectors<TestLogicalMask>()); +} +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyMaskTest); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFromVec); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFirstN); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskVec); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllAllTrueFalse); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllCountTrue); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindFirstTrue); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllLogicalMask); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/memory_test.cc b/third_party/highway/hwy/tests/memory_test.cc new file mode 100644 index 0000000000..d17addf544 --- /dev/null +++ b/third_party/highway/hwy/tests/memory_test.cc @@ -0,0 +1,343 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Ensure incompatibilities with Windows macros (e.g. #define StoreFence) are +// detected. Must come before Highway headers. +#include "hwy/base.h" +#if defined(_WIN32) || defined(_WIN64) +#include <windows.h> +#endif + +#include <stddef.h> +#include <stdint.h> + +#include <algorithm> // std::fill + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/memory_test.cc" +#include "hwy/cache_control.h" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestLoadStore { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const auto hi = Iota(d, static_cast<T>(1 + N)); + const auto lo = Iota(d, 1); + auto lanes = AllocateAligned<T>(2 * N); + Store(hi, d, &lanes[N]); + Store(lo, d, &lanes[0]); + + // Aligned load + const auto lo2 = Load(d, &lanes[0]); + HWY_ASSERT_VEC_EQ(d, lo2, lo); + + // Aligned store + auto lanes2 = AllocateAligned<T>(2 * N); + Store(lo2, d, &lanes2[0]); + Store(hi, d, &lanes2[N]); + for (size_t i = 0; i < 2 * N; ++i) { + HWY_ASSERT_EQ(lanes[i], lanes2[i]); + } + + // Unaligned load + const auto vu = LoadU(d, &lanes[1]); + auto lanes3 = AllocateAligned<T>(N); + Store(vu, d, lanes3.get()); + for (size_t i = 0; i < N; ++i) { + HWY_ASSERT_EQ(T(i + 2), lanes3[i]); + } + + // Unaligned store + StoreU(lo2, d, &lanes2[N / 2]); + size_t i = 0; + for (; i < N / 2; ++i) { + HWY_ASSERT_EQ(lanes[i], lanes2[i]); + } + for (; i < 3 * 
N / 2; ++i) { + HWY_ASSERT_EQ(T(i - N / 2 + 1), lanes2[i]); + } + // Subsequent values remain unchanged. + for (; i < 2 * N; ++i) { + HWY_ASSERT_EQ(T(i + 1), lanes2[i]); + } + } +}; + +HWY_NOINLINE void TestAllLoadStore() { + ForAllTypes(ForPartialVectors<TestLoadStore>()); +} + +struct TestSafeCopyN { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const auto v = Iota(d, 1); + auto from = AllocateAligned<T>(N + 2); + auto to = AllocateAligned<T>(N + 2); + Store(v, d, from.get()); + + // 0: nothing changes + to[0] = T(); + SafeCopyN(0, d, from.get(), to.get()); + HWY_ASSERT_EQ(T(), to[0]); + + // 1: only first changes + to[1] = T(); + SafeCopyN(1, d, from.get(), to.get()); + HWY_ASSERT_EQ(static_cast<T>(1), to[0]); + HWY_ASSERT_EQ(T(), to[1]); + + // N-1: last does not change + to[N - 1] = T(); + SafeCopyN(N - 1, d, from.get(), to.get()); + HWY_ASSERT_EQ(T(), to[N - 1]); + // Also check preceding lanes + to[N - 1] = static_cast<T>(N); + HWY_ASSERT_VEC_EQ(d, to.get(), v); + + // N: all change + to[N] = T(); + SafeCopyN(N, d, from.get(), to.get()); + HWY_ASSERT_VEC_EQ(d, to.get(), v); + HWY_ASSERT_EQ(T(), to[N]); + + // N+1: subsequent lane does not change if using masked store + to[N + 1] = T(); + SafeCopyN(N + 1, d, from.get(), to.get()); + HWY_ASSERT_VEC_EQ(d, to.get(), v); +#if !HWY_MEM_OPS_MIGHT_FAULT + HWY_ASSERT_EQ(T(), to[N + 1]); +#endif + } +}; + +HWY_NOINLINE void TestAllSafeCopyN() { + ForAllTypes(ForPartialVectors<TestSafeCopyN>()); +} + +struct TestLoadDup128 { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + // Scalar does not define LoadDup128. 
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE + constexpr size_t N128 = 16 / sizeof(T); + alignas(16) T lanes[N128]; + for (size_t i = 0; i < N128; ++i) { + lanes[i] = static_cast<T>(1 + i); + } + + const size_t N = Lanes(d); + auto expected = AllocateAligned<T>(N); + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast<T>(i % N128 + 1); + } + + HWY_ASSERT_VEC_EQ(d, expected.get(), LoadDup128(d, lanes)); +#else + (void)d; +#endif + } +}; + +HWY_NOINLINE void TestAllLoadDup128() { + ForAllTypes(ForGEVectors<128, TestLoadDup128>()); +} + +struct TestStream { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v = Iota(d, T(1)); + const size_t affected_bytes = + (Lanes(d) * sizeof(T) + HWY_STREAM_MULTIPLE - 1) & + ~size_t(HWY_STREAM_MULTIPLE - 1); + const size_t affected_lanes = affected_bytes / sizeof(T); + auto out = AllocateAligned<T>(2 * affected_lanes); + std::fill(out.get(), out.get() + 2 * affected_lanes, T(0)); + + Stream(v, d, out.get()); + FlushStream(); + const auto actual = Load(d, out.get()); + HWY_ASSERT_VEC_EQ(d, v, actual); + // Ensure Stream didn't modify more memory than expected + for (size_t i = affected_lanes; i < 2 * affected_lanes; ++i) { + HWY_ASSERT_EQ(T(0), out[i]); + } + } +}; + +HWY_NOINLINE void TestAllStream() { + const ForPartialVectors<TestStream> test; + // No u8,u16. + test(uint32_t()); + test(uint64_t()); + // No i8,i16. + test(int32_t()); + test(int64_t()); + ForFloatTypes(test); +} + +// Assumes little-endian byte order! 
+struct TestScatter { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using Offset = MakeSigned<T>; + + const size_t N = Lanes(d); + const size_t range = 4 * N; // number of items to scatter + const size_t max_bytes = range * sizeof(T); // upper bound on offset + + RandomState rng; + + // Data to be scattered + auto bytes = AllocateAligned<uint8_t>(max_bytes); + for (size_t i = 0; i < max_bytes; ++i) { + bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF); + } + const auto data = Load(d, reinterpret_cast<const T*>(bytes.get())); + + // Scatter into these regions, ensure vector results match scalar + auto expected = AllocateAligned<T>(range); + auto actual = AllocateAligned<T>(range); + + const Rebind<Offset, D> d_offsets; + auto offsets = AllocateAligned<Offset>(N); // or indices + + for (size_t rep = 0; rep < 100; ++rep) { + // Byte offsets + std::fill(expected.get(), expected.get() + range, T(0)); + std::fill(actual.get(), actual.get() + range, T(0)); + for (size_t i = 0; i < N; ++i) { + // Must be aligned + offsets[i] = static_cast<Offset>((Random32(&rng) % range) * sizeof(T)); + CopyBytes<sizeof(T)>( + bytes.get() + i * sizeof(T), + reinterpret_cast<uint8_t*>(expected.get()) + offsets[i]); + } + const auto voffsets = Load(d_offsets, offsets.get()); + ScatterOffset(data, d, actual.get(), voffsets); + if (!BytesEqual(expected.get(), actual.get(), max_bytes)) { + Print(d, "Data", data); + Print(d_offsets, "Offsets", voffsets); + HWY_ASSERT(false); + } + + // Indices + std::fill(expected.get(), expected.get() + range, T(0)); + std::fill(actual.get(), actual.get() + range, T(0)); + for (size_t i = 0; i < N; ++i) { + offsets[i] = static_cast<Offset>(Random32(&rng) % range); + CopyBytes<sizeof(T)>(bytes.get() + i * sizeof(T), + &expected[size_t(offsets[i])]); + } + const auto vindices = Load(d_offsets, offsets.get()); + ScatterIndex(data, d, actual.get(), vindices); + if (!BytesEqual(expected.get(), actual.get(), max_bytes)) { + 
Print(d, "Data", data); + Print(d_offsets, "Indices", vindices); + HWY_ASSERT(false); + } + } + } +}; + +HWY_NOINLINE void TestAllScatter() { + ForUIF3264(ForPartialVectors<TestScatter>()); +} + +struct TestGather { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using Offset = MakeSigned<T>; + + const size_t N = Lanes(d); + const size_t range = 4 * N; // number of items to gather + const size_t max_bytes = range * sizeof(T); // upper bound on offset + + RandomState rng; + + // Data to be gathered from + auto bytes = AllocateAligned<uint8_t>(max_bytes); + for (size_t i = 0; i < max_bytes; ++i) { + bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF); + } + + auto expected = AllocateAligned<T>(N); + auto offsets = AllocateAligned<Offset>(N); + auto indices = AllocateAligned<Offset>(N); + + for (size_t rep = 0; rep < 100; ++rep) { + // Offsets + for (size_t i = 0; i < N; ++i) { + // Must be aligned + offsets[i] = static_cast<Offset>((Random32(&rng) % range) * sizeof(T)); + CopyBytes<sizeof(T)>(bytes.get() + offsets[i], &expected[i]); + } + + const Rebind<Offset, D> d_offset; + const T* base = reinterpret_cast<const T*>(bytes.get()); + auto actual = GatherOffset(d, base, Load(d_offset, offsets.get())); + HWY_ASSERT_VEC_EQ(d, expected.get(), actual); + + // Indices + for (size_t i = 0; i < N; ++i) { + indices[i] = + static_cast<Offset>(Random32(&rng) % (max_bytes / sizeof(T))); + CopyBytes<sizeof(T)>(base + indices[i], &expected[i]); + } + actual = GatherIndex(d, base, Load(d_offset, indices.get())); + HWY_ASSERT_VEC_EQ(d, expected.get(), actual); + } + } +}; + +HWY_NOINLINE void TestAllGather() { + ForUIF3264(ForPartialVectors<TestGather>()); +} + +HWY_NOINLINE void TestAllCache() { + LoadFence(); + FlushStream(); + int test = 0; + Prefetch(&test); + FlushCacheline(&test); + Pause(); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if 
HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyMemoryTest); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllSafeCopyN); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/mul_test.cc b/third_party/highway/hwy/tests/mul_test.cc new file mode 100644 index 0000000000..5622983cee --- /dev/null +++ b/third_party/highway/hwy/tests/mul_test.cc @@ -0,0 +1,526 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <stddef.h> +#include <stdint.h> + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/mul_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template <size_t kBits> +constexpr uint64_t FirstBits() { + return (1ull << kBits) - 1; +} +template <> +constexpr uint64_t FirstBits<64>() { + return ~uint64_t{0}; +} + +struct TestUnsignedMul { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto v1 = Set(d, T(1)); + const auto vi = Iota(d, 1); + const auto vj = Iota(d, 3); + const size_t N = Lanes(d); + auto expected = AllocateAligned<T>(N); + + HWY_ASSERT_VEC_EQ(d, v0, Mul(v0, v0)); + HWY_ASSERT_VEC_EQ(d, v1, Mul(v1, v1)); + HWY_ASSERT_VEC_EQ(d, vi, Mul(v1, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Mul(vi, v1)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast<T>((1 + i) * (1 + i)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast<T>((1 + i) * (3 + i)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vj)); + + const T max = LimitsMax<T>(); + const auto vmax = Set(d, max); + HWY_ASSERT_VEC_EQ(d, vmax, Mul(vmax, v1)); + HWY_ASSERT_VEC_EQ(d, vmax, Mul(v1, vmax)); + + constexpr uint64_t kMask = FirstBits<sizeof(T) * 8>(); + const T max2 = (static_cast<uint64_t>(max) * max) & kMask; + HWY_ASSERT_VEC_EQ(d, Set(d, max2), Mul(vmax, vmax)); + } +}; + +struct TestSignedMul { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto expected = AllocateAligned<T>(N); + + const auto v0 = Zero(d); + const auto v1 = Set(d, T(1)); + const auto vi = Iota(d, 1); + const auto vn = Iota(d, -T(N)); // no i8 supported, so no wraparound + HWY_ASSERT_VEC_EQ(d, v0, Mul(v0, v0)); + HWY_ASSERT_VEC_EQ(d, v1, Mul(v1, 
v1)); + HWY_ASSERT_VEC_EQ(d, vi, Mul(v1, vi)); + HWY_ASSERT_VEC_EQ(d, vi, Mul(vi, v1)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast<T>((1 + i) * (1 + i)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast<T>((-T(N) + T(i)) * T(1u + i)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vn, vi)); + HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vn)); + } +}; + +struct TestMulOverflow { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto vMax = Set(d, LimitsMax<T>()); + HWY_ASSERT_VEC_EQ(d, Mul(vMax, vMax), Mul(vMax, vMax)); + } +}; + +struct TestDivOverflow { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto vZero = Set(d, T(0)); + const auto v1 = Set(d, T(1)); + HWY_ASSERT_VEC_EQ(d, Div(v1, vZero), Div(v1, vZero)); + } +}; + +HWY_NOINLINE void TestAllMul() { + const ForPartialVectors<TestUnsignedMul> test_unsigned; + // No u8. + test_unsigned(uint16_t()); + test_unsigned(uint32_t()); + test_unsigned(uint64_t()); + + const ForPartialVectors<TestSignedMul> test_signed; + // No i8. 
+ test_signed(int16_t()); + test_signed(int32_t()); + test_signed(int64_t()); + + const ForPartialVectors<TestMulOverflow> test_mul_overflow; + test_mul_overflow(int16_t()); + test_mul_overflow(int32_t()); +#if HWY_HAVE_INTEGER64 + test_mul_overflow(int64_t()); +#endif + + const ForPartialVectors<TestDivOverflow> test_div_overflow; + test_div_overflow(float()); +#if HWY_HAVE_FLOAT64 + test_div_overflow(double()); +#endif +} + +struct TestMulHigh { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using Wide = MakeWide<T>; + const size_t N = Lanes(d); + auto in_lanes = AllocateAligned<T>(N); + auto expected_lanes = AllocateAligned<T>(N); + + const auto vi = Iota(d, 1); + // no i8 supported, so no wraparound + const auto vni = Iota(d, T(static_cast<T>(~N + 1))); + + const auto v0 = Zero(d); + HWY_ASSERT_VEC_EQ(d, v0, MulHigh(v0, v0)); + HWY_ASSERT_VEC_EQ(d, v0, MulHigh(v0, vi)); + HWY_ASSERT_VEC_EQ(d, v0, MulHigh(vi, v0)); + + // Large positive squared + for (size_t i = 0; i < N; ++i) { + in_lanes[i] = T(LimitsMax<T>() >> i); + expected_lanes[i] = T((Wide(in_lanes[i]) * in_lanes[i]) >> 16); + } + auto v = Load(d, in_lanes.get()); + HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, v)); + + // Large positive * small positive + for (size_t i = 0; i < N; ++i) { + expected_lanes[i] = T((Wide(in_lanes[i]) * T(1u + i)) >> 16); + } + HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vi)); + HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vi, v)); + + // Large positive * small negative + for (size_t i = 0; i < N; ++i) { + expected_lanes[i] = T((Wide(in_lanes[i]) * T(i - N)) >> 16); + } + HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vni)); + HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vni, v)); + } +}; + +HWY_NOINLINE void TestAllMulHigh() { + ForPartialVectors<TestMulHigh> test; + test(int16_t()); + test(uint16_t()); +} + +struct TestMulFixedPoint15 { + template <typename T, class D> + HWY_NOINLINE void 
operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + HWY_ASSERT_VEC_EQ(d, v0, MulFixedPoint15(v0, v0)); + HWY_ASSERT_VEC_EQ(d, v0, MulFixedPoint15(v0, v0)); + + const size_t N = Lanes(d); + auto in1 = AllocateAligned<T>(N); + auto in2 = AllocateAligned<T>(N); + auto expected = AllocateAligned<T>(N); + + // Random inputs in each lane + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(10000); ++rep) { + for (size_t i = 0; i < N; ++i) { + in1[i] = static_cast<T>(Random64(&rng) & 0xFFFF); + in2[i] = static_cast<T>(Random64(&rng) & 0xFFFF); + } + + for (size_t i = 0; i < N; ++i) { + // There are three ways to compute the results. x86 and ARM are defined + // using 32-bit multiplication results: + const int arm = (2 * in1[i] * in2[i] + 0x8000) >> 16; + const int x86 = (((in1[i] * in2[i]) >> 14) + 1) >> 1; + // On other platforms, split the result into upper and lower 16 bits. + const auto v1 = Set(d, in1[i]); + const auto v2 = Set(d, in2[i]); + const int hi = GetLane(MulHigh(v1, v2)); + const int lo = GetLane(Mul(v1, v2)) & 0xFFFF; + const int split = 2 * hi + ((lo + 0x4000) >> 15); + expected[i] = static_cast<T>(arm); + if (in1[i] != -32768 || in2[i] != -32768) { + HWY_ASSERT_EQ(arm, x86); + HWY_ASSERT_EQ(arm, split); + } + } + + const auto a = Load(d, in1.get()); + const auto b = Load(d, in2.get()); + HWY_ASSERT_VEC_EQ(d, expected.get(), MulFixedPoint15(a, b)); + } + } +}; + +HWY_NOINLINE void TestAllMulFixedPoint15() { + ForPartialVectors<TestMulFixedPoint15>()(int16_t()); +} + +struct TestMulEven { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using Wide = MakeWide<T>; + const Repartition<Wide, D> d2; + const auto v0 = Zero(d); + HWY_ASSERT_VEC_EQ(d2, Zero(d2), MulEven(v0, v0)); + + const size_t N = Lanes(d); + auto in_lanes = AllocateAligned<T>(N); + auto expected = AllocateAligned<Wide>(Lanes(d2)); + for (size_t i = 0; i < N; i += 2) { + in_lanes[i + 0] = LimitsMax<T>() >> i; + if (N != 1) { + in_lanes[i + 
1] = 1; // unused + } + expected[i / 2] = Wide(in_lanes[i + 0]) * in_lanes[i + 0]; + } + + const auto v = Load(d, in_lanes.get()); + HWY_ASSERT_VEC_EQ(d2, expected.get(), MulEven(v, v)); + } +}; + +struct TestMulEvenOdd64 { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { +#if HWY_TARGET != HWY_SCALAR + const auto v0 = Zero(d); + HWY_ASSERT_VEC_EQ(d, Zero(d), MulEven(v0, v0)); + HWY_ASSERT_VEC_EQ(d, Zero(d), MulOdd(v0, v0)); + + const size_t N = Lanes(d); + if (N == 1) return; + + auto in1 = AllocateAligned<T>(N); + auto in2 = AllocateAligned<T>(N); + auto expected_even = AllocateAligned<T>(N); + auto expected_odd = AllocateAligned<T>(N); + + // Random inputs in each lane + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { + for (size_t i = 0; i < N; ++i) { + in1[i] = Random64(&rng); + in2[i] = Random64(&rng); + } + + for (size_t i = 0; i < N; i += 2) { + expected_even[i] = Mul128(in1[i], in2[i], &expected_even[i + 1]); + expected_odd[i] = Mul128(in1[i + 1], in2[i + 1], &expected_odd[i + 1]); + } + + const auto a = Load(d, in1.get()); + const auto b = Load(d, in2.get()); + HWY_ASSERT_VEC_EQ(d, expected_even.get(), MulEven(a, b)); + HWY_ASSERT_VEC_EQ(d, expected_odd.get(), MulOdd(a, b)); + } +#else + (void)d; +#endif // HWY_TARGET != HWY_SCALAR + } +}; + +HWY_NOINLINE void TestAllMulEven() { + ForGEVectors<64, TestMulEven> test; + test(int32_t()); + test(uint32_t()); + + ForGEVectors<128, TestMulEvenOdd64>()(uint64_t()); +} + +#ifndef HWY_NATIVE_FMA +#error "Bug in set_macros-inl.h, did not set HWY_NATIVE_FMA" +#endif + +struct TestMulAdd { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto k0 = Zero(d); + const auto kNeg0 = Set(d, T(-0.0)); + const auto v1 = Iota(d, 1); + const auto v2 = Iota(d, 2); + const size_t N = Lanes(d); + auto expected = AllocateAligned<T>(N); + HWY_ASSERT_VEC_EQ(d, k0, MulAdd(k0, k0, k0)); + HWY_ASSERT_VEC_EQ(d, v2, MulAdd(k0, v1, 
v2)); + HWY_ASSERT_VEC_EQ(d, v2, MulAdd(v1, k0, v2)); + HWY_ASSERT_VEC_EQ(d, k0, NegMulAdd(k0, k0, k0)); + HWY_ASSERT_VEC_EQ(d, v2, NegMulAdd(k0, v1, v2)); + HWY_ASSERT_VEC_EQ(d, v2, NegMulAdd(v1, k0, v2)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast<T>((i + 1) * (i + 2)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v2, v1, k0)); + HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v1, v2, k0)); + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(Neg(v2), v1, k0)); + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(v1, Neg(v2), k0)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast<T>((i + 2) * (i + 2) + (i + 1)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v2, v2, v1)); + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(Neg(v2), v2, v1)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = + T(-T(i + 2u) * static_cast<T>(i + 2) + static_cast<T>(1 + i)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(v2, v2, v1)); + + HWY_ASSERT_VEC_EQ(d, k0, MulSub(k0, k0, k0)); + HWY_ASSERT_VEC_EQ(d, kNeg0, NegMulSub(k0, k0, k0)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = -T(i + 2); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(k0, v1, v2)); + HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v1, k0, v2)); + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(k0), v1, v2)); + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(v1, Neg(k0), v2)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast<T>((i + 1) * (i + 2)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v1, v2, k0)); + HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v2, v1, k0)); + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(v1), v2, k0)); + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(v2, Neg(v1), k0)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast<T>((i + 2) * (i + 2) - (1 + i)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v2, v2, v1)); + HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(v2), v2, v1)); + } +}; + +HWY_NOINLINE 
void TestAllMulAdd() { + ForFloatTypes(ForPartialVectors<TestMulAdd>()); +} + +struct TestReorderWidenMulAccumulate { + template <typename TN, class DN> + HWY_NOINLINE void operator()(TN /*unused*/, DN dn) { + using TW = MakeWide<TN>; + const RepartitionToWide<DN> dw; + const Half<DN> dnh; + using VW = Vec<decltype(dw)>; + using VN = Vec<decltype(dn)>; + const size_t NN = Lanes(dn); + + const VW f0 = Zero(dw); + const VW f1 = Set(dw, TW{1}); + const VN bf0 = Zero(dn); + // Cannot Set() bfloat16_t directly. + const VN bf1 = ReorderDemote2To(dn, f1, f1); + + // Any input zero => both outputs zero + VW sum1 = f0; + HWY_ASSERT_VEC_EQ(dw, f0, + ReorderWidenMulAccumulate(dw, bf0, bf0, f0, sum1)); + HWY_ASSERT_VEC_EQ(dw, f0, sum1); + HWY_ASSERT_VEC_EQ(dw, f0, + ReorderWidenMulAccumulate(dw, bf0, bf1, f0, sum1)); + HWY_ASSERT_VEC_EQ(dw, f0, sum1); + HWY_ASSERT_VEC_EQ(dw, f0, + ReorderWidenMulAccumulate(dw, bf1, bf0, f0, sum1)); + HWY_ASSERT_VEC_EQ(dw, f0, sum1); + + // delta[p] := 1, all others zero. For each p: Dot(delta, all-ones) == 1. + auto delta_w = AllocateAligned<TW>(NN); + for (size_t p = 0; p < NN; ++p) { + // Workaround for incorrect Clang wasm codegen: re-initialize the entire + // array rather than zero-initialize once and then toggle lane p. 
+ for (size_t i = 0; i < NN; ++i) { + delta_w[i] = static_cast<TW>(i == p); + } + const VW delta0 = Load(dw, delta_w.get()); + const VW delta1 = Load(dw, delta_w.get() + NN / 2); + const VN delta = ReorderDemote2To(dn, delta0, delta1); + + { + sum1 = f0; + const VW sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, f0, sum1); + HWY_ASSERT_EQ(TW{1}, GetLane(SumOfLanes(dw, Add(sum0, sum1)))); + } + // Swapped arg order + { + sum1 = f0; + const VW sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, f0, sum1); + HWY_ASSERT_EQ(TW{1}, GetLane(SumOfLanes(dw, Add(sum0, sum1)))); + } + // Start with nonzero sum0 or sum1 + { + VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta)); + sum1 = PromoteTo(dw, UpperHalf(dnh, delta)); + sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, sum0, sum1); + HWY_ASSERT_EQ(TW{2}, GetLane(SumOfLanes(dw, Add(sum0, sum1)))); + } + // Start with nonzero sum0 or sum1, and swap arg order + { + VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta)); + sum1 = PromoteTo(dw, UpperHalf(dnh, delta)); + sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, sum0, sum1); + HWY_ASSERT_EQ(TW{2}, GetLane(SumOfLanes(dw, Add(sum0, sum1)))); + } + } + } +}; + +HWY_NOINLINE void TestAllReorderWidenMulAccumulate() { + ForShrinkableVectors<TestReorderWidenMulAccumulate>()(bfloat16_t()); + ForShrinkableVectors<TestReorderWidenMulAccumulate>()(int16_t()); +} + +struct TestRearrangeToOddPlusEven { + template <typename TN, class DN> + HWY_NOINLINE void operator()(TN /*unused*/, DN dn) { + using TW = MakeWide<TN>; + const RebindToUnsigned<DN> du; + const RepartitionToWide<DN> dw; + const Half<DN> dnh; + const RebindToUnsigned<decltype(dnh)> duh; + using VW = Vec<decltype(dw)>; + using VN = Vec<decltype(dn)>; + const size_t NW = Lanes(dw); + + const VW up0 = Iota(dw, TW{1}); + const VW up1 = Iota(dw, static_cast<TW>(1 + NW)); + // We will compute i * (N-i) to avoid per-lane overflow. 
+ const VW down0 = Reverse(dw, up1); + const VW down1 = Reverse(dw, up0); + + // Combine is not available for bf16, so cast to u16. + const auto a0 = BitCast(duh, DemoteTo(dnh, up0)); + const auto a1 = BitCast(duh, DemoteTo(dnh, up1)); + const VN a = BitCast(dn, Combine(du, a1, a0)); + const auto b0 = BitCast(duh, DemoteTo(dnh, down0)); + const auto b1 = BitCast(duh, DemoteTo(dnh, down1)); + const VN b = BitCast(dn, Combine(du, b1, b0)); + + const auto expected = AllocateAligned<TW>(NW); + for (size_t iw = 0; iw < NW; ++iw) { + const size_t in = iw * 2; // even, odd is +1 + const size_t a0 = 1 + in; + const size_t b0 = 1 + 2 * NW - a0; + const size_t a1 = a0 + 1; + const size_t b1 = b0 - 1; + expected[iw] = static_cast<TW>(a0 * b0 + a1 * b1); + } + + VW sum1 = Zero(dw); + const VW sum0 = ReorderWidenMulAccumulate(dw, a, b, Zero(dw), sum1); + const VW sum_odd_even = RearrangeToOddPlusEven(sum0, sum1); + HWY_ASSERT_VEC_EQ(dw, expected.get(), sum_odd_even); + } +}; + +HWY_NOINLINE void TestAllRearrangeToOddPlusEven() { + ForShrinkableVectors<TestRearrangeToOddPlusEven>()(bfloat16_t()); + ForShrinkableVectors<TestRearrangeToOddPlusEven>()(int16_t()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyMulTest); +HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMul); +HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulHigh); +HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulFixedPoint15); +HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulEven); +HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulAdd); +HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllReorderWidenMulAccumulate); +HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllRearrangeToOddPlusEven); + +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/reduction_test.cc b/third_party/highway/hwy/tests/reduction_test.cc new file mode 100644 index 0000000000..5cc051ef1c --- /dev/null +++ 
b/third_party/highway/hwy/tests/reduction_test.cc
@@ -0,0 +1,261 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stddef.h>
#include <stdint.h>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/reduction_test.cc"
#include "hwy/foreach_target.h"  // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Checks SumOfLanes(): the expected sum is accumulated in a double and then
// broadcast to all lanes for comparison.
struct TestSumOfLanes {
  // Overload selected for unsigned T or an odd N: nothing extra to test.
  template <typename T, size_t N, int P,
            hwy::EnableIf<!IsSigned<T>() || ((N & 1) != 0)>* = nullptr>
  HWY_NOINLINE void SignedEvenLengthVectorTests(Simd<T, N, P>) {
    // do nothing
  }
  // Overload selected for signed T with an even N: sums vectors that mix
  // negative and positive lanes.
  template <typename T, size_t N, int P,
            hwy::EnableIf<IsSigned<T>() && ((N & 1) == 0)>* = nullptr>
  HWY_NOINLINE void SignedEvenLengthVectorTests(Simd<T, N, P> d) {
    const T pairs = static_cast<T>(Lanes(d) / 2);

    // Lanes are the repeated sequence -2, 1, [...]; each pair sums to -1,
    // so the eventual total is just -(N/2).
    Vec<decltype(d)> v =
        InterleaveLower(Set(d, static_cast<T>(-2)), Set(d, T{1}));
    HWY_ASSERT_VEC_EQ(d, Set(d, static_cast<T>(-pairs)), SumOfLanes(d, v));

    // Similar test with a positive result.
    v = InterleaveLower(Set(d, static_cast<T>(-2)), Set(d, T{4}));
    HWY_ASSERT_VEC_EQ(d, Set(d, static_cast<T>(pairs * 2)), SumOfLanes(d, v));
  }

  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto in_lanes = AllocateAligned<T>(N);

    // Lane i = bit i, higher lanes 0
    double sum = 0.0;
    // Avoid setting sign bit and cap at double precision
    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
    for (size_t i = 0; i < N; ++i) {
      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
      sum += static_cast<double>(in_lanes[i]);
    }
    HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)),
                      SumOfLanes(d, Load(d, in_lanes.get())));

    // Lane i = i (iota) to include upper lanes
    sum = 0.0;
    for (size_t i = 0; i < N; ++i) {
      sum += static_cast<double>(i);
    }
    HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(d, Iota(d, 0)));

    // Run more tests only for signed types with even vector lengths. Some of
    // this code may not otherwise compile, so put it in a templated function.
    SignedEvenLengthVectorTests(d);
  }
};

// Runs TestSumOfLanes for 16/32/64-bit lane types, plus 8-bit lanes on the
// targets listed in the #if below.
HWY_NOINLINE void TestAllSumOfLanes() {
  ForUIF3264(ForPartialVectors<TestSumOfLanes>());
  ForUI16(ForPartialVectors<TestSumOfLanes>());

#if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_SSE4 || HWY_TARGET == HWY_SSSE3
  ForUI8(ForGEVectors<64, TestSumOfLanes>());
#endif
}

// Checks MinOfLanes() on three inputs: one bit per lane, a descending
// sequence, and a fixed table that includes negative values. The expected
// minimum is tracked with HWY_MIN while the input is filled.
struct TestMinOfLanes {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto in_lanes = AllocateAligned<T>(N);

    // Lane i = bit i, higher lanes = 2 (not the minimum)
    T min = HighestValue<T>();
    // Avoid setting sign bit and cap at double precision
    // NOTE(review): same cap as in TestSumOfLanes, although no double is
    // involved in this test.
    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
    for (size_t i = 0; i < N; ++i) {
      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 2;
      min = HWY_MIN(min, in_lanes[i]);
    }
    HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));

    // Lane i = N - i to include upper lanes
    min = HighestValue<T>();
    for (size_t i = 0; i < N; ++i) {
      in_lanes[i] = static_cast<T>(N - i);  // no 8-bit T so no wraparound
      min = HWY_MIN(min, in_lanes[i]);
    }
    HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));

    // Bug #910: also check negative values
    min = HighestValue<T>();
    const T input_copy[] = {static_cast<T>(-1),
                            static_cast<T>(-2),
                            1,
                            2,
                            3,
                            4,
                            5,
                            6,
                            7,
                            8,
                            9,
                            10,
                            11,
                            12,
                            13,
                            14};
    // Copy as many table entries as fit in the vector.
    size_t i = 0;
    for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) {
      in_lanes[i] = input_copy[i];
      min = HWY_MIN(min, input_copy[i]);
    }
    // Pad with neutral element to full vector (so we can load)
    for (; i < N; ++i) {
      in_lanes[i] = min;
    }
    HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
  }
};

// Checks MaxOfLanes() with the same three input shapes as TestMinOfLanes
// (the second one ascending instead of descending). The expected maximum is
// tracked with HWY_MAX while the input is filled.
struct TestMaxOfLanes {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto in_lanes = AllocateAligned<T>(N);

    // Lane i = bit i, higher lanes = 0 (not the maximum)
    T max = LowestValue<T>();
    // Avoid setting sign bit and cap at double precision
    // NOTE(review): same cap as in TestSumOfLanes, although no double is
    // involved in this test.
    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
    for (size_t i = 0; i < N; ++i) {
      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
      max = HWY_MAX(max, in_lanes[i]);
    }
    HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));

    // Lane i = i to include upper lanes
    max = LowestValue<T>();
    for (size_t i = 0; i < N; ++i) {
      in_lanes[i] = static_cast<T>(i);  // no 8-bit T so no wraparound
      max = HWY_MAX(max, in_lanes[i]);
    }
    HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));

    // Bug #910: also check negative values
    max = LowestValue<T>();
    const T input_copy[] = {static_cast<T>(-1),
                            static_cast<T>(-2),
                            1,
                            2,
                            3,
                            4,
                            5,
                            6,
                            7,
                            8,
                            9,
                            10,
                            11,
                            12,
                            13,
                            14};
    // Copy as many table entries as fit in the vector.
    size_t i = 0;
    for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) {
      in_lanes[i] = input_copy[i];
      max = HWY_MAX(max, in_lanes[i]);
    }
    // Pad with neutral element to full vector (so we can load)
    for (; i < N; ++i) {
      in_lanes[i] = max;
    }
    HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
  }
};

// Runs the min/max tests for 16/32/64-bit lane types, plus 8-bit lanes on
// the targets listed in the #if below.
HWY_NOINLINE void TestAllMinMaxOfLanes() {
  const ForPartialVectors<TestMinOfLanes> test_min;
  const ForPartialVectors<TestMaxOfLanes> test_max;
  ForUIF3264(test_min);
  ForUIF3264(test_max);
  ForUI16(test_min);
  ForUI16(test_max);

#if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_SSE4 || HWY_TARGET == HWY_SSSE3
  ForUI8(ForGEVectors<64, TestMinOfLanes>());
  ForUI8(ForGEVectors<64, TestMaxOfLanes>());
#endif
}

// Checks SumsOf8(): each group of eight consecutive random input lanes is
// summed in scalar code into one uint64_t and compared with the vector
// result. Returns early for vectors with fewer than eight lanes.
struct TestSumsOf8 {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    RandomState rng;

    const size_t N = Lanes(d);
    if (N < 8) return;
    const Repartition<uint64_t, D> du64;

    auto in_lanes = AllocateAligned<T>(N);
    auto sum_lanes = AllocateAligned<uint64_t>(N / 8);

    for (size_t rep = 0; rep < 100; ++rep) {
      for (size_t i = 0; i < N; ++i) {
        in_lanes[i] = Random64(&rng) & 0xFF;
      }

      // Scalar reference: one 64-bit sum per group of eight lanes.
      for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) {
        uint64_t sum = 0;
        for (size_t i = 0; i < 8; ++i) {
          sum += in_lanes[idx_sum * 8 + i];
        }
        sum_lanes[idx_sum] = sum;
      }

      const Vec<D> in = Load(d, in_lanes.get());
      HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8(in));
    }
  }
};

// Runs TestSumsOf8 for uint8_t lanes.
HWY_NOINLINE void TestAllSumsOf8() {
  ForGEVectors<64, TestSumsOf8>()(uint8_t());
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

// Registers every TestAll* entry point above under the HwyReductionTest
// suite.
namespace hwy {
HWY_BEFORE_TEST(HwyReductionTest);
HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumOfLanes);
HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMinMaxOfLanes);
HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf8);
}  // namespace hwy

#endif
diff --git a/third_party/highway/hwy/tests/reverse_test.cc b/third_party/highway/hwy/tests/reverse_test.cc
new file mode 100644
index 0000000000..b1572c03fe
--- /dev/null
+++ b/third_party/highway/hwy/tests/reverse_test.cc
@@ -0,0 +1,186 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+ +#include <stddef.h> + +#include "hwy/base.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/reverse_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestReverse { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const RebindToUnsigned<D> du; // Iota does not support float16_t. + const auto v = BitCast(d, Iota(du, 1)); + auto expected = AllocateAligned<T>(N); + + // Can't set float16_t value directly, need to permute in memory. + auto copy = AllocateAligned<T>(N); + Store(v, d, copy.get()); + for (size_t i = 0; i < N; ++i) { + expected[i] = copy[N - 1 - i]; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse(d, v)); + } +}; + +struct TestReverse2 { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const RebindToUnsigned<D> du; // Iota does not support float16_t. + const auto v = BitCast(d, Iota(du, 1)); + auto expected = AllocateAligned<T>(N); + if (N == 1) { + Store(v, d, expected.get()); + HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse2(d, v)); + return; + } + + // Can't set float16_t value directly, need to permute in memory. + auto copy = AllocateAligned<T>(N); + Store(v, d, copy.get()); + for (size_t i = 0; i < N; ++i) { + expected[i] = copy[i ^ 1]; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse2(d, v)); + } +}; + +struct TestReverse4 { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const RebindToUnsigned<D> du; // Iota does not support float16_t. + const auto v = BitCast(d, Iota(du, 1)); + auto expected = AllocateAligned<T>(N); + + // Can't set float16_t value directly, need to permute in memory. 
+ auto copy = AllocateAligned<T>(N); + Store(v, d, copy.get()); + for (size_t i = 0; i < N; ++i) { + expected[i] = copy[i ^ 3]; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse4(d, v)); + } +}; + +struct TestReverse8 { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const RebindToUnsigned<D> du; // Iota does not support float16_t. + const auto v = BitCast(d, Iota(du, 1)); + auto expected = AllocateAligned<T>(N); + + // Can't set float16_t value directly, need to permute in memory. + auto copy = AllocateAligned<T>(N); + Store(v, d, copy.get()); + for (size_t i = 0; i < N; ++i) { + expected[i] = copy[i ^ 7]; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse8(d, v)); + } +}; + +HWY_NOINLINE void TestAllReverse() { + // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota, + // which requires 16 bits. + ForUIF163264(ForPartialVectors<TestReverse>()); +} + +HWY_NOINLINE void TestAllReverse2() { + // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota, + // which requires 16 bits. + ForUIF64(ForGEVectors<128, TestReverse2>()); + ForUIF32(ForGEVectors<64, TestReverse2>()); + ForUIF16(ForGEVectors<32, TestReverse2>()); + +#if HWY_TARGET == HWY_SSSE3 + // Implemented mainly for internal use. + ForUI8(ForPartialVectors<TestReverse2>()); +#endif +} + +HWY_NOINLINE void TestAllReverse4() { + // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota, + // which requires 16 bits. + ForUIF64(ForGEVectors<256, TestReverse4>()); + ForUIF32(ForGEVectors<128, TestReverse4>()); + ForUIF16(ForGEVectors<64, TestReverse4>()); +} + +HWY_NOINLINE void TestAllReverse8() { + // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota, + // which requires 16 bits. 
+ ForUIF64(ForGEVectors<512, TestReverse8>()); + ForUIF32(ForGEVectors<256, TestReverse8>()); + ForUIF16(ForGEVectors<128, TestReverse8>()); +} + +struct TestReverseBlocks { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const RebindToUnsigned<D> du; // Iota does not support float16_t. + const auto v = BitCast(d, Iota(du, 1)); + auto expected = AllocateAligned<T>(N); + + constexpr size_t kLanesPerBlock = 16 / sizeof(T); + const size_t num_blocks = N / kLanesPerBlock; + HWY_ASSERT(num_blocks != 0); + + // Can't set float16_t value directly, need to permute in memory. + auto copy = AllocateAligned<T>(N); + Store(v, d, copy.get()); + for (size_t i = 0; i < N; ++i) { + const size_t idx_block = i / kLanesPerBlock; + const size_t base = (num_blocks - 1 - idx_block) * kLanesPerBlock; + expected[i] = copy[base + (i % kLanesPerBlock)]; + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ReverseBlocks(d, v)); + } +}; + +HWY_NOINLINE void TestAllReverseBlocks() { + ForAllTypes(ForGEVectors<128, TestReverseBlocks>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyReverseTest); +HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse); +HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse2); +HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse4); +HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse8); +HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverseBlocks); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/shift_test.cc b/third_party/highway/hwy/tests/shift_test.cc new file mode 100644 index 0000000000..585eba761c --- /dev/null +++ b/third_party/highway/hwy/tests/shift_test.cc @@ -0,0 +1,428 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not 
use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stddef.h> +#include <stdint.h> + +#include <algorithm> +#include <limits> + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/shift_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template <bool kSigned> +struct TestLeftShifts { + template <typename T, class D> + HWY_NOINLINE void operator()(T t, D d) { + if (kSigned) { + // Also test positive values + TestLeftShifts</*kSigned=*/false>()(t, d); + } + + using TI = MakeSigned<T>; + using TU = MakeUnsigned<T>; + const size_t N = Lanes(d); + auto expected = AllocateAligned<T>(N); + + // Values to shift + const auto values = Iota(d, static_cast<T>(kSigned ? -TI(N) : TI(0))); + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; + + // 0 + HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values)); + HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0)); + + // 1 + for (size_t i = 0; i < N; ++i) { + const T value = kSigned ? T(T(i) - T(N)) : T(i); + expected[i] = T(TU(value) << 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1)); + + // max + for (size_t i = 0; i < N; ++i) { + const T value = kSigned ? 
T(T(i) - T(N)) : T(i); + expected[i] = T(TU(value) << kMaxShift); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift)); + } +}; + +template <bool kSigned> +struct TestVariableLeftShifts { + template <typename T, class D> + HWY_NOINLINE void operator()(T t, D d) { + if (kSigned) { + // Also test positive values + TestVariableLeftShifts</*kSigned=*/false>()(t, d); + } + + using TI = MakeSigned<T>; + using TU = MakeUnsigned<T>; + const size_t N = Lanes(d); + auto expected = AllocateAligned<T>(N); + + const auto v0 = Zero(d); + const auto v1 = Set(d, 1); + const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift + + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; + const auto max_shift = Set(d, kMaxShift); + const auto small_shifts = And(Iota(d, 0), max_shift); + const auto large_shifts = max_shift - small_shifts; + + // Same: 0 + HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0)); + + // Same: 1 + for (size_t i = 0; i < N; ++i) { + const T value = kSigned ? T(i) - T(N) : T(i); + expected[i] = T(TU(value) << 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1)); + + // Same: max + for (size_t i = 0; i < N; ++i) { + const T value = kSigned ? T(i) - T(N) : T(i); + expected[i] = T(TU(value) << kMaxShift); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift)); + + // Variable: small + for (size_t i = 0; i < N; ++i) { + const T value = kSigned ? 
T(i) - T(N) : T(i); + expected[i] = T(TU(value) << (i & kMaxShift)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, small_shifts)); + + // Variable: large + for (size_t i = 0; i < N; ++i) { + expected[i] = T(TU(1) << (kMaxShift - (i & kMaxShift))); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(v1, large_shifts)); + } +}; + +struct TestUnsignedRightShifts { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto expected = AllocateAligned<T>(N); + + const auto values = Iota(d, 0); + + const T kMax = LimitsMax<T>(); + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; + + // Shift by 0 + HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); + HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); + + // Shift by 1 + for (size_t i = 0; i < N; ++i) { + expected[i] = T(T(i & kMax) >> 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); + + // max + for (size_t i = 0; i < N; ++i) { + expected[i] = T(T(i & kMax) >> kMaxShift); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<kMaxShift>(values)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift)); + } +}; + +struct TestRotateRight { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto expected = AllocateAligned<T>(N); + + constexpr size_t kBits = sizeof(T) * 8; + const auto mask_shift = Set(d, T{kBits}); + // Cover as many bit positions as possible to test shifting out + const auto values = Shl(Set(d, T{1}), And(Iota(d, 0), mask_shift)); + + // Rotate by 0 + HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values)); + + // Rotate by 1 + Store(values, d, expected.get()); + for (size_t i = 0; i < N; ++i) { + expected[i] = (expected[i] >> 1) | (expected[i] << (kBits - 1)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values)); + + // Rotate by half + Store(values, d, 
expected.get()); + for (size_t i = 0; i < N; ++i) { + expected[i] = (expected[i] >> (kBits / 2)) | (expected[i] << (kBits / 2)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values)); + + // Rotate by max + Store(values, d, expected.get()); + for (size_t i = 0; i < N; ++i) { + expected[i] = (expected[i] >> (kBits - 1)) | (expected[i] << 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits - 1>(values)); + } +}; + +struct TestVariableUnsignedRightShifts { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto expected = AllocateAligned<T>(N); + + const auto v0 = Zero(d); + const auto v1 = Set(d, 1); + const auto values = Iota(d, 0); + + const T kMax = LimitsMax<T>(); + const auto max = Set(d, kMax); + + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; + const auto max_shift = Set(d, kMaxShift); + const auto small_shifts = And(Iota(d, 0), max_shift); + const auto large_shifts = max_shift - small_shifts; + + // Same: 0 + HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0)); + + // Same: 1 + for (size_t i = 0; i < N; ++i) { + expected[i] = T(T(i & kMax) >> 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1)); + + // Same: max + HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift)); + + // Variable: small + for (size_t i = 0; i < N; ++i) { + expected[i] = T(i) >> (i & kMaxShift); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, small_shifts)); + + // Variable: Large + for (size_t i = 0; i < N; ++i) { + expected[i] = kMax >> (kMaxShift - (i & kMaxShift)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(max, large_shifts)); + } +}; + +template <int kAmount, typename T> +T RightShiftNegative(T val) { + // C++ shifts are implementation-defined for negative numbers, and we have + // seen divisions replaced with shifts, so resort to bit operations. 
+ using TU = hwy::MakeUnsigned<T>; + TU bits; + CopySameSize(&val, &bits); + + const TU shifted = TU(bits >> kAmount); + + const TU all = TU(~TU(0)); + const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount; + const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>()); + + bits = shifted | sign_extended; + CopySameSize(&bits, &val); + return val; +} + +class TestSignedRightShifts { + public: + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto expected = AllocateAligned<T>(N); + constexpr T kMin = LimitsMin<T>(); + constexpr T kMax = LimitsMax<T>(); + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; + + // First test positive values, negative are checked below. + const auto v0 = Zero(d); + const auto values = And(Iota(d, 0), Set(d, kMax)); + + // Shift by 0 + HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); + HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); + + // Shift by 1 + for (size_t i = 0; i < N; ++i) { + expected[i] = T(T(i & kMax) >> 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); + + // max + HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values)); + HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift)); + + // Even negative value + Test<0>(kMin, d, __LINE__); + Test<1>(kMin, d, __LINE__); + Test<2>(kMin, d, __LINE__); + Test<kMaxShift>(kMin, d, __LINE__); + + const T odd = static_cast<T>(kMin + 1); + Test<0>(odd, d, __LINE__); + Test<1>(odd, d, __LINE__); + Test<2>(odd, d, __LINE__); + Test<kMaxShift>(odd, d, __LINE__); + } + + private: + template <int kAmount, typename T, class D> + void Test(T val, D d, int line) { + const auto expected = Set(d, RightShiftNegative<kAmount>(val)); + const auto in = Set(d, val); + const char* file = __FILE__; + AssertVecEqual(d, expected, ShiftRight<kAmount>(in), file, line); + AssertVecEqual(d, expected, ShiftRightSame(in, 
kAmount), file, line); + } +}; + +struct TestVariableSignedRightShifts { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TU = MakeUnsigned<T>; + const size_t N = Lanes(d); + auto expected = AllocateAligned<T>(N); + + constexpr T kMin = LimitsMin<T>(); + constexpr T kMax = LimitsMax<T>(); + + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; + + // First test positive values, negative are checked below. + const auto v0 = Zero(d); + const auto positive = Iota(d, 0) & Set(d, kMax); + + // Shift by 0 + HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive)); + HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0)); + + // Shift by 1 + for (size_t i = 0; i < N; ++i) { + expected[i] = T(T(i & kMax) >> 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1)); + + // max + HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(positive)); + HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift)); + + const auto max_shift = Set(d, kMaxShift); + const auto small_shifts = And(Iota(d, 0), max_shift); + const auto large_shifts = max_shift - small_shifts; + + const auto negative = Iota(d, kMin); + + // Test varying negative to shift + for (size_t i = 0; i < N; ++i) { + expected[i] = RightShiftNegative<1>(static_cast<T>(kMin + i)); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1))); + + // Shift MSB right by small amounts + for (size_t i = 0; i < N; ++i) { + const size_t amount = i & kMaxShift; + const TU shifted = ~((1ull << (kMaxShift - amount)) - 1); + CopySameSize(&shifted, &expected[i]); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), small_shifts)); + + // Shift MSB right by large amounts + for (size_t i = 0; i < N; ++i) { + const size_t amount = kMaxShift - (i & kMaxShift); + const TU shifted = ~((1ull << (kMaxShift - amount)) - 1); + CopySameSize(&shifted, &expected[i]); + } + HWY_ASSERT_VEC_EQ(d, 
expected.get(), Shr(Set(d, kMin), large_shifts)); + } +}; + +HWY_NOINLINE void TestAllShifts() { + ForUnsignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/false>>()); + ForSignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/true>>()); + ForUnsignedTypes(ForPartialVectors<TestUnsignedRightShifts>()); + ForSignedTypes(ForPartialVectors<TestSignedRightShifts>()); +} + +HWY_NOINLINE void TestAllVariableShifts() { + const ForPartialVectors<TestLeftShifts</*kSigned=*/false>> shl_u; + const ForPartialVectors<TestLeftShifts</*kSigned=*/true>> shl_s; + const ForPartialVectors<TestUnsignedRightShifts> shr_u; + const ForPartialVectors<TestSignedRightShifts> shr_s; + + shl_u(uint16_t()); + shr_u(uint16_t()); + + shl_u(uint32_t()); + shr_u(uint32_t()); + + shl_s(int16_t()); + shr_s(int16_t()); + + shl_s(int32_t()); + shr_s(int32_t()); + +#if HWY_HAVE_INTEGER64 + shl_u(uint64_t()); + shr_u(uint64_t()); + + shl_s(int64_t()); + shr_s(int64_t()); +#endif +} + +HWY_NOINLINE void TestAllRotateRight() { + const ForPartialVectors<TestRotateRight> test; + test(uint32_t()); +#if HWY_HAVE_INTEGER64 + test(uint64_t()); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyShiftTest); +HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllShifts); +HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllVariableShifts); +HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllRotateRight); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/tests/swizzle_test.cc b/third_party/highway/hwy/tests/swizzle_test.cc new file mode 100644 index 0000000000..f447f7a800 --- /dev/null +++ b/third_party/highway/hwy/tests/swizzle_test.cc @@ -0,0 +1,272 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stddef.h> +#include <string.h> // memset + +#include "hwy/base.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/swizzle_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct TestGetLane { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v = Iota(d, T(1)); + HWY_ASSERT_EQ(T(1), GetLane(v)); + } +}; + +HWY_NOINLINE void TestAllGetLane() { + ForAllTypes(ForPartialVectors<TestGetLane>()); +} + +struct TestExtractLane { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v = Iota(d, T(1)); + for (size_t i = 0; i < Lanes(d); ++i) { + const T actual = ExtractLane(v, i); + HWY_ASSERT_EQ(static_cast<T>(i + 1), actual); + } + } +}; + +HWY_NOINLINE void TestAllExtractLane() { + ForAllTypes(ForPartialVectors<TestExtractLane>()); +} + +struct TestInsertLane { + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using V = Vec<D>; + const V v = Iota(d, T(1)); + const size_t N = Lanes(d); + auto lanes = AllocateAligned<T>(N); + Store(v, d, lanes.get()); + + for (size_t i = 0; i < Lanes(d); ++i) { + lanes[i] = T{0}; + const V actual = InsertLane(v, i, static_cast<T>(i + 1)); + HWY_ASSERT_VEC_EQ(d, v, actual); + Store(v, d, lanes.get()); // restore lane i + } + } +}; + +HWY_NOINLINE void TestAllInsertLane() { + ForAllTypes(ForPartialVectors<TestInsertLane>()); +} + 
+// DupEven must copy each even lane into the odd lane above it: with input
+// Iota starting at 1, lane i of the result is (i & ~1) + 1.
+struct TestDupEven {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>((static_cast<int>(i) & ~1) + 1);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), DupEven(Iota(d, 1)));
+  }
+};
+
+HWY_NOINLINE void TestAllDupEven() {
+  ForUIF3264(ForShrinkableVectors<TestDupEven>());
+}
+
+// DupOdd must copy each odd lane into the even lane below it: with input Iota
+// starting at 1, lane i of the result is (i & ~1) + 2. Compiled out for
+// HWY_SCALAR.
+struct TestDupOdd {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>((static_cast<int>(i) & ~1) + 2);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), DupOdd(Iota(d, 1)));
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllDupOdd() {
+  ForUIF3264(ForShrinkableVectors<TestDupOdd>());
+}
+
+// OddEven(odd, even) must take odd-indexed lanes from `odd` (values offset by
+// N) and even-indexed lanes from `even`.
+struct TestOddEven {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const auto even = Iota(d, 1);
+    const auto odd = Iota(d, static_cast<T>(1 + N));
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>(1 + i + ((i & 1) ? N : 0));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), OddEven(odd, even));
+  }
+};
+
+HWY_NOINLINE void TestAllOddEven() {
+  ForAllTypes(ForShrinkableVectors<TestOddEven>());
+}
+
+// Same as TestOddEven, but the selection alternates per 16-byte block rather
+// than per lane.
+struct TestOddEvenBlocks {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const auto even = Iota(d, 1);
+    const auto odd = Iota(d, static_cast<T>(1 + N));
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      const size_t idx_block = i / (16 / sizeof(T));
+      expected[i] = static_cast<T>(1 + i + ((idx_block & 1) ? N : 0));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), OddEvenBlocks(odd, even));
+  }
+};
+
+HWY_NOINLINE void TestAllOddEvenBlocks() {
+  ForAllTypes(ForGEVectors<128, TestOddEvenBlocks>());
+}
+
+// SwapAdjacentBlocks must exchange each pair of neighbouring 16-byte blocks
+// (block index XOR 1). Skipped when the vector holds fewer than two blocks.
+struct TestSwapAdjacentBlocks {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    constexpr size_t kLanesPerBlock = 16 / sizeof(T);
+    if (N < 2 * kLanesPerBlock) return;
+    const auto vi = Iota(d, 1);
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      const size_t idx_block = i / kLanesPerBlock;
+      const size_t base = (idx_block ^ 1) * kLanesPerBlock;
+      const size_t mod = i % kLanesPerBlock;
+      expected[i] = static_cast<T>(1 + base + mod);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), SwapAdjacentBlocks(vi));
+  }
+};
+
+HWY_NOINLINE void TestAllSwapAdjacentBlocks() {
+  ForAllTypes(ForGEVectors<128, TestSwapAdjacentBlocks>());
+}
+
+// Checks TableLookupLanes with indices built both via IndicesFromVec and via
+// SetTableIndices. The table is Iota starting at 1, so the expected lane is
+// always idx + 1.
+struct TestTableLookupLanes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const RebindToSigned<D> di;
+    using TI = TFromD<decltype(di)>;
+#if HWY_TARGET != HWY_SCALAR
+    const size_t N = Lanes(d);
+    auto idx = AllocateAligned<TI>(N);
+    memset(idx.get(), 0, N * sizeof(TI));
+    auto expected = AllocateAligned<T>(N);
+    const auto v = Iota(d, 1);
+
+    // Test all combinations of the first four indices; any further indices
+    // keep the zero written by memset above.
+    if (N <= 8) {
+      for (size_t i0 = 0; i0 < N; ++i0) {
+        idx[0] = static_cast<TI>(i0);
+
+        for (size_t i1 = 0; i1 < N; ++i1) {
+          if (N >= 2) idx[1] = static_cast<TI>(i1);
+          for (size_t i2 = 0; i2 < N; ++i2) {
+            if (N >= 4) idx[2] = static_cast<TI>(i2);
+            for (size_t i3 = 0; i3 < N; ++i3) {
+              if (N >= 4) idx[3] = static_cast<TI>(i3);
+
+              for (size_t i = 0; i < N; ++i) {
+                expected[i] = static_cast<T>(idx[i] + 1);  // == v[idx[i]]
+              }
+
+              const auto opaque1 = IndicesFromVec(d, Load(di, idx.get()));
+              const auto actual1 = TableLookupLanes(v, opaque1);
+              HWY_ASSERT_VEC_EQ(d, expected.get(), actual1);
+
+              const auto opaque2 = SetTableIndices(d, idx.get());
+              const auto actual2 = TableLookupLanes(v, opaque2);
+              HWY_ASSERT_VEC_EQ(d, expected.get(), actual2);
+            }
+          }
+        }
+      }
+    } else {
+      // Too many permutations to test exhaustively; choose one with repeated
+      // and cross-block indices and ensure indices do not exceed #lanes.
+      // For larger vectors, upper lanes will be zero.
+      HWY_ALIGN TI idx_source[16] = {1,  3,  2,  2,  8, 1, 7, 6,
+                                     15, 14, 14, 15, 4, 9, 8, 5};
+      for (size_t i = 0; i < N; ++i) {
+        idx[i] = (i < 16) ? idx_source[i] : 0;
+        // Avoid undefined results / asan error for scalar by capping indices.
+        if (idx[i] >= static_cast<TI>(N)) {
+          idx[i] = static_cast<TI>(N - 1);
+        }
+        expected[i] = static_cast<T>(idx[i] + 1);  // == v[idx[i]]
+      }
+
+      const auto opaque1 = IndicesFromVec(d, Load(di, idx.get()));
+      const auto actual1 = TableLookupLanes(v, opaque1);
+      HWY_ASSERT_VEC_EQ(d, expected.get(), actual1);
+
+      const auto opaque2 = SetTableIndices(d, idx.get());
+      const auto actual2 = TableLookupLanes(v, opaque2);
+      HWY_ASSERT_VEC_EQ(d, expected.get(), actual2);
+    }
+#else
+    // Single lane: the only valid index is 0, which must return v unchanged.
+    const TI index = 0;
+    const auto v = Set(d, 1);
+    const auto opaque1 = SetTableIndices(d, &index);
+    HWY_ASSERT_VEC_EQ(d, v, TableLookupLanes(v, opaque1));
+    const auto opaque2 = IndicesFromVec(d, Zero(di));
+    HWY_ASSERT_VEC_EQ(d, v, TableLookupLanes(v, opaque2));
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllTableLookupLanes() {
+  ForUIF3264(ForPartialVectors<TestTableLookupLanes>());
+}
+
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwySwizzleTest);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllGetLane);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllExtractLane);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllInsertLane);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupEven);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupOdd);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEvenBlocks);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllSwapAdjacentBlocks);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupLanes);
+}  // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/tests/test_util-inl.h b/third_party/highway/hwy/tests/test_util-inl.h
new file mode 100644
index 0000000000..972b3361e0
--- /dev/null
+++ b/third_party/highway/hwy/tests/test_util-inl.h
@@ -0,0 +1,665 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Target-specific helper functions for use by *_test.cc.
+
+#include <stdint.h>
+
+#include "hwy/base.h"
+#include "hwy/tests/hwy_gtest.h"
+#include "hwy/tests/test_util.h"
+
+// After test_util (also includes highway.h)
+#include "hwy/print-inl.h"
+
+// Per-target include guard
+#if defined(HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
+#undef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
+#else
+#define HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Compare expected lanes (array in memory) to vector.
+// HWY_INLINE works around a Clang SVE compiler bug where all but the first
+// 128 bits (the NEON register) of actual are zero.
+template <class D, typename T = TFromD<D>, class V = Vec<D>>
+HWY_INLINE void AssertVecEqual(D d, const T* expected, VecArg<V> actual,
+                               const char* filename, const int line) {
+  const size_t N = Lanes(d);
+  // Spill `actual` to memory so it can be compared lane by lane.
+  auto actual_lanes = AllocateAligned<T>(N);
+  Store(actual, d, actual_lanes.get());
+
+  const auto info = hwy::detail::MakeTypeInfo<T>();
+  const char* target_name = hwy::TargetName(HWY_TARGET);
+  hwy::detail::AssertArrayEqual(info, expected, actual_lanes.get(), N,
+                                target_name, filename, line);
+}
+
+// Compare expected vector to vector (stores `expected` and forwards to the
+// pointer overload).
+// HWY_INLINE works around a Clang SVE compiler bug where all but the first
+// 128 bits (the NEON register) of actual are zero.
+template <class D, typename T = TFromD<D>, class V = Vec<D>>
+HWY_INLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual,
+                               const char* filename, int line) {
+  auto expected_lanes = AllocateAligned<T>(Lanes(d));
+  Store(expected, d, expected_lanes.get());
+  AssertVecEqual(d, expected_lanes.get(), actual, filename, line);
+}
+
+// Only checks the valid mask elements (those whose index < Lanes(d)).
+// Compares the masks via VecFromMask, CountTrue/AllTrue/AllFalse and the
+// bytes written by StoreMaskBits; aborts with a diagnostic on mismatch.
+template <class D>
+HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
+                                  const char* filename, int line) {
+  // lvalues prevented MSAN failure in farm_sve.
+  const Vec<D> va = VecFromMask(d, a);
+  const Vec<D> vb = VecFromMask(d, b);
+  AssertVecEqual(d, va, vb, filename, line);
+
+  const char* target_name = hwy::TargetName(HWY_TARGET);
+  AssertEqual(CountTrue(d, a), CountTrue(d, b), target_name, filename, line);
+  AssertEqual(AllTrue(d, a), AllTrue(d, b), target_name, filename, line);
+  AssertEqual(AllFalse(d, a), AllFalse(d, b), target_name, filename, line);
+
+  const size_t N = Lanes(d);
+#if HWY_TARGET == HWY_SCALAR
+  const Rebind<uint8_t, D> d8;
+#else
+  const Repartition<uint8_t, D> d8;
+#endif
+  const size_t N8 = Lanes(d8);
+  // Both buffers hold max(8, N8) bytes; only the first N8 are cleared here.
+  auto bits_a = AllocateAligned<uint8_t>(HWY_MAX(size_t{8}, N8));
+  auto bits_b = AllocateAligned<uint8_t>(size_t{HWY_MAX(8, N8)});
+  memset(bits_a.get(), 0, N8);
+  memset(bits_b.get(), 0, N8);
+  const size_t num_bytes_a = StoreMaskBits(d, a, bits_a.get());
+  const size_t num_bytes_b = StoreMaskBits(d, b, bits_b.get());
+  AssertEqual(num_bytes_a, num_bytes_b, target_name, filename, line);
+  size_t i = 0;
+  // First check whole bytes (if that many elements are still valid)
+  for (; i < N / 8; ++i) {
+    if (bits_a[i] != bits_b[i]) {
+      fprintf(stderr, "Mismatch in byte %d: %d != %d\n", static_cast<int>(i),
+              bits_a[i], bits_b[i]);
+      Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
+      Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
+      hwy::Abort(filename, line, "Masks not equal");
+    }
+  }
+  // Then the valid bit(s) in the last byte.
+  const size_t remainder = N % 8;
+  if (remainder != 0) {
+    // Keep only the low `remainder` bits; higher bits are not compared.
+    const int mask = (1 << remainder) - 1;
+    const int valid_a = bits_a[i] & mask;
+    const int valid_b = bits_b[i] & mask;
+    if (valid_a != valid_b) {
+      fprintf(stderr, "Mismatch in last byte %d: %d != %d\n",
+              static_cast<int>(i), valid_a, valid_b);
+      Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
+      Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
+      hwy::Abort(filename, line, "Masks not equal");
+    }
+  }
+}
+
+// Only sets valid elements (those whose index < Lanes(d)).
This helps catch
+// tests that are not masking off the (undefined) upper mask elements.
+//
+// TODO(janwas): with HWY_NOINLINE GCC zeros the upper half of AVX2 masks.
+template <class D>
+HWY_INLINE Mask<D> MaskTrue(const D d) {
+  return FirstN(d, Lanes(d));
+}
+
+// Returns a mask with no lanes set, computed as the comparison zero < zero on
+// the signed counterpart of D and then rebound to D.
+template <class D>
+HWY_INLINE Mask<D> MaskFalse(const D d) {
+  const auto zero = Zero(RebindToSigned<D>());
+  return RebindMask(d, Lt(zero, zero));
+}
+
+// Assertion macros that attach the target name, file and line of the call
+// site. Defined only once; HWY_ASSERT_EQ doubles as the guard.
+#ifndef HWY_ASSERT_EQ
+
+#define HWY_ASSERT_EQ(expected, actual)                                     \
+  hwy::AssertEqual(expected, actual, hwy::TargetName(HWY_TARGET), __FILE__, \
+                   __LINE__)
+
+#define HWY_ASSERT_ARRAY_EQ(expected, actual, count)                          \
+  hwy::AssertArrayEqual(expected, actual, count, hwy::TargetName(HWY_TARGET), \
+                        __FILE__, __LINE__)
+
+#define HWY_ASSERT_STRING_EQ(expected, actual)                          \
+  hwy::AssertStringEqual(expected, actual, hwy::TargetName(HWY_TARGET), \
+                         __FILE__, __LINE__)
+
+#define HWY_ASSERT_VEC_EQ(d, expected, actual) \
+  AssertVecEqual(d, expected, actual, __FILE__, __LINE__)
+
+#define HWY_ASSERT_MASK_EQ(d, expected, actual) \
+  AssertMaskEqual(d, expected, actual, __FILE__, __LINE__)
+
+#endif  // HWY_ASSERT_EQ
+
+namespace detail {
+
+// Helpers for instantiating tests with combinations of lane types / counts.
+
+// Calls Test for each CappedTag<T, N> where N is in [kMinArg, kMul * kMinArg]
+// and the resulting Lanes() is in [min_lanes, max_lanes]. The upper bound
+// is required to ensure capped vectors remain extendable. Implemented by
+// recursively halving kMul until it is zero.
+template <typename T, size_t kMul, size_t kMinArg, class Test>
+struct ForeachCappedR {
+  static void Do(size_t min_lanes, size_t max_lanes) {
+    const CappedTag<T, kMul * kMinArg> d;
+
+    // If we already don't have enough lanes, stop.
+    const size_t lanes = Lanes(d);
+    if (lanes < min_lanes) return;
+
+    if (lanes <= max_lanes) {
+      Test()(T(), d);
+    }
+    ForeachCappedR<T, kMul / 2, kMinArg, Test>::Do(min_lanes, max_lanes);
+  }
+};
+
+// Base case to stop the recursion.
+template <typename T, size_t kMinArg, class Test>
+struct ForeachCappedR<T, 0, kMinArg, Test> {
+  static void Do(size_t, size_t) {}
+};
+
+#if HWY_HAVE_SCALABLE
+
+// Returns the smallest kPow2 usable for lane type T: at least -3, and no
+// smaller than -ceil(log2(16 / sizeof(T))).
+template <typename T>
+constexpr int MinPow2() {
+  // Highway follows RVV LMUL in that the smallest fraction is 1/8th (encoded
+  // as kPow2 == -3). The fraction also must not result in zero lanes for the
+  // smallest possible vector size, which is 128 bits even on RISC-V (with the
+  // application processor profile).
+  return HWY_MAX(-3, -static_cast<int>(CeilLog2(16 / sizeof(T))));
+}
+
+// Iterates kPow2 upward through +3.
+// Calls Test with ScalableTag<T, kPow2 + kAddPow2> whenever that tag has at
+// least min_lanes lanes; otherwise logs the shortfall and asserts that
+// min_lanes != 1.
+template <typename T, int kPow2, int kAddPow2, class Test>
+struct ForeachShiftR {
+  static void Do(size_t min_lanes) {
+    const ScalableTag<T, kPow2 + kAddPow2> d;
+
+    // Precondition: [kPow2, 3] + kAddPow2 is a valid fraction of the minimum
+    // vector size, so we always have enough lanes, except ForGEVectors.
+    if (Lanes(d) >= min_lanes) {
+      Test()(T(), d);
+    } else {
+      fprintf(stderr, "%d lanes < %d: T=%d pow=%d\n",
+              static_cast<int>(Lanes(d)), static_cast<int>(min_lanes),
+              static_cast<int>(sizeof(T)), kPow2 + kAddPow2);
+      HWY_ASSERT(min_lanes != 1);
+    }
+
+    ForeachShiftR<T, kPow2 + 1, kAddPow2, Test>::Do(min_lanes);
+  }
+};
+
+// Base case to stop the recursion.
+template <typename T, int kAddPow2, class Test>
+struct ForeachShiftR<T, 4, kAddPow2, Test> {
+  static void Do(size_t) {}
+};
+#else
+// ForeachCappedR already handled all possible sizes.
+#endif  // HWY_HAVE_SCALABLE
+
+}  // namespace detail
+
+// These 'adapters' call a test for all possible N or kPow2 subject to
+// constraints such as "vectors must be extendable" or "vectors >= 128 bits".
+// They may be called directly, or via For*Types. Note that for an adapter C,
+// `C<Test>(T())` does not call the test - the correct invocation is
+// `C<Test>()(T())`, or preferably `ForAllTypes(C<Test>())`. We check at runtime
+// that operator() is called to prevent such bugs.
Note that this is not
+// thread-safe, but that is fine because C are typically local variables.
+
+// Calls Test for all power of two N in [1, Lanes(d) >> kPow2]. This is for
+// ops that widen their input, e.g. Combine (not supported by HWY_SCALAR).
+template <class Test, int kPow2 = 1>
+class ForExtendableVectors {
+  // Set by operator(); the destructor aborts if it was never called.
+  mutable bool called_ = false;
+
+ public:
+  ~ForExtendableVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    called_ = true;
+    constexpr size_t kMaxCapped = HWY_LANES(T);
+    // Skip CappedTag that are already full vectors.
+    const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
+    (void)kMaxCapped;
+    (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+    // not supported
+#else
+    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(1, max_lanes);
+#if HWY_TARGET == HWY_RVV
+    // For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, -kPow2, Test>::Do(1);
+#elif HWY_HAVE_SCALABLE
+    // For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -kPow2 - 3,
+                          Test>::Do(1);
+#endif
+#endif  // HWY_SCALAR
+  }
+};
+
+// Calls Test for all power of two N in [1 << kPow2, Lanes(d)]. This is for ops
+// that narrow their input, e.g. UpperHalf.
+template <class Test, int kPow2 = 1>
+class ForShrinkableVectors {
+  // Set by operator(); the destructor aborts if it was never called.
+  mutable bool called_ = false;
+
+ public:
+  ~ForShrinkableVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    called_ = true;
+    constexpr size_t kMinLanes = size_t{1} << kPow2;
+    constexpr size_t kMaxCapped = HWY_LANES(T);
+    // For shrinking, an upper limit is unnecessary.
+    constexpr size_t max_lanes = kMaxCapped;
+
+    (void)kMinLanes;
+    (void)max_lanes;
+    (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+    // not supported
+#else
+    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(
+        kMinLanes, max_lanes);
+#if HWY_TARGET == HWY_RVV
+    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, 0, Test>::Do(
+        kMinLanes);
+#elif HWY_HAVE_SCALABLE
+    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -3, Test>::Do(
+        kMinLanes);
+#endif
+#endif  // HWY_TARGET == HWY_SCALAR
+  }
+};
+
+// Calls Test for all supported power of two vectors of at least kMinBits.
+// Examples: AES or 64x64 require 128 bits, casts may require 64 bits.
+template <size_t kMinBits, class Test>
+class ForGEVectors {
+  // Set by operator(); the destructor aborts if it was never called.
+  mutable bool called_ = false;
+
+ public:
+  ~ForGEVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    called_ = true;
+    constexpr size_t kMaxCapped = HWY_LANES(T);
+    constexpr size_t kMinLanes = kMinBits / 8 / sizeof(T);
+    // An upper limit is unnecessary.
+    constexpr size_t max_lanes = kMaxCapped;
+    (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+    (void)kMinLanes;  // not supported
+#else
+    detail::ForeachCappedR<T, HWY_LANES(T) / kMinLanes, kMinLanes, Test>::Do(
+        kMinLanes, max_lanes);
+#if HWY_TARGET == HWY_RVV
+    // Can be 0 (handled below) if kMinBits > 128.
+    constexpr size_t kRatio = 128 / kMinBits;
+    constexpr int kMinPow2 =
+        kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio));
+    // For each [kMinPow2, 3]; counter is [kMinPow2, 3].
+    detail::ForeachShiftR<T, kMinPow2, 0, Test>::Do(kMinLanes);
+#elif HWY_HAVE_SCALABLE
+    // Can be 0 (handled below) if kMinBits > 128.
+    constexpr size_t kRatio = 128 / kMinBits;
+    constexpr int kMinPow2 =
+        kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio));
+    // For each [kMinPow2, 0]; counter is [kMinPow2 + 3, 3].
+    detail::ForeachShiftR<T, kMinPow2 + 3, -3, Test>::Do(kMinLanes);
+#endif
+#endif  // HWY_TARGET == HWY_SCALAR
+  }
+};
+
+// Shorthand for the common 128-bit minimum.
+template <class Test>
+using ForGE128Vectors = ForGEVectors<128, Test>;
+
+// Calls Test for all N that can be promoted (not the same as Extendable because
+// HWY_SCALAR has one lane). Also used for ZipLower, but not ZipUpper.
+template <class Test, int kPow2 = 1>
+class ForPromoteVectors {
+  // Set by operator(); the destructor aborts if it was never called.
+  mutable bool called_ = false;
+
+ public:
+  ~ForPromoteVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    called_ = true;
+    constexpr size_t kFactor = size_t{1} << kPow2;
+    // The promoted lane (kFactor times wider) must fit in 64 bits.
+    static_assert(kFactor >= 2 && kFactor * sizeof(T) <= sizeof(uint64_t), "");
+    constexpr size_t kMaxCapped = HWY_LANES(T);
+    constexpr size_t kMinLanes = kFactor;
+    // Skip CappedTag that are already full vectors.
+    const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
+    (void)kMaxCapped;
+    (void)kMinLanes;
+    (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+    detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
+#else
+    // TODO(janwas): call Extendable if kMinLanes check not required?
+    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(kMinLanes,
+                                                                  max_lanes);
+#if HWY_TARGET == HWY_RVV
+    // For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, -kPow2, Test>::Do(
+        kMinLanes);
+#elif HWY_HAVE_SCALABLE
+    // For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -kPow2 - 3,
+                          Test>::Do(kMinLanes);
+#endif
+#endif  // HWY_SCALAR
+  }
+};
+
+// Calls Test for all N that can be demoted (not the same as Shrinkable because
+// HWY_SCALAR has one lane).
+template <class Test, int kPow2 = 1>
+class ForDemoteVectors {
+  // Set by the const operator() (hence mutable); checked in the destructor.
+  mutable bool called_ = false;
+
+ public:
+  ~ForDemoteVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    called_ = true;
+    constexpr size_t kMinLanes = size_t{1} << kPow2;
+    constexpr size_t kMaxCapped = HWY_LANES(T);
+    // For shrinking, an upper limit is unnecessary.
+    constexpr size_t max_lanes = kMaxCapped;
+
+    // NOTE(review): the second (void)max_lanes repeats the first and has no
+    // additional effect.
+    (void)kMinLanes;
+    (void)max_lanes;
+    (void)max_lanes;
+#if HWY_TARGET == HWY_SCALAR
+    // Unlike ForShrinkableVectors, the scalar target still runs Test once.
+    detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
+#else
+    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(
+        kMinLanes, max_lanes);
+
+// TODO(janwas): call Extendable if kMinLanes check not required?
+#if HWY_TARGET == HWY_RVV
+    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, 0, Test>::Do(
+        kMinLanes);
+#elif HWY_HAVE_SCALABLE
+    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -3, Test>::Do(
+        kMinLanes);
+#endif
+#endif  // HWY_TARGET == HWY_SCALAR
+  }
+};
+
+// For LowerHalf/Quarter.
+// The destructor aborts if operator() was never invoked.
+template <class Test, int kPow2 = 1>
+class ForHalfVectors {
+  mutable bool called_ = false;
+
+ public:
+  ~ForHalfVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    called_ = true;
+#if HWY_TARGET == HWY_SCALAR
+    detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
+#else
+    constexpr size_t kMinLanes = size_t{1} << kPow2;
+    // For shrinking, an upper limit is unnecessary.
+    constexpr size_t kMaxCapped = HWY_LANES(T);
+    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(
+        kMinLanes, kMaxCapped);
+
+// TODO(janwas): call Extendable if kMinLanes check not required?
+#if HWY_TARGET == HWY_RVV
+    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, 0, Test>::Do(
+        kMinLanes);
+#elif HWY_HAVE_SCALABLE
+    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
+    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -3, Test>::Do(
+        kMinLanes);
+#endif
+#endif  // HWY_TARGET == HWY_SCALAR
+  }
+};
+
+// Calls Test for all power of two N in [1, Lanes(d)]. This is the default
+// for ops that do not narrow nor widen their input, nor require 128 bits.
+// The destructor aborts if operator() was never invoked.
+template <class Test>
+class ForPartialVectors {
+  mutable bool called_ = false;
+
+ public:
+  ~ForPartialVectors() {
+    if (!called_) {
+      HWY_ABORT("Test is incorrect, ensure operator() is called");
+    }
+  }
+
+  template <typename T>
+  void operator()(T t) const {
+    called_ = true;
+#if HWY_TARGET == HWY_SCALAR
+    (void)t;
+    detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
+#else
+    // kPow2 = 0: delegate to ForExtendableVectors without shrinking the range.
+    ForExtendableVectors<Test, 0>()(t);
+#endif
+  }
+};
+
+// Type lists to shorten call sites:
+// Each helper invokes `func` once per listed lane type, passing a
+// value-initialized instance; 64-bit and float16 types are included only when
+// the corresponding HWY_HAVE_* macro is nonzero.
+
+template <class Func>
+void ForSignedTypes(const Func& func) {
+  func(int8_t());
+  func(int16_t());
+  func(int32_t());
+#if HWY_HAVE_INTEGER64
+  func(int64_t());
+#endif
+}
+
+template <class Func>
+void ForUnsignedTypes(const Func& func) {
+  func(uint8_t());
+  func(uint16_t());
+  func(uint32_t());
+#if HWY_HAVE_INTEGER64
+  func(uint64_t());
+#endif
+}
+
+template <class Func>
+void ForIntegerTypes(const Func& func) {
+  ForSignedTypes(func);
+  ForUnsignedTypes(func);
+}
+
+template <class Func>
+void ForFloatTypes(const Func& func) {
+  func(float());
+#if HWY_HAVE_FLOAT64
+  func(double());
+#endif
+}
+
+template <class Func>
+void ForAllTypes(const Func& func) {
+  ForIntegerTypes(func);
+  ForFloatTypes(func);
+}
+
+template <class Func>
+void ForUI8(const Func& func) {
+  func(uint8_t());
+  func(int8_t());
+}
+
+template <class Func>
+void ForUI16(const Func& func) {
+  func(uint16_t());
+  func(int16_t());
+}
+
+template <class Func>
+void ForUIF16(const Func& func) {
+  ForUI16(func);
+#if HWY_HAVE_FLOAT16
+  func(float16_t());
+#endif
+}
+
+template <class Func>
+void ForUI32(const Func& func) {
+  func(uint32_t());
+  func(int32_t());
+}
+
+template <class Func>
+void ForUIF32(const Func& func) {
+  ForUI32(func);
+  func(float());
+}
+
+// Calls nothing if HWY_HAVE_INTEGER64 is zero.
+template <class Func>
+void ForUI64(const Func& func) {
+#if HWY_HAVE_INTEGER64
+  func(uint64_t());
+  func(int64_t());
+#endif
+}
+
+template <class Func>
+void ForUIF64(const Func& func) {
+  ForUI64(func);
+#if HWY_HAVE_FLOAT64
+  func(double());
+#endif
+}
+
+template <class Func>
+void ForUI3264(const Func& func) {
+  ForUI32(func);
+  ForUI64(func);
+}
+
+template <class Func>
+void ForUIF3264(const Func& func) {
+  ForUIF32(func);
+  ForUIF64(func);
+}
+
+template <class Func>
+void ForUI163264(const Func& func) {
+  ForUI16(func);
+  ForUI3264(func);
+}
+
+template <class Func>
+void ForUIF163264(const Func& func) {
+  ForUIF16(func);
+  ForUIF3264(func);
+}
+
+// For tests that involve loops, adjust the trip count so that emulated tests
+// finish quickly (but always at least 2 iterations to ensure some diversity).
+// The divisor is 32 for RVV, 8 for debug builds, 4 for Arm, otherwise 1; the
+// first matching condition wins.
+constexpr size_t AdjustedReps(size_t max_reps) {
+#if HWY_ARCH_RVV
+  return HWY_MAX(max_reps / 32, 2);
+#elif HWY_IS_DEBUG_BUILD
+  return HWY_MAX(max_reps / 8, 2);
+#elif HWY_ARCH_ARM
+  return HWY_MAX(max_reps / 4, 2);
+#else
+  return HWY_MAX(max_reps, 2);
+#endif
+}
+
+// Same as above, but the loop trip count will be 1 << max_pow2.
+// Subtracts 4 (RVV) or 1 (debug builds, Arm) from the exponent.
+constexpr size_t AdjustedLog2Reps(size_t max_pow2) {
+  // If "negative" (unsigned wraparound), use original.
+  // The size_t subtraction wraps to a huge value when max_pow2 is smaller than
+  // the subtrahend, in which case HWY_MIN selects max_pow2 itself.
+#if HWY_ARCH_RVV
+  return HWY_MIN(max_pow2 - 4, max_pow2);
+#elif HWY_IS_DEBUG_BUILD
+  return HWY_MIN(max_pow2 - 1, max_pow2);
+#elif HWY_ARCH_ARM
+  return HWY_MIN(max_pow2 - 1, max_pow2);
+#else
+  return max_pow2;
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  // per-target include guard
diff --git a/third_party/highway/hwy/tests/test_util.cc b/third_party/highway/hwy/tests/test_util.cc
new file mode 100644
index 0000000000..a0796b15f9
--- /dev/null
+++ b/third_party/highway/hwy/tests/test_util.cc
@@ -0,0 +1,117 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/tests/test_util.h"
+
+#include <stddef.h>
+#include <stdio.h>
+
+#include <cmath>
+
+#include "hwy/base.h"
+#include "hwy/print.h"
+
+namespace hwy {
+
+// Compares `size` bytes starting at `p1` and `p2`. Returns true if all are
+// equal. Otherwise returns false and, if `pos` is non-null, stores the index
+// of the first differing byte in `*pos`.
+HWY_TEST_DLLEXPORT bool BytesEqual(const void* p1, const void* p2,
+                                   const size_t size, size_t* pos) {
+  const uint8_t* bytes1 = reinterpret_cast<const uint8_t*>(p1);
+  const uint8_t* bytes2 = reinterpret_cast<const uint8_t*>(p2);
+  for (size_t i = 0; i < size; ++i) {
+    if (bytes1[i] != bytes2[i]) {
+      if (pos != nullptr) {
+        *pos = i;
+      }
+      return false;
+    }
+  }
+  return true;
+}
+
+// Returns normally if the NUL-terminated strings `expected` and `actual` are
+// identical; otherwise calls Abort with `filename`/`line` and a message that
+// names `target_name` and shows both complete strings.
+void AssertStringEqual(const char* expected, const char* actual,
+                       const char* target_name, const char* filename,
+                       int line) {
+  // Compare via separate cursors instead of advancing `expected`/`actual`:
+  // the message below must print the strings from their beginning, and a
+  // cursor that is only advanced after a successful match can never step past
+  // a terminator (which a post-increment inside the comparison would do when
+  // `actual` is a proper prefix of `expected`).
+  const char* expected_pos = expected;
+  const char* actual_pos = actual;
+  while (*expected_pos == *actual_pos) {
+    if (*expected_pos == '\0') return;  // both terminators reached: equal
+    ++expected_pos;
+    ++actual_pos;
+  }
+
+  Abort(filename, line, "%s string mismatch: expected '%s', got '%s'.\n",
+        target_name, expected, actual);
+}
+
+namespace detail {
+
+// Returns whether the two values described by `info` are considered equal.
+// Non-float types are compared bytewise over info.sizeof_t bytes. 4- and
+// 8-byte floats are equal if ComputeUlpDelta is at most 1; any other float
+// size aborts.
+HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr,
+                                const void* actual_ptr) {
+  if (!info.is_float) {
+    return BytesEqual(expected_ptr, actual_ptr, info.sizeof_t);
+  }
+
+  if (info.sizeof_t == 4) {
+    float expected, actual;
+    CopyBytes<4>(expected_ptr, &expected);
+    CopyBytes<4>(actual_ptr, &actual);
+    return ComputeUlpDelta(expected, actual) <= 1;
+  } else if (info.sizeof_t == 8) {
+    double expected, actual;
+    CopyBytes<8>(expected_ptr, &expected);
+    CopyBytes<8>(actual_ptr, &actual);
+    return ComputeUlpDelta(expected, actual) <= 1;
+  } else {
+    HWY_ABORT("Unexpected float size %d\n", static_cast<int>(info.sizeof_t));
+    return false;
+  }
+}
+
+// Formats the type name and both values as text, then calls Abort with a
+// message that identifies the target, the type, `num_lanes` and the
+// mismatching `lane`.
+HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort(
+    const TypeInfo& info, const void* expected_ptr, const void* actual_ptr,
+    const char* target_name, const char* filename, int line, size_t lane,
+    size_t num_lanes) {
+  char type_name[100];
+  TypeName(info, 1, type_name);
+  char expected_str[100];
+  ToString(info, expected_ptr, expected_str);
+  char actual_str[100];
+  ToString(info, actual_ptr, actual_str);
+  Abort(filename, line,
+        "%s, %sx%d lane %d mismatch: expected '%s', got '%s'.\n", target_name,
+        type_name, static_cast<int>(num_lanes), static_cast<int>(lane),
+        expected_str, actual_str);
+}
+
+// Compares N elements of info.sizeof_t bytes each using IsEqual. At the first
+// mismatch, prints both arrays to stderr and calls PrintMismatchAndAbort for
+// that element.
+HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info,
+                                         const void* expected_void,
+                                         const void* actual_void, size_t N,
+                                         const char* target_name,
+                                         const char* filename, int line) {
+  const uint8_t* expected_array =
+      reinterpret_cast<const uint8_t*>(expected_void);
+  const uint8_t* actual_array = reinterpret_cast<const uint8_t*>(actual_void);
+  for (size_t i = 0; i < N; ++i) {
+    const void* expected_ptr = expected_array + i * info.sizeof_t;
+    const void* actual_ptr = actual_array + i * info.sizeof_t;
+    if (!IsEqual(info, expected_ptr, actual_ptr)) {
+      fprintf(stderr, "\n\n");
+      PrintArray(info, "expect", expected_array, N, i);
+      PrintArray(info, "actual", actual_array, N, i);
+
+      PrintMismatchAndAbort(info, expected_ptr, actual_ptr, target_name,
+                            filename, line, i, N);
+    }
+  }
+}
+
+}  // namespace detail
+}  // namespace hwy
diff --git a/third_party/highway/hwy/tests/test_util.h b/third_party/highway/hwy/tests/test_util.h
new file mode 100644
index 0000000000..558d1bcfba
--- /dev/null
+++ b/third_party/highway/hwy/tests/test_util.h
@@ -0,0 +1,173 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HWY_TESTS_TEST_UTIL_H_
+#define HWY_TESTS_TEST_UTIL_H_
+
+// Target-independent helper functions for use by *_test.cc.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <cmath>  // std::isnan
+#include <string>
+
+#include "hwy/aligned_allocator.h"
+#include "hwy/base.h"
+#include "hwy/highway.h"
+#include "hwy/highway_export.h"
+#include "hwy/print.h"
+
+namespace hwy {
+
+// The maximum vector size used in tests when defining test data. DEPRECATED.
+constexpr size_t kTestMaxVectorSize = 64;
+
+// 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937,
+// which triggers a compiler bug.
+// The sequence depends only on the seed, so the default seed yields the same
+// values on every run.
+class RandomState {
+ public:
+  // Derives both state words from `seed` via SplitMix64; the second word is
+  // computed from the first.
+  explicit RandomState(const uint64_t seed = 0x123456789ull) {
+    s0_ = SplitMix64(seed + 0x9E3779B97F4A7C15ull);
+    s1_ = SplitMix64(s0_);
+  }
+
+  // Returns the sum of the two state words as they were on entry, then
+  // advances the state.
+  HWY_INLINE uint64_t operator()() {
+    uint64_t s1 = s0_;
+    const uint64_t s0 = s1_;
+    const uint64_t bits = s1 + s0;
+    s0_ = s0;
+    s1 ^= s1 << 23;
+    s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
+    s1_ = s1;
+    return bits;
+  }
+
+ private:
+  // Bit-mixing function used only to initialize the state from the seed.
+  static uint64_t SplitMix64(uint64_t z) {
+    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
+    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
+    return z ^ (z >> 31);
+  }
+
+  uint64_t s0_;
+  uint64_t s1_;
+};
+
+// Returns the lower 32 bits of the next 64-bit output of `rng`.
+static HWY_INLINE uint32_t Random32(RandomState* rng) {
+  return static_cast<uint32_t>((*rng)());
+}
+
+// Returns the next 64-bit output of `rng`.
+static HWY_INLINE uint64_t Random64(RandomState* rng) { return (*rng)(); }
+
+// Prevents the compiler from eliding the computations that led to "output".
+// Works by indicating to the compiler that "output" is being read and modified.
+// The +r constraint avoids unnecessary writes to memory, but only works for
+// built-in types.
+template <class T>
+inline void PreventElision(T&& output) {
+#if HWY_COMPILER_MSVC
+  // No inline asm on this path; this only marks `output` as used.
+  (void)output;
+#else   // HWY_COMPILER_MSVC
+  asm volatile("" : "+r"(output) : : "memory");
+#endif  // HWY_COMPILER_MSVC
+}
+
+// Returns whether `size` bytes at `p1` and `p2` are equal. `pos` is optional
+// (defaults to nullptr).
+HWY_TEST_DLLEXPORT bool BytesEqual(const void* p1, const void* p2,
+                                   const size_t size, size_t* pos = nullptr);
+
+// Checks that two C strings are equal; `target_name`, `filename` and `line`
+// identify the call site.
+void AssertStringEqual(const char* expected, const char* actual,
+                       const char* target_name, const char* filename, int line);
+
+namespace detail {
+
+// Returns the absolute difference between the bit patterns of `expected` and
+// `actual`, interpreted as unsigned integers of type TU; 0 if they compare
+// equal or are both NaN.
+template <typename T, typename TU = MakeUnsigned<T>>
+TU ComputeUlpDelta(const T expected, const T actual) {
+  // Handle -0 == 0 and infinities.
+  if (expected == actual) return 0;
+
+  // Consider "equal" if both are NaN, so we can verify an expected NaN.
+  // Needs a special case because there are many possible NaN representations.
+  if (std::isnan(expected) && std::isnan(actual)) return 0;
+
+  // Compute the difference in units of last place. We do not need to check for
+  // differing signs; they will result in large differences, which is fine.
+  TU ux, uy;
+  CopySameSize(&expected, &ux);
+  CopySameSize(&actual, &uy);
+
+  // Avoid unsigned->signed cast: 2's complement is only guaranteed by C++20.
+  const TU ulp = HWY_MAX(ux, uy) - HWY_MIN(ux, uy);
+  return ulp;
+}
+
+// Type-erased comparison of two values described by `info`.
+HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr,
+                                const void* actual_ptr);
+
+// Does not return (HWY_NORETURN). `lane` and `num_lanes` default to 0 and 1.
+HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort(
+    const TypeInfo& info, const void* expected_ptr, const void* actual_ptr,
+    const char* target_name, const char* filename, int line, size_t lane = 0,
+    size_t num_lanes = 1);
+
+// Type-erased comparison of two arrays of N elements described by `info`.
+HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info,
+                                         const void* expected_void,
+                                         const void* actual_void, size_t N,
+                                         const char* target_name,
+                                         const char* filename, int line);
+
+}  // namespace detail
+
+// Returns a name for the vector/part/scalar. The type prefix is u/i/f for
+// unsigned/signed/floating point, followed by the number of bits per lane;
+// then 'x' followed by the number of lanes. Example: u8x16. This is useful for
+// understanding which instantiation of a generic test failed.
+template <typename T>
+std::string TypeName(T /*unused*/, size_t N) {
+  // detail::TypeName writes into this buffer, which is then copied into the
+  // returned std::string.
+  char string100[100];
+  detail::TypeName(detail::MakeTypeInfo<T>(), N, string100);
+  return string100;
+}
+
+// Compare non-vector, non-string T.
+template <typename T>
+HWY_INLINE bool IsEqual(const T expected, const T actual) {
+  const auto info = detail::MakeTypeInfo<T>();
+  return detail::IsEqual(info, &expected, &actual);
+}
+
+// Calls detail::PrintMismatchAndAbort if `expected` and `actual` are not equal
+// according to detail::IsEqual. num_lanes is not passed, so its default of 1
+// applies.
+template <typename T>
+HWY_INLINE void AssertEqual(const T expected, const T actual,
+                            const char* target_name, const char* filename,
+                            int line, size_t lane = 0) {
+  const auto info = detail::MakeTypeInfo<T>();
+  if (!detail::IsEqual(info, &expected, &actual)) {
+    detail::PrintMismatchAndAbort(info, &expected, &actual, target_name,
+                                  filename, line, lane);
+  }
+}
+
+// Typed wrapper: forwards to detail::AssertArrayEqual with the TypeInfo of T.
+template <typename T>
+HWY_INLINE void AssertArrayEqual(const T* expected, const T* actual,
+                                 size_t count, const char* target_name,
+                                 const char* filename, int line) {
+  const auto info = hwy::detail::MakeTypeInfo<T>();
+  detail::AssertArrayEqual(info, expected, actual, count, target_name, filename,
+                           line);
+}
+
+}  // namespace hwy
+
+#endif  // HWY_TESTS_TEST_UTIL_H_
diff --git a/third_party/highway/hwy/tests/test_util_test.cc b/third_party/highway/hwy/tests/test_util_test.cc
new file mode 100644
index 0000000000..1911467c34
--- /dev/null
+++ b/third_party/highway/hwy/tests/test_util_test.cc
@@ -0,0 +1,107 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>  // snprintf
+
+#include <string>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/test_util_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Verifies that TypeName(t, N) matches an independently built string: a
+// u/i/f prefix, the lane width in bits and, unless N == 1, 'x' followed by
+// the lane count.
+struct TestName {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+    char num[10];
+    std::string expected = IsFloat<T>() ? "f" : (IsSigned<T>() ? "i" : "u");
+    snprintf(num, sizeof(num), "%u", static_cast<unsigned>(sizeof(T) * 8));
+    expected += num;
+
+    const size_t N = Lanes(d);
+    // Single-lane names carry no "x<lanes>" suffix.
+    if (N != 1) {
+      expected += 'x';
+      snprintf(num, sizeof(num), "%u", static_cast<unsigned>(N));
+      expected += num;
+    }
+    const std::string actual = TypeName(t, N);
+    if (expected != actual) {
+      HWY_ABORT("%s mismatch: expected '%s', got '%s'.\n",
+                hwy::TargetName(HWY_TARGET), expected.c_str(), actual.c_str());
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllName() { ForAllTypes(ForPartialVectors<TestName>()); }
+
+// Checks HWY_ASSERT_EQ / IsEqual on integer values that must compare equal
+// and on pairs that must compare unequal.
+struct TestEqualInteger {
+  template <class T>
+  HWY_NOINLINE void operator()(T /*t*/) const {
+    HWY_ASSERT_EQ(T(0), T(0));
+    HWY_ASSERT_EQ(T(1), T(1));
+    HWY_ASSERT_EQ(T(-1), T(-1));
+    HWY_ASSERT_EQ(LimitsMin<T>(), LimitsMin<T>());
+
+    HWY_ASSERT(!IsEqual(T(0), T(1)));
+    HWY_ASSERT(!IsEqual(T(1), T(0)));
+    HWY_ASSERT(!IsEqual(T(1), T(-1)));
+    HWY_ASSERT(!IsEqual(T(-1), T(1)));
+    HWY_ASSERT(!IsEqual(LimitsMin<T>(), LimitsMax<T>()));
+    HWY_ASSERT(!IsEqual(LimitsMax<T>(), LimitsMin<T>()));
+  }
+};
+
+// Same checks for floating-point values, using IsEqual throughout.
+struct TestEqualFloat {
+  template <class T>
+  HWY_NOINLINE void operator()(T /*t*/) const {
+    HWY_ASSERT(IsEqual(T(0), T(0)));
+    HWY_ASSERT(IsEqual(T(1), T(1)));
+    HWY_ASSERT(IsEqual(T(-1), T(-1)));
+    HWY_ASSERT(IsEqual(MantissaEnd<T>(), MantissaEnd<T>()));
+
+    HWY_ASSERT(!IsEqual(T(0), T(1)));
+    HWY_ASSERT(!IsEqual(T(1), T(0)));
+    HWY_ASSERT(!IsEqual(T(1), T(-1)));
+    HWY_ASSERT(!IsEqual(T(-1), T(1)));
+    HWY_ASSERT(!IsEqual(LowestValue<T>(), HighestValue<T>()));
+    HWY_ASSERT(!IsEqual(HighestValue<T>(), LowestValue<T>()));
+  }
+};
+
+HWY_NOINLINE void TestAllEqual() {
+  ForIntegerTypes(TestEqualInteger());
+  ForFloatTypes(TestEqualFloat());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(TestUtilTest);
+HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllName);
+HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllEqual);
+}  // namespace hwy
+
+#endif |