Diffstat (limited to 'third_party/highway/hwy/tests/blockwise_test.cc')
-rw-r--r-- | third_party/highway/hwy/tests/blockwise_test.cc | 463 |
1 files changed, 463 insertions, 0 deletions
diff --git a/third_party/highway/hwy/tests/blockwise_test.cc b/third_party/highway/hwy/tests/blockwise_test.cc
new file mode 100644
index 0000000000..5932984482
--- /dev/null
+++ b/third_party/highway/hwy/tests/blockwise_test.cc
@@ -0,0 +1,463 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+
+#include <algorithm>  // std::fill
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/blockwise_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <typename D, int kLane>
+struct TestBroadcastR {
+  HWY_NOINLINE void operator()() const {
+    using T = typename D::T;
+    const D d;
+    const size_t N = Lanes(d);
+    if (kLane >= N) return;
+    auto in_lanes = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<T>(N);
+    HWY_ASSERT(in_lanes && expected);
+    std::fill(in_lanes.get(), in_lanes.get() + N, T(0));
+    const size_t blockN = HWY_MIN(N * sizeof(T), 16) / sizeof(T);
+    // Need to set within each 128-bit block
+    for (size_t block = 0; block < N; block += blockN) {
+      in_lanes[block + kLane] = static_cast<T>(block + 1);
+    }
+    const auto in = Load(d, in_lanes.get());
+    for (size_t block = 0; block < N; block += blockN) {
+      for (size_t i = 0; i < blockN; ++i) {
+        expected[block + i] = T(block + 1);
+      }
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Broadcast<kLane>(in));
+
+    TestBroadcastR<D, kLane - 1>()();
+  }
+};
+
+template <class D>
+struct TestBroadcastR<D, -1> {
+  void operator()() const {}
+};
+
+struct TestBroadcast {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    TestBroadcastR<D, HWY_MIN(MaxLanes(d), 16 / sizeof(T)) - 1>()();
+  }
+};
+
+HWY_NOINLINE void TestAllBroadcast() {
+  const ForPartialVectors<TestBroadcast> test;
+  // No u/i8.
+  test(uint16_t());
+  test(int16_t());
+  ForUIF3264(test);
+}
+
+template <bool kFull>
+struct ChooseTableSize {
+  template <typename T, typename DIdx>
+  using type = DIdx;
+};
+template <>
+struct ChooseTableSize<true> {
+  template <typename T, typename DIdx>
+  using type = ScalableTag<T>;
+};
+
+template <bool kFull>
+struct TestTableLookupBytes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    RandomState rng;
+
+    const typename ChooseTableSize<kFull>::template type<T, D> d_tbl;
+    const Repartition<uint8_t, decltype(d_tbl)> d_tbl8;
+    const Repartition<uint8_t, D> d8;
+    const size_t N = Lanes(d);
+    const size_t NT8 = Lanes(d_tbl8);
+    const size_t N8 = Lanes(d8);
+
+    auto in_bytes = AllocateAligned<uint8_t>(NT8);
+    auto index_bytes = AllocateAligned<uint8_t>(N8);
+    auto expected = AllocateAligned<T>(N);
+    HWY_ASSERT(in_bytes && index_bytes && expected);
+
+    // Random input bytes
+    for (size_t i = 0; i < NT8; ++i) {
+      in_bytes[i] = Random32(&rng) & 0xFF;
+    }
+    const auto in = BitCast(d_tbl, Load(d_tbl8, in_bytes.get()));
+
+    // Enough test data; for larger vectors, upper lanes will be zero.
+    const uint8_t index_bytes_source[64] = {
+        // Same index as source, multiple outputs from same input,
+        // unused input (9), ascending/descending and nonconsecutive neighbors.
+        0,  2,  1, 2, 15, 12, 13, 14, 6,  7,  8,  5,  4,  3,  10, 11,
+        11, 10, 3, 4, 5,  8,  7,  6,  14, 13, 12, 15, 2,  1,  2,  0,
+        4,  3,  2, 2, 5,  6,  7,  7,  15, 15, 15, 15, 15, 15, 0,  1};
+    const size_t max_index = HWY_MIN(NT8, 16) - 1;
+    for (size_t i = 0; i < N8; ++i) {
+      index_bytes[i] = (i < 64) ? index_bytes_source[i] : 0;
+      // Avoid asan error for partial vectors.
+      index_bytes[i] = static_cast<uint8_t>(HWY_MIN(index_bytes[i], max_index));
+    }
+    const auto indices = Load(d, reinterpret_cast<const T*>(index_bytes.get()));
+
+    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
+
+    for (size_t block = 0; block < N8; block += 16) {
+      for (size_t i = 0; i < 16 && (block + i) < N8; ++i) {
+        const uint8_t index = index_bytes[block + i];
+        HWY_ASSERT(index <= max_index);
+        // Note that block + index may exceed NT8 on RVV, which is fine because
+        // the operation uses the larger of the table and index vector size.
+        HWY_ASSERT(block + index < HWY_MAX(N8, NT8));
+        // For large vectors, the lane index may wrap around due to block,
+        // also wrap around after 8-bit overflow.
+        expected_bytes[block + i] =
+            in_bytes[(block + index) % HWY_MIN(NT8, 256)];
+      }
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytes(in, indices));
+
+    // Individually test zeroing each byte position.
+    for (size_t i = 0; i < N8; ++i) {
+      const uint8_t prev_expected = expected_bytes[i];
+      const uint8_t prev_index = index_bytes[i];
+      expected_bytes[i] = 0;
+
+      const int idx = 0x80 + (static_cast<int>(Random32(&rng) & 7) << 4);
+      HWY_ASSERT(0x80 <= idx && idx < 256);
+      index_bytes[i] = static_cast<uint8_t>(idx);
+
+      const auto indices =
+          Load(d, reinterpret_cast<const T*>(index_bytes.get()));
+      HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytesOr0(in, indices));
+      expected_bytes[i] = prev_expected;
+      index_bytes[i] = prev_index;
+    }
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllTableLookupBytesSame() {
+  // Partial index, same-sized table.
+  ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<false>>());
+}
+
+HWY_NOINLINE void TestAllTableLookupBytesMixed() {
+  // Partial index, full-size table.
+  ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<true>>());
+}
+
+struct TestInterleaveLower {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using TU = MakeUnsigned<T>;
+    const size_t N = Lanes(d);
+    auto even_lanes = AllocateAligned<T>(N);
+    auto odd_lanes = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<T>(N);
+    HWY_ASSERT(even_lanes && odd_lanes && expected);
+    for (size_t i = 0; i < N; ++i) {
+      even_lanes[i] = static_cast<T>(2 * i + 0);
+      odd_lanes[i] = static_cast<T>(2 * i + 1);
+    }
+    const auto even = Load(d, even_lanes.get());
+    const auto odd = Load(d, odd_lanes.get());
+
+    const size_t blockN = HWY_MIN(16 / sizeof(T), N);
+    for (size_t i = 0; i < Lanes(d); ++i) {
+      const size_t block = i / blockN;
+      const size_t index = (i % blockN) + block * 2 * blockN;
+      expected[i] = static_cast<T>(index & LimitsMax<TU>());
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(even, odd));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(d, even, odd));
+  }
+};
+
+struct TestInterleaveUpper {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    if (N == 1) return;
+    auto even_lanes = AllocateAligned<T>(N);
+    auto odd_lanes = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<T>(N);
+    HWY_ASSERT(even_lanes && odd_lanes && expected);
+    for (size_t i = 0; i < N; ++i) {
+      even_lanes[i] = static_cast<T>(2 * i + 0);
+      odd_lanes[i] = static_cast<T>(2 * i + 1);
+    }
+    const auto even = Load(d, even_lanes.get());
+    const auto odd = Load(d, odd_lanes.get());
+
+    const size_t blockN = HWY_MIN(16 / sizeof(T), N);
+    for (size_t i = 0; i < Lanes(d); ++i) {
+      const size_t block = i / blockN;
+      expected[i] = T((i % blockN) + block * 2 * blockN + blockN);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveUpper(d, even, odd));
+  }
+};
+
+HWY_NOINLINE void TestAllInterleave() {
+  // Not DemoteVectors because this cannot be supported by HWY_SCALAR.
+  ForAllTypes(ForShrinkableVectors<TestInterleaveLower>());
+  ForAllTypes(ForShrinkableVectors<TestInterleaveUpper>());
+}
+
+struct TestZipLower {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using WideT = MakeWide<T>;
+    static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
+    static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
+    const size_t N = Lanes(d);
+    auto even_lanes = AllocateAligned<T>(N);
+    auto odd_lanes = AllocateAligned<T>(N);
+    // At least 2 lanes for HWY_SCALAR
+    auto zip_lanes = AllocateAligned<T>(HWY_MAX(N, 2));
+    HWY_ASSERT(even_lanes && odd_lanes && zip_lanes);
+    const T kMaxT = LimitsMax<T>();
+    for (size_t i = 0; i < N; ++i) {
+      even_lanes[i] = static_cast<T>((2 * i + 0) & kMaxT);
+      odd_lanes[i] = static_cast<T>((2 * i + 1) & kMaxT);
+    }
+    const auto even = Load(d, even_lanes.get());
+    const auto odd = Load(d, odd_lanes.get());
+
+    const Repartition<WideT, D> dw;
+#if HWY_TARGET == HWY_SCALAR
+    // Safely handle big-endian
+    const auto expected = Set(dw, static_cast<WideT>(1ULL << (sizeof(T) * 8)));
+#else
+    const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
+    for (size_t i = 0; i < N; i += 2) {
+      const size_t base = (i / blockN) * blockN;
+      const size_t mod = i % blockN;
+      zip_lanes[i + 0] = even_lanes[mod / 2 + base];
+      zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
+      // Without this, `expected` is incorrect with Clang and 512-bit SVE: the
+      // first byte of the second block is 0x10 instead of 0x20 as it should be.
+      PreventElision(zip_lanes[i + 0]);
+    }
+    const Vec<decltype(dw)> expected = BitCast(dw, Load(d, zip_lanes.get()));
+#endif  // HWY_TARGET == HWY_SCALAR
+    HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(even, odd));
+    HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(dw, even, odd));
+  }
+};
+
+HWY_NOINLINE void TestAllZipLower() {
+  const ForDemoteVectors<TestZipLower> lower_unsigned;
+  lower_unsigned(uint8_t());
+  lower_unsigned(uint16_t());
+#if HWY_HAVE_INTEGER64
+  lower_unsigned(uint32_t());  // generates u64
+#endif
+
+  const ForDemoteVectors<TestZipLower> lower_signed;
+  lower_signed(int8_t());
+  lower_signed(int16_t());
+#if HWY_HAVE_INTEGER64
+  lower_signed(int32_t());  // generates i64
+#endif
+
+  // No float - concatenating f32 does not result in a f64
+}
+
+// Remove this test (so it does not show as having run) if the only target is
+// HWY_SCALAR, which does not support this op.
+#if HWY_TARGETS != HWY_SCALAR
+
+struct TestZipUpper {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET == HWY_SCALAR
+    (void)d;
+#else
+    using WideT = MakeWide<T>;
+    static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
+    static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
+    const size_t N = Lanes(d);
+    if (N < 16 / sizeof(T)) return;
+    auto even_lanes = AllocateAligned<T>(N);
+    auto odd_lanes = AllocateAligned<T>(N);
+    auto zip_lanes = AllocateAligned<T>(N);
+    HWY_ASSERT(even_lanes && odd_lanes && zip_lanes);
+    const T kMaxT = LimitsMax<T>();
+    for (size_t i = 0; i < N; ++i) {
+      even_lanes[i] = static_cast<T>((2 * i + 0) & kMaxT);
+      odd_lanes[i] = static_cast<T>((2 * i + 1) & kMaxT);
+    }
+    const auto even = Load(d, even_lanes.get());
+    const auto odd = Load(d, odd_lanes.get());
+
+    const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
+
+    for (size_t i = 0; i < N; i += 2) {
+      const size_t base = (i / blockN) * blockN + blockN / 2;
+      const size_t mod = i % blockN;
+      zip_lanes[i + 0] = even_lanes[mod / 2 + base];
+      zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
+      // See comment at previous call to PreventElision.
+      PreventElision(zip_lanes[i + 0]);
+    }
+    const Repartition<WideT, D> dw;
+    const Vec<decltype(dw)> expected = BitCast(dw, Load(d, zip_lanes.get()));
+    HWY_ASSERT_VEC_EQ(dw, expected, ZipUpper(dw, even, odd));
+#endif  // HWY_TARGET == HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllZipUpper() {
+  const ForShrinkableVectors<TestZipUpper> upper_unsigned;
+  upper_unsigned(uint8_t());
+  upper_unsigned(uint16_t());
+#if HWY_HAVE_INTEGER64
+  upper_unsigned(uint32_t());  // generates u64
+#endif
+
+  const ForShrinkableVectors<TestZipUpper> upper_signed;
+  upper_signed(int8_t());
+  upper_signed(int16_t());
+#if HWY_HAVE_INTEGER64
+  upper_signed(int32_t());  // generates i64
+#endif
+
+  // No float - concatenating f32 does not result in a f64
+}
+
+#endif  // HWY_TARGETS != HWY_SCALAR
+
+class TestSpecialShuffle32 {
+ public:
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v = Iota(d, 0);
+    VerifyLanes32(d, Shuffle2301(v), 2, 3, 0, 1, __FILE__, __LINE__);
+    VerifyLanes32(d, Shuffle1032(v), 1, 0, 3, 2, __FILE__, __LINE__);
+    VerifyLanes32(d, Shuffle0321(v), 0, 3, 2, 1, __FILE__, __LINE__);
+    VerifyLanes32(d, Shuffle2103(v), 2, 1, 0, 3, __FILE__, __LINE__);
+    VerifyLanes32(d, Shuffle0123(v), 0, 1, 2, 3, __FILE__, __LINE__);
+  }
+
+ private:
+  // HWY_INLINE works around a Clang SVE compiler bug where all but the first
+  // 128 bits (the NEON register) of actual are zero.
+  template <class D, class V>
+  HWY_INLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3,
+                                const size_t i2, const size_t i1,
+                                const size_t i0, const char* filename,
+                                const int line) {
+    using T = TFromD<D>;
+    constexpr size_t kBlockN = 16 / sizeof(T);
+    const size_t N = Lanes(d);
+    if (N < 4) return;
+    auto expected = AllocateAligned<T>(N);
+    HWY_ASSERT(expected);
+    for (size_t block = 0; block < N; block += kBlockN) {
+      expected[block + 3] = static_cast<T>(block + i3);
+      expected[block + 2] = static_cast<T>(block + i2);
+      expected[block + 1] = static_cast<T>(block + i1);
+      expected[block + 0] = static_cast<T>(block + i0);
+    }
+    AssertVecEqual(d, expected.get(), actual, filename, line);
+  }
+};
+
+class TestSpecialShuffle64 {
+ public:
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v = Iota(d, 0);
+    VerifyLanes64(d, Shuffle01(v), 0, 1, __FILE__, __LINE__);
+  }
+
+ private:
+  // HWY_INLINE works around a Clang SVE compiler bug where all but the first
+  // 128 bits (the NEON register) of actual are zero.
+  template <class D, class V>
+  HWY_INLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1,
+                                const size_t i0, const char* filename,
+                                const int line) {
+    using T = TFromD<D>;
+    constexpr size_t kBlockN = 16 / sizeof(T);
+    const size_t N = Lanes(d);
+    if (N < 2) return;
+    auto expected = AllocateAligned<T>(N);
+    HWY_ASSERT(expected);
+    for (size_t block = 0; block < N; block += kBlockN) {
+      expected[block + 1] = static_cast<T>(block + i1);
+      expected[block + 0] = static_cast<T>(block + i0);
+    }
+    AssertVecEqual(d, expected.get(), actual, filename, line);
+  }
+};
+
+HWY_NOINLINE void TestAllSpecialShuffles() {
+  const ForGEVectors<128, TestSpecialShuffle32> test32;
+  test32(uint32_t());
+  test32(int32_t());
+  test32(float());
+
+#if HWY_HAVE_INTEGER64
+  const ForGEVectors<128, TestSpecialShuffle64> test64;
+  test64(uint64_t());
+  test64(int64_t());
+#endif
+
+#if HWY_HAVE_FLOAT64
+  const ForGEVectors<128, TestSpecialShuffle64> test_d;
+  test_d(double());
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyBlockwiseTest);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllBroadcast);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesSame);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesMixed);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllInterleave);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipLower);
+#if HWY_TARGETS != HWY_SCALAR
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipUpper);
+#endif
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllSpecialShuffles);
+}  // namespace hwy
+
+#endif