// Copyright 2019 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "tests/reduction_test.cc" #include "hwy/foreach_target.h" // IWYU pragma: keep #include "hwy/highway.h" #include "hwy/tests/test_util-inl.h" HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { struct TestSumOfLanes { template () || ((N & 1) != 0)>* = nullptr> HWY_NOINLINE void SignedEvenLengthVectorTests(Simd) { // do nothing } template () && ((N & 1) == 0)>* = nullptr> HWY_NOINLINE void SignedEvenLengthVectorTests(Simd d) { const T pairs = static_cast(Lanes(d) / 2); // Lanes are the repeated sequence -2, 1, [...]; each pair sums to -1, // so the eventual total is just -(N/2). Vec v = InterleaveLower(Set(d, static_cast(-2)), Set(d, T{1})); HWY_ASSERT_VEC_EQ(d, Set(d, static_cast(-pairs)), SumOfLanes(d, v)); HWY_ASSERT_EQ(static_cast(-pairs), ReduceSum(d, v)); // Similar test with a positive result. v = InterleaveLower(Set(d, static_cast(-2)), Set(d, T{4})); HWY_ASSERT_VEC_EQ(d, Set(d, static_cast(pairs * 2)), SumOfLanes(d, v)); HWY_ASSERT_EQ(static_cast(pairs * 2), ReduceSum(d, v)); } template HWY_NOINLINE void operator()(T /*unused*/, D d) { const size_t N = Lanes(d); auto in_lanes = AllocateAligned(N); HWY_ASSERT(in_lanes); // Lane i = bit i, higher lanes 0 double sum = 0.0; // Avoid setting sign bit and cap at double precision constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51); for (size_t i = 0; i < N; ++i) { in_lanes[i] = i < kBits ? static_cast(1ull << i) : static_cast(0); sum += static_cast(in_lanes[i]); } HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(d, Load(d, in_lanes.get()))); HWY_ASSERT_EQ(T(sum), ReduceSum(d, Load(d, in_lanes.get()))); // Lane i = i (iota) to include upper lanes sum = 0.0; for (size_t i = 0; i < N; ++i) { sum += static_cast(i); } HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(d, Iota(d, 0))); HWY_ASSERT_EQ(T(sum), ReduceSum(d, Iota(d, 0))); // Run more tests only for signed types with even vector lengths. Some of // this code may not otherwise compile, so put it in a templated function. SignedEvenLengthVectorTests(d); } }; HWY_NOINLINE void TestAllSumOfLanes() { ForUIF3264(ForPartialVectors()); ForUI16(ForPartialVectors()); // UI8 is only implemented for some targets. #if HWY_MAX_BYTES == 16 && (HWY_ARCH_ARM || HWY_ARCH_X86) ForUI8(ForGEVectors<64, TestSumOfLanes>()); #endif } struct TestMinOfLanes { template HWY_NOINLINE void operator()(T /*unused*/, D d) { const size_t N = Lanes(d); auto in_lanes = AllocateAligned(N); // Lane i = bit i, higher lanes = 2 (not the minimum) T min = HighestValue(); // Avoid setting sign bit and cap at double precision constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51); for (size_t i = 0; i < N; ++i) { in_lanes[i] = i < kBits ? static_cast(1ull << i) : static_cast(2); min = HWY_MIN(min, in_lanes[i]); } HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get()))); // Lane i = N - i to include upper lanes min = HighestValue(); for (size_t i = 0; i < N; ++i) { in_lanes[i] = static_cast(N - i); // no 8-bit T so no wraparound min = HWY_MIN(min, in_lanes[i]); } HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get()))); // Bug #910: also check negative values min = HighestValue(); const T input_copy[] = {static_cast(-1), static_cast(-2), 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}; size_t i = 0; for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) { in_lanes[i] = input_copy[i]; min = HWY_MIN(min, input_copy[i]); } // Pad with neutral element to full vector (so we can load) for (; i < N; ++i) { in_lanes[i] = min; } HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get()))); } }; struct TestMaxOfLanes { template HWY_NOINLINE void operator()(T /*unused*/, D d) { const size_t N = Lanes(d); auto in_lanes = AllocateAligned(N); T max = LowestValue(); // Avoid setting sign bit and cap at double precision constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51); for (size_t i = 0; i < N; ++i) { in_lanes[i] = i < kBits ? static_cast(1ull << i) : static_cast(0); max = HWY_MAX(max, in_lanes[i]); } HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get()))); // Lane i = i to include upper lanes max = LowestValue(); for (size_t i = 0; i < N; ++i) { in_lanes[i] = static_cast(i); // no 8-bit T so no wraparound max = HWY_MAX(max, in_lanes[i]); } HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get()))); // Bug #910: also check negative values max = LowestValue(); const T input_copy[] = {static_cast(-1), static_cast(-2), 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}; size_t i = 0; for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) { in_lanes[i] = input_copy[i]; max = HWY_MAX(max, in_lanes[i]); } // Pad with neutral element to full vector (so we can load) for (; i < N; ++i) { in_lanes[i] = max; } HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get()))); } }; HWY_NOINLINE void TestAllMinMaxOfLanes() { const ForPartialVectors test_min; const ForPartialVectors test_max; ForUIF3264(test_min); ForUIF3264(test_max); ForUI16(test_min); ForUI16(test_max); // UI8 is only implemented for some targets. #if HWY_MAX_BYTES == 16 && (HWY_ARCH_ARM || HWY_ARCH_X86) ForUI8(ForGEVectors<64, TestMinOfLanes>()); ForUI8(ForGEVectors<64, TestMaxOfLanes>()); #endif } struct TestSumsOf8 { template HWY_NOINLINE void operator()(T /*unused*/, D d) { RandomState rng; const size_t N = Lanes(d); if (N < 8) return; const Repartition du64; auto in_lanes = AllocateAligned(N); auto sum_lanes = AllocateAligned(N / 8); for (size_t rep = 0; rep < 100; ++rep) { for (size_t i = 0; i < N; ++i) { in_lanes[i] = Random64(&rng) & 0xFF; } for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) { uint64_t sum = 0; for (size_t i = 0; i < 8; ++i) { sum += in_lanes[idx_sum * 8 + i]; } sum_lanes[idx_sum] = sum; } const Vec in = Load(d, in_lanes.get()); HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8(in)); } } }; HWY_NOINLINE void TestAllSumsOf8() { ForGEVectors<64, TestSumsOf8>()(uint8_t()); } struct TestSumsOf8AbsDiff { template HWY_NOINLINE void operator()(T /*unused*/, D d) { RandomState rng; const size_t N = Lanes(d); if (N < 8) return; const Repartition du64; auto in_lanes_a = AllocateAligned(N); auto in_lanes_b = AllocateAligned(N); auto sum_lanes = AllocateAligned(N / 8); for (size_t rep = 0; rep < 100; ++rep) { for (size_t i = 0; i < N; ++i) { uint64_t rand64_val = Random64(&rng); in_lanes_a[i] = rand64_val & 0xFF; in_lanes_b[i] = (rand64_val >> 8) & 0xFF; } for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) { uint64_t sum = 0; for (size_t i = 0; i < 8; ++i) { const auto lane_diff = static_cast(in_lanes_a[idx_sum * 8 + i]) - static_cast(in_lanes_b[idx_sum * 8 + i]); sum += static_cast((lane_diff >= 0) ? lane_diff : -lane_diff); } sum_lanes[idx_sum] = sum; } const Vec a = Load(d, in_lanes_a.get()); const Vec b = Load(d, in_lanes_b.get()); HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8AbsDiff(a, b)); } } }; HWY_NOINLINE void TestAllSumsOf8AbsDiff() { ForGEVectors<64, TestSumsOf8AbsDiff>()(uint8_t()); } // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy HWY_AFTER_NAMESPACE(); #if HWY_ONCE namespace hwy { HWY_BEFORE_TEST(HwyReductionTest); HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumOfLanes); HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMinMaxOfLanes); HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf8); HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf8AbsDiff); } // namespace hwy #endif