summaryrefslogtreecommitdiffstats
path: root/third_party/highway/hwy/contrib/algo
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 00:47:55 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 00:47:55 +0000
commit26a029d407be480d791972afb5975cf62c9360a6 (patch)
treef435a8308119effd964b339f76abb83a57c29483 /third_party/highway/hwy/contrib/algo
parentInitial commit. (diff)
downloadfirefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/highway/hwy/contrib/algo')
-rw-r--r--third_party/highway/hwy/contrib/algo/copy-inl.h136
-rw-r--r--third_party/highway/hwy/contrib/algo/copy_test.cc204
-rw-r--r--third_party/highway/hwy/contrib/algo/find-inl.h109
-rw-r--r--third_party/highway/hwy/contrib/algo/find_test.cc223
-rw-r--r--third_party/highway/hwy/contrib/algo/transform-inl.h262
-rw-r--r--third_party/highway/hwy/contrib/algo/transform_test.cc381
6 files changed, 1315 insertions, 0 deletions
diff --git a/third_party/highway/hwy/contrib/algo/copy-inl.h b/third_party/highway/hwy/contrib/algo/copy-inl.h
new file mode 100644
index 0000000000..033cf8a626
--- /dev/null
+++ b/third_party/highway/hwy/contrib/algo/copy-inl.h
@@ -0,0 +1,136 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Per-target include guard
+#if defined(HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_) == \
+ defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
+#undef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
+#else
+#define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
+#endif
+
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// These functions avoid having to write a loop plus remainder handling in the
+// (unfortunately still common) case where arrays are not aligned/padded. If the
+// inputs are known to be aligned/padded, it is more efficient to write a single
+// loop using Load(). We do not provide a CopyAlignedPadded because it
+// would be more verbose than such a loop.
+
+// Fills `to`[0, `count`) with `value`.
+template <class D, typename T = TFromD<D>>
+void Fill(D d, T value, size_t count, T* HWY_RESTRICT to) {
+ const size_t N = Lanes(d);
+ const Vec<D> v = Set(d, value);
+
+ size_t idx = 0;
+ for (; idx + N <= count; idx += N) {
+ StoreU(v, d, to + idx);
+ }
+
+ // `count` was a multiple of the vector length `N`: already done.
+ if (HWY_UNLIKELY(idx == count)) return;
+
+ const size_t remaining = count - idx;
+ HWY_DASSERT(0 != remaining && remaining < N);
+ SafeFillN(remaining, value, d, to + idx);
+}
+
+// Copies `from`[0, `count`) to `to`, which must not overlap `from`.
+template <class D, typename T = TFromD<D>>
+void Copy(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to) {
+ const size_t N = Lanes(d);
+
+ size_t idx = 0;
+ for (; idx + N <= count; idx += N) {
+ const Vec<D> v = LoadU(d, from + idx);
+ StoreU(v, d, to + idx);
+ }
+
+ // `count` was a multiple of the vector length `N`: already done.
+ if (HWY_UNLIKELY(idx == count)) return;
+
+ const size_t remaining = count - idx;
+ HWY_DASSERT(0 != remaining && remaining < N);
+ SafeCopyN(remaining, d, from + idx, to + idx);
+}
+
+// For idx in [0, count) in ascending order, appends `from[idx]` to `to` if the
+// corresponding mask element of `func(d, v)` is true. Returns the STL-style end
+// of the newly written elements in `to`.
+//
+// `func` is either a functor with a templated operator()(d, v) returning a
+// mask, or a generic lambda if using C++14. Due to apparent limitations of
+// Clang on Windows, it is currently necessary to add HWY_ATTR before the
+// opening { of the lambda to avoid errors about "function .. requires target".
+//
+// NOTE: this is only supported for 16-, 32- or 64-bit types.
+// NOTE: Func may be called a second time for elements it has already seen, but
+// these elements will not be written to `to` again.
+template <class D, class Func, typename T = TFromD<D>>
+T* CopyIf(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to,
+ const Func& func) {
+ const size_t N = Lanes(d);
+
+ size_t idx = 0;
+ for (; idx + N <= count; idx += N) {
+ const Vec<D> v = LoadU(d, from + idx);
+ to += CompressBlendedStore(v, func(d, v), d, to);
+ }
+
+ // `count` was a multiple of the vector length `N`: already done.
+ if (HWY_UNLIKELY(idx == count)) return to;
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+ // Proceed one by one.
+ const CappedTag<T, 1> d1;
+ for (; idx < count; ++idx) {
+ using V1 = Vec<decltype(d1)>;
+ // Workaround for -Waggressive-loop-optimizations on GCC 8
+ // (iteration 2305843009213693951 invokes undefined behavior for T=i64)
+ const uintptr_t addr = reinterpret_cast<uintptr_t>(from);
+ const T* HWY_RESTRICT from_idx =
+ reinterpret_cast<const T * HWY_RESTRICT>(addr + (idx * sizeof(T)));
+ const V1 v = LoadU(d1, from_idx);
+ // Avoid storing to `to` unless we know it should be kept - otherwise, we
+ // might overrun the end if it was allocated for the exact count.
+ if (CountTrue(d1, func(d1, v)) == 0) continue;
+ StoreU(v, d1, to);
+ to += 1;
+ }
+#else
+ // Start index of the last unaligned whole vector, ending at the array end.
+ const size_t last = count - N;
+ // Number of elements before `from` or already written.
+ const size_t invalid = idx - last;
+ HWY_DASSERT(0 != invalid && invalid < N);
+ const Mask<D> mask = Not(FirstN(d, invalid));
+ const Vec<D> v = MaskedLoad(mask, d, from + last);
+ to += CompressBlendedStore(v, And(mask, func(d, v)), d, to);
+#endif
+ return to;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif // HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
diff --git a/third_party/highway/hwy/contrib/algo/copy_test.cc b/third_party/highway/hwy/contrib/algo/copy_test.cc
new file mode 100644
index 0000000000..c74f6e9bd7
--- /dev/null
+++ b/third_party/highway/hwy/contrib/algo/copy_test.cc
@@ -0,0 +1,204 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/aligned_allocator.h"
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/algo/copy_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/contrib/algo/copy-inl.h"
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+// If your project requires C++14 or later, you can ignore this and pass lambdas
+// directly to Transform, without requiring an lvalue as we do here for C++11.
+#if __cplusplus < 201402L
+#define HWY_GENERIC_LAMBDA 0
+#else
+#define HWY_GENERIC_LAMBDA 1
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Returns random integer in [0, 128), which fits in any lane type.
+template <typename T>
+T Random7Bit(RandomState& rng) {
+ return static_cast<T>(Random32(&rng) & 127);
+}
+
+// In C++14, we can instead define these as generic lambdas next to where they
+// are invoked.
+#if !HWY_GENERIC_LAMBDA
+
+struct IsOdd {
+ template <class D, class V>
+ Mask<D> operator()(D d, V v) const {
+ return TestBit(v, Set(d, TFromD<D>{1}));
+ }
+};
+
+#endif // !HWY_GENERIC_LAMBDA
+
+// Invokes Test (e.g. TestCopyIf) with all arg combinations. T comes from
+// ForFloatTypes.
+template <class Test>
+struct ForeachCountAndMisalign {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) const {
+ RandomState rng;
+ const size_t N = Lanes(d);
+ const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
+
+ for (size_t count = 0; count < 2 * N; ++count) {
+ for (size_t ma : misalignments) {
+ for (size_t mb : misalignments) {
+ Test()(d, count, ma, mb, rng);
+ }
+ }
+ }
+ }
+};
+
+struct TestFill {
+ template <class D>
+ void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
+ RandomState& rng) {
+ using T = TFromD<D>;
+ // HWY_MAX prevents error when misalign == count == 0.
+ AlignedFreeUniquePtr<T[]> pa =
+ AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+ AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(misalign_b + count + 1);
+ HWY_ASSERT(pa && pb);
+ T* expected = pa.get() + misalign_a;
+ const T value = Random7Bit<T>(rng);
+ for (size_t i = 0; i < count; ++i) {
+ expected[i] = value;
+ }
+ T* actual = pb.get() + misalign_b;
+
+ actual[count] = T{0}; // sentinel
+ Fill(d, value, count, actual);
+ HWY_ASSERT_EQ(T{0}, actual[count]); // did not write past end
+
+ const auto info = hwy::detail::MakeTypeInfo<T>();
+ const char* target_name = hwy::TargetName(HWY_TARGET);
+ hwy::detail::AssertArrayEqual(info, expected, actual, count, target_name,
+ __FILE__, __LINE__);
+ }
+};
+
+void TestAllFill() {
+ ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFill>>());
+}
+
+struct TestCopy {
+ template <class D>
+ void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
+ RandomState& rng) {
+ using T = TFromD<D>;
+ // Prevents error if size to allocate is zero.
+ AlignedFreeUniquePtr<T[]> pa =
+ AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+ AlignedFreeUniquePtr<T[]> pb =
+ AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
+ HWY_ASSERT(pa && pb);
+ T* a = pa.get() + misalign_a;
+ for (size_t i = 0; i < count; ++i) {
+ a[i] = Random7Bit<T>(rng);
+ }
+ T* b = pb.get() + misalign_b;
+
+ Copy(d, a, count, b);
+
+ const auto info = hwy::detail::MakeTypeInfo<T>();
+ const char* target_name = hwy::TargetName(HWY_TARGET);
+ hwy::detail::AssertArrayEqual(info, a, b, count, target_name, __FILE__,
+ __LINE__);
+ }
+};
+
+void TestAllCopy() {
+ ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestCopy>>());
+}
+
+struct TestCopyIf {
+ template <class D>
+ void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
+ RandomState& rng) {
+ using T = TFromD<D>;
+ const size_t padding = Lanes(ScalableTag<T>());
+
+ // Prevents error if size to allocate is zero.
+ AlignedFreeUniquePtr<T[]> pa =
+ AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+ AlignedFreeUniquePtr<T[]> pb =
+ AllocateAligned<T>(HWY_MAX(1, misalign_b + count + padding));
+ AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
+ HWY_ASSERT(pa && pb && expected);
+
+ T* a = pa.get() + misalign_a;
+ for (size_t i = 0; i < count; ++i) {
+ a[i] = Random7Bit<T>(rng);
+ }
+ T* b = pb.get() + misalign_b;
+
+ size_t num_odd = 0;
+ for (size_t i = 0; i < count; ++i) {
+ if (a[i] & 1) {
+ expected[num_odd++] = a[i];
+ }
+ }
+
+#if HWY_GENERIC_LAMBDA
+ const auto is_odd = [](const auto d, const auto v) HWY_ATTR {
+ return TestBit(v, Set(d, TFromD<decltype(d)>{1}));
+ };
+#else
+ const IsOdd is_odd;
+#endif
+ T* end = CopyIf(d, a, count, b, is_odd);
+ const size_t num_written = static_cast<size_t>(end - b);
+ HWY_ASSERT_EQ(num_odd, num_written);
+
+ const auto info = hwy::detail::MakeTypeInfo<T>();
+ const char* target_name = hwy::TargetName(HWY_TARGET);
+ hwy::detail::AssertArrayEqual(info, expected.get(), b, num_odd, target_name,
+ __FILE__, __LINE__);
+ }
+};
+
+void TestAllCopyIf() {
+ ForUI163264(ForPartialVectors<ForeachCountAndMisalign<TestCopyIf>>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(CopyTest);
+HWY_EXPORT_AND_TEST_P(CopyTest, TestAllFill);
+HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopy);
+HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopyIf);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/contrib/algo/find-inl.h b/third_party/highway/hwy/contrib/algo/find-inl.h
new file mode 100644
index 0000000000..388842e988
--- /dev/null
+++ b/third_party/highway/hwy/contrib/algo/find-inl.h
@@ -0,0 +1,109 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Per-target include guard
+#if defined(HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_) == \
+ defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
+#undef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
+#else
+#define HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
+#endif
+
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Returns index of the first element equal to `value` in `in[0, count)`, or
+// `count` if not found.
+template <class D, typename T = TFromD<D>>
+size_t Find(D d, T value, const T* HWY_RESTRICT in, size_t count) {
+ const size_t N = Lanes(d);
+ const Vec<D> broadcasted = Set(d, value);
+
+ size_t i = 0;
+ for (; i + N <= count; i += N) {
+ const intptr_t pos = FindFirstTrue(d, Eq(broadcasted, LoadU(d, in + i)));
+ if (pos >= 0) return i + static_cast<size_t>(pos);
+ }
+
+ if (i != count) {
+#if HWY_MEM_OPS_MIGHT_FAULT
+ // Scan single elements.
+ const CappedTag<T, 1> d1;
+ using V1 = Vec<decltype(d1)>;
+ const V1 broadcasted1 = Set(d1, GetLane(broadcasted));
+ for (; i < count; ++i) {
+ if (AllTrue(d1, Eq(broadcasted1, LoadU(d1, in + i)))) {
+ return i;
+ }
+ }
+#else
+ const size_t remaining = count - i;
+ HWY_DASSERT(0 != remaining && remaining < N);
+ const Mask<D> mask = FirstN(d, remaining);
+ const Vec<D> v = MaskedLoad(mask, d, in + i);
+ // Apply mask so that we don't 'find' the zero-padding from MaskedLoad.
+ const intptr_t pos = FindFirstTrue(d, And(Eq(broadcasted, v), mask));
+ if (pos >= 0) return i + static_cast<size_t>(pos);
+#endif // HWY_MEM_OPS_MIGHT_FAULT
+ }
+
+ return count; // not found
+}
+
+// Returns index of the first element in `in[0, count)` for which `func(d, vec)`
+// returns true, otherwise `count`.
+template <class D, class Func, typename T = TFromD<D>>
+size_t FindIf(D d, const T* HWY_RESTRICT in, size_t count, const Func& func) {
+ const size_t N = Lanes(d);
+
+ size_t i = 0;
+ for (; i + N <= count; i += N) {
+ const intptr_t pos = FindFirstTrue(d, func(d, LoadU(d, in + i)));
+ if (pos >= 0) return i + static_cast<size_t>(pos);
+ }
+
+ if (i != count) {
+#if HWY_MEM_OPS_MIGHT_FAULT
+ // Scan single elements.
+ const CappedTag<T, 1> d1;
+ for (; i < count; ++i) {
+ if (AllTrue(d1, func(d1, LoadU(d1, in + i)))) {
+ return i;
+ }
+ }
+#else
+ const size_t remaining = count - i;
+ HWY_DASSERT(0 != remaining && remaining < N);
+ const Mask<D> mask = FirstN(d, remaining);
+ const Vec<D> v = MaskedLoad(mask, d, in + i);
+ // Apply mask so that we don't 'find' the zero-padding from MaskedLoad.
+ const intptr_t pos = FindFirstTrue(d, And(func(d, v), mask));
+ if (pos >= 0) return i + static_cast<size_t>(pos);
+#endif // HWY_MEM_OPS_MIGHT_FAULT
+ }
+
+ return count; // not found
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif // HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
diff --git a/third_party/highway/hwy/contrib/algo/find_test.cc b/third_party/highway/hwy/contrib/algo/find_test.cc
new file mode 100644
index 0000000000..96f1fd640c
--- /dev/null
+++ b/third_party/highway/hwy/contrib/algo/find_test.cc
@@ -0,0 +1,223 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+
+#include <algorithm> // std::find_if
+#include <vector>
+
+#include "hwy/aligned_allocator.h"
+#include "hwy/base.h"
+#include "hwy/print.h"
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/algo/find_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/contrib/algo/find-inl.h"
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+// If your project requires C++14 or later, you can ignore this and pass lambdas
+// directly to FindIf, without requiring an lvalue as we do here for C++11.
+#if __cplusplus < 201402L
+#define HWY_GENERIC_LAMBDA 0
+#else
+#define HWY_GENERIC_LAMBDA 1
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Returns random number in [-8, 8) - we use knowledge of the range to Find()
+// values we know are not present.
+template <typename T>
+T Random(RandomState& rng) {
+ const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
+ const double val = (bits - 512) / 64.0;
+ // Clamp negative to zero for unsigned types.
+ return static_cast<T>(HWY_MAX(hwy::LowestValue<T>(), val));
+}
+
+// In C++14, we can instead define these as generic lambdas next to where they
+// are invoked.
+#if !HWY_GENERIC_LAMBDA
+
+class GreaterThan {
+ public:
+ GreaterThan(int val) : val_(val) {}
+ template <class D, class V>
+ Mask<D> operator()(D d, V v) const {
+ return Gt(v, Set(d, static_cast<TFromD<D>>(val_)));
+ }
+
+ private:
+ int val_;
+};
+
+#endif // !HWY_GENERIC_LAMBDA
+
+// Invokes Test (e.g. TestFind) with all arg combinations.
+template <class Test>
+struct ForeachCountAndMisalign {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) const {
+ RandomState rng;
+ const size_t N = Lanes(d);
+ const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
+
+ // Find() checks 8 vectors at a time, so we want to cover a fairly large
+ // range without oversampling (checking every possible count).
+ std::vector<size_t> counts(AdjustedReps(512));
+ for (size_t& count : counts) {
+ count = static_cast<size_t>(rng()) % (16 * N + 1);
+ }
+ counts[0] = 0; // ensure we test count=0.
+
+ for (size_t count : counts) {
+ for (size_t m : misalignments) {
+ Test()(d, count, m, rng);
+ }
+ }
+ }
+};
+
+struct TestFind {
+ template <class D>
+ void operator()(D d, size_t count, size_t misalign, RandomState& rng) {
+ using T = TFromD<D>;
+ // Must allocate at least one even if count is zero.
+ AlignedFreeUniquePtr<T[]> storage =
+ AllocateAligned<T>(HWY_MAX(1, misalign + count));
+ HWY_ASSERT(storage);
+ T* in = storage.get() + misalign;
+ for (size_t i = 0; i < count; ++i) {
+ in[i] = Random<T>(rng);
+ }
+
+ // For each position, search for that element (which we know is there)
+ for (size_t pos = 0; pos < count; ++pos) {
+ const size_t actual = Find(d, in[pos], in, count);
+
+ // We may have found an earlier occurrence of the same value; ensure the
+ // value is the same, and that it is the first.
+ if (!IsEqual(in[pos], in[actual])) {
+ fprintf(stderr, "%s count %d, found %.15f at %d but wanted %.15f\n",
+ hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
+ static_cast<double>(in[actual]), static_cast<int>(actual),
+ static_cast<double>(in[pos]));
+ HWY_ASSERT(false);
+ }
+ for (size_t i = 0; i < actual; ++i) {
+ if (IsEqual(in[i], in[pos])) {
+ fprintf(stderr, "%s count %d, found %f at %d but Find returned %d\n",
+ hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
+ static_cast<double>(in[i]), static_cast<int>(i),
+ static_cast<int>(actual));
+ HWY_ASSERT(false);
+ }
+ }
+ }
+
+ // Also search for values we know not to be present (out of range)
+ HWY_ASSERT_EQ(count, Find(d, T{9}, in, count));
+ HWY_ASSERT_EQ(count, Find(d, static_cast<T>(-9), in, count));
+ }
+};
+
+void TestAllFind() {
+ ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFind>>());
+}
+
+struct TestFindIf {
+ template <class D>
+ void operator()(D d, size_t count, size_t misalign, RandomState& rng) {
+ using T = TFromD<D>;
+ using TI = MakeSigned<T>;
+ // Must allocate at least one even if count is zero.
+ AlignedFreeUniquePtr<T[]> storage =
+ AllocateAligned<T>(HWY_MAX(1, misalign + count));
+ HWY_ASSERT(storage);
+ T* in = storage.get() + misalign;
+ for (size_t i = 0; i < count; ++i) {
+ in[i] = Random<T>(rng);
+ HWY_ASSERT(in[i] < 8);
+ HWY_ASSERT(!hwy::IsSigned<T>() || static_cast<TI>(in[i]) >= -8);
+ }
+
+ bool found_any = false;
+ bool not_found_any = false;
+
+ // unsigned T would be promoted to signed and compare greater than any
+ // negative val, whereas Set() would just cast to an unsigned value and the
+ // comparison remains unsigned, so avoid negative numbers there.
+ const int min_val = IsSigned<T>() ? -9 : 0;
+ // Includes out-of-range value 9 to test the not-found path.
+ for (int val = min_val; val <= 9; ++val) {
+#if HWY_GENERIC_LAMBDA
+ const auto greater = [val](const auto d, const auto v) HWY_ATTR {
+ return Gt(v, Set(d, static_cast<T>(val)));
+ };
+#else
+ const GreaterThan greater(val);
+#endif
+ const size_t actual = FindIf(d, in, count, greater);
+ found_any |= actual < count;
+ not_found_any |= actual == count;
+
+ const auto pos = std::find_if(
+ in, in + count, [val](T x) { return x > static_cast<T>(val); });
+ // Convert returned iterator to index.
+ const size_t expected = static_cast<size_t>(pos - in);
+ if (expected != actual) {
+ fprintf(stderr, "%s count %d val %d, expected %d actual %d\n",
+ hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
+ val, static_cast<int>(expected), static_cast<int>(actual));
+ hwy::detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "in", in, count,
+ 0, count);
+ HWY_ASSERT(false);
+ }
+ }
+
+ // We will always not-find something due to val=9.
+ HWY_ASSERT(not_found_any);
+ // We'll find something unless the input is empty or {0} - because 0 > i
+ // is false for all i=[0,9].
+ if (count != 0 && in[0] != 0) {
+ HWY_ASSERT(found_any);
+ }
+ }
+};
+
+void TestAllFindIf() {
+ ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFindIf>>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(FindTest);
+HWY_EXPORT_AND_TEST_P(FindTest, TestAllFind);
+HWY_EXPORT_AND_TEST_P(FindTest, TestAllFindIf);
+} // namespace hwy
+
+#endif
diff --git a/third_party/highway/hwy/contrib/algo/transform-inl.h b/third_party/highway/hwy/contrib/algo/transform-inl.h
new file mode 100644
index 0000000000..3e830acb47
--- /dev/null
+++ b/third_party/highway/hwy/contrib/algo/transform-inl.h
@@ -0,0 +1,262 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Per-target include guard
+#if defined(HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_) == \
+ defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
+#undef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
+#else
+#define HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
+#endif
+
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// These functions avoid having to write a loop plus remainder handling in the
+// (unfortunately still common) case where arrays are not aligned/padded. If the
+// inputs are known to be aligned/padded, it is more efficient to write a single
+// loop using Load(). We do not provide a TransformAlignedPadded because it
+// would be more verbose than such a loop.
+//
+// Func is either a functor with a templated operator()(d, v[, v1[, v2]]), or a
+// generic lambda if using C++14. Due to apparent limitations of Clang on
+// Windows, it is currently necessary to add HWY_ATTR before the opening { of
+// the lambda to avoid errors about "always_inline function .. requires target".
+//
+// If HWY_MEM_OPS_MIGHT_FAULT, we use scalar code instead of masking. Otherwise,
+// we used `MaskedLoad` and `BlendedStore` to read/write the final partial
+// vector.
+
+// Fills `out[0, count)` with the vectors returned by `func(d, index_vec)`,
+// where `index_vec` is `Vec<RebindToUnsigned<D>>`. On the first call to `func`,
+// the value of its lane i is i, and increases by `Lanes(d)` after every call.
+// Note that some of these indices may be `>= count`, but the elements that
+// `func` returns in those lanes will not be written to `out`.
+template <class D, class Func, typename T = TFromD<D>>
+void Generate(D d, T* HWY_RESTRICT out, size_t count, const Func& func) {
+ const RebindToUnsigned<D> du;
+ using TU = TFromD<decltype(du)>;
+ const size_t N = Lanes(d);
+
+ size_t idx = 0;
+ Vec<decltype(du)> vidx = Iota(du, 0);
+ for (; idx + N <= count; idx += N) {
+ StoreU(func(d, vidx), d, out + idx);
+ vidx = Add(vidx, Set(du, static_cast<TU>(N)));
+ }
+
+ // `count` was a multiple of the vector length `N`: already done.
+ if (HWY_UNLIKELY(idx == count)) return;
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+ // Proceed one by one.
+ const CappedTag<T, 1> d1;
+ const RebindToUnsigned<decltype(d1)> du1;
+ for (; idx < count; ++idx) {
+ StoreU(func(d1, Set(du1, static_cast<TU>(idx))), d1, out + idx);
+ }
+#else
+ const size_t remaining = count - idx;
+ HWY_DASSERT(0 != remaining && remaining < N);
+ const Mask<D> mask = FirstN(d, remaining);
+ BlendedStore(func(d, vidx), mask, d, out + idx);
+#endif
+}
+
+// Replaces `inout[idx]` with `func(d, inout[idx])`. Example usage: multiplying
+// array elements by a constant.
+template <class D, class Func, typename T = TFromD<D>>
+void Transform(D d, T* HWY_RESTRICT inout, size_t count, const Func& func) {
+ const size_t N = Lanes(d);
+
+ size_t idx = 0;
+ for (; idx + N <= count; idx += N) {
+ const Vec<D> v = LoadU(d, inout + idx);
+ StoreU(func(d, v), d, inout + idx);
+ }
+
+ // `count` was a multiple of the vector length `N`: already done.
+ if (HWY_UNLIKELY(idx == count)) return;
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+ // Proceed one by one.
+ const CappedTag<T, 1> d1;
+ for (; idx < count; ++idx) {
+ using V1 = Vec<decltype(d1)>;
+ const V1 v = LoadU(d1, inout + idx);
+ StoreU(func(d1, v), d1, inout + idx);
+ }
+#else
+ const size_t remaining = count - idx;
+ HWY_DASSERT(0 != remaining && remaining < N);
+ const Mask<D> mask = FirstN(d, remaining);
+ const Vec<D> v = MaskedLoad(mask, d, inout + idx);
+ BlendedStore(func(d, v), mask, d, inout + idx);
+#endif
+}
+
+// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx])`. Example usage:
+// multiplying array elements by those of another array.
+template <class D, class Func, typename T = TFromD<D>>
+void Transform1(D d, T* HWY_RESTRICT inout, size_t count,
+ const T* HWY_RESTRICT in1, const Func& func) {
+ const size_t N = Lanes(d);
+
+ size_t idx = 0;
+ for (; idx + N <= count; idx += N) {
+ const Vec<D> v = LoadU(d, inout + idx);
+ const Vec<D> v1 = LoadU(d, in1 + idx);
+ StoreU(func(d, v, v1), d, inout + idx);
+ }
+
+ // `count` was a multiple of the vector length `N`: already done.
+ if (HWY_UNLIKELY(idx == count)) return;
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+ // Proceed one by one.
+ const CappedTag<T, 1> d1;
+ for (; idx < count; ++idx) {
+ using V1 = Vec<decltype(d1)>;
+ const V1 v = LoadU(d1, inout + idx);
+ const V1 v1 = LoadU(d1, in1 + idx);
+ StoreU(func(d1, v, v1), d1, inout + idx);
+ }
+#else
+ const size_t remaining = count - idx;
+ HWY_DASSERT(0 != remaining && remaining < N);
+ const Mask<D> mask = FirstN(d, remaining);
+ const Vec<D> v = MaskedLoad(mask, d, inout + idx);
+ const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
+ BlendedStore(func(d, v, v1), mask, d, inout + idx);
+#endif
+}
+
+// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx], in2[idx])`. Example
+// usage: FMA of elements from three arrays, stored into the first array.
+template <class D, class Func, typename T = TFromD<D>>
+void Transform2(D d, T* HWY_RESTRICT inout, size_t count,
+ const T* HWY_RESTRICT in1, const T* HWY_RESTRICT in2,
+ const Func& func) {
+ const size_t N = Lanes(d);
+
+ size_t idx = 0;
+ for (; idx + N <= count; idx += N) {
+ const Vec<D> v = LoadU(d, inout + idx);
+ const Vec<D> v1 = LoadU(d, in1 + idx);
+ const Vec<D> v2 = LoadU(d, in2 + idx);
+ StoreU(func(d, v, v1, v2), d, inout + idx);
+ }
+
+ // `count` was a multiple of the vector length `N`: already done.
+ if (HWY_UNLIKELY(idx == count)) return;
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+ // Proceed one by one.
+ const CappedTag<T, 1> d1;
+ for (; idx < count; ++idx) {
+ using V1 = Vec<decltype(d1)>;
+ const V1 v = LoadU(d1, inout + idx);
+ const V1 v1 = LoadU(d1, in1 + idx);
+ const V1 v2 = LoadU(d1, in2 + idx);
+ StoreU(func(d1, v, v1, v2), d1, inout + idx);
+ }
+#else
+ const size_t remaining = count - idx;
+ HWY_DASSERT(0 != remaining && remaining < N);
+ const Mask<D> mask = FirstN(d, remaining);
+ const Vec<D> v = MaskedLoad(mask, d, inout + idx);
+ const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
+ const Vec<D> v2 = MaskedLoad(mask, d, in2 + idx);
+ BlendedStore(func(d, v, v1, v2), mask, d, inout + idx);
+#endif
+}
+
+template <class D, typename T = TFromD<D>>
+void Replace(D d, T* HWY_RESTRICT inout, size_t count, T new_t, T old_t) {
+ const size_t N = Lanes(d);
+ const Vec<D> old_v = Set(d, old_t);
+ const Vec<D> new_v = Set(d, new_t);
+
+ size_t idx = 0;
+ for (; idx + N <= count; idx += N) {
+ Vec<D> v = LoadU(d, inout + idx);
+ StoreU(IfThenElse(Eq(v, old_v), new_v, v), d, inout + idx);
+ }
+
+ // `count` was a multiple of the vector length `N`: already done.
+ if (HWY_UNLIKELY(idx == count)) return;
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+ // Proceed one by one.
+ const CappedTag<T, 1> d1;
+ const Vec<decltype(d1)> old_v1 = Set(d1, old_t);
+ const Vec<decltype(d1)> new_v1 = Set(d1, new_t);
+ for (; idx < count; ++idx) {
+ using V1 = Vec<decltype(d1)>;
+ const V1 v1 = LoadU(d1, inout + idx);
+ StoreU(IfThenElse(Eq(v1, old_v1), new_v1, v1), d1, inout + idx);
+ }
+#else
+ const size_t remaining = count - idx;
+ HWY_DASSERT(0 != remaining && remaining < N);
+ const Mask<D> mask = FirstN(d, remaining);
+ const Vec<D> v = MaskedLoad(mask, d, inout + idx);
+ BlendedStore(IfThenElse(Eq(v, old_v), new_v, v), mask, d, inout + idx);
+#endif
+}
+
+template <class D, class Func, typename T = TFromD<D>>
+void ReplaceIf(D d, T* HWY_RESTRICT inout, size_t count, T new_t,
+ const Func& func) {
+ const size_t N = Lanes(d);
+ const Vec<D> new_v = Set(d, new_t);
+
+ size_t idx = 0;
+ for (; idx + N <= count; idx += N) {
+ Vec<D> v = LoadU(d, inout + idx);
+ StoreU(IfThenElse(func(d, v), new_v, v), d, inout + idx);
+ }
+
+ // `count` was a multiple of the vector length `N`: already done.
+ if (HWY_UNLIKELY(idx == count)) return;
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+ // Proceed one by one.
+ const CappedTag<T, 1> d1;
+ const Vec<decltype(d1)> new_v1 = Set(d1, new_t);
+ for (; idx < count; ++idx) {
+ using V1 = Vec<decltype(d1)>;
+ const V1 v = LoadU(d1, inout + idx);
+ StoreU(IfThenElse(func(d1, v), new_v1, v), d1, inout + idx);
+ }
+#else
+ const size_t remaining = count - idx;
+ HWY_DASSERT(0 != remaining && remaining < N);
+ const Mask<D> mask = FirstN(d, remaining);
+ const Vec<D> v = MaskedLoad(mask, d, inout + idx);
+ BlendedStore(IfThenElse(func(d, v), new_v, v), mask, d, inout + idx);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif // HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
diff --git a/third_party/highway/hwy/contrib/algo/transform_test.cc b/third_party/highway/hwy/contrib/algo/transform_test.cc
new file mode 100644
index 0000000000..4726e82419
--- /dev/null
+++ b/third_party/highway/hwy/contrib/algo/transform_test.cc
@@ -0,0 +1,381 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h> // memcpy
+
+#include <vector>
+
+#include "hwy/aligned_allocator.h"
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/algo/transform_test.cc" //NOLINT
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/contrib/algo/transform-inl.h"
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+// If your project requires C++14 or later, you can ignore this and pass lambdas
+// directly to Transform, without requiring an lvalue as we do here for C++11.
+#if __cplusplus < 201402L
+#define HWY_GENERIC_LAMBDA 0
+#else
+#define HWY_GENERIC_LAMBDA 1
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <typename T>
+T Alpha() {
+ return static_cast<T>(1.5); // arbitrary scalar
+}
+
+// Returns random floating-point number in [-8, 8) to ensure computations do
+// not exceed float32 precision.
+template <typename T>
+T Random(RandomState& rng) {
+ const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
+ const double val = (bits - 512) / 64.0;
+ // Clamp negative to zero for unsigned types.
+ return static_cast<T>(HWY_MAX(hwy::LowestValue<T>(), val));
+}
+
+// SCAL, AXPY names are from BLAS.
+template <typename T>
+HWY_NOINLINE void SimpleSCAL(const T* x, T* out, size_t count) {
+ for (size_t i = 0; i < count; ++i) {
+ out[i] = Alpha<T>() * x[i];
+ }
+}
+
+template <typename T>
+HWY_NOINLINE void SimpleAXPY(const T* x, const T* y, T* out, size_t count) {
+ for (size_t i = 0; i < count; ++i) {
+ out[i] = Alpha<T>() * x[i] + y[i];
+ }
+}
+
+template <typename T>
+HWY_NOINLINE void SimpleFMA4(const T* x, const T* y, const T* z, T* out,
+ size_t count) {
+ for (size_t i = 0; i < count; ++i) {
+ out[i] = x[i] * y[i] + z[i];
+ }
+}
+
+// In C++14, we can instead define these as generic lambdas next to where they
+// are invoked.
+#if !HWY_GENERIC_LAMBDA
+
+// Generator that returns even numbers by doubling the output indices.
+struct Gen2 {
+ template <class D, class VU>
+ Vec<D> operator()(D d, VU vidx) const {
+ return BitCast(d, Add(vidx, vidx));
+ }
+};
+
+struct SCAL {
+ template <class D, class V>
+ Vec<D> operator()(D d, V v) const {
+ using T = TFromD<D>;
+ return Mul(Set(d, Alpha<T>()), v);
+ }
+};
+
+struct AXPY {
+ template <class D, class V>
+ Vec<D> operator()(D d, V v, V v1) const {
+ using T = TFromD<D>;
+ return MulAdd(Set(d, Alpha<T>()), v, v1);
+ }
+};
+
+struct FMA4 {
+ template <class D, class V>
+ Vec<D> operator()(D /*d*/, V v, V v1, V v2) const {
+ return MulAdd(v, v1, v2);
+ }
+};
+
+#endif // !HWY_GENERIC_LAMBDA
+
+// Invokes Test (e.g. TestTransform1) with all arg combinations. T comes from
+// ForFloatTypes.
+template <class Test>
+struct ForeachCountAndMisalign {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) const {
+ RandomState rng;
+ const size_t N = Lanes(d);
+ const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
+
+ for (size_t count = 0; count < 2 * N; ++count) {
+ for (size_t ma : misalignments) {
+ for (size_t mb : misalignments) {
+ Test()(d, count, ma, mb, rng);
+ }
+ }
+ }
+ }
+};
+
+// Output-only, no loads
+struct TestGenerate {
+ template <class D>
+ void operator()(D d, size_t count, size_t misalign_a, size_t /*misalign_b*/,
+ RandomState& /*rng*/) {
+ using T = TFromD<D>;
+ AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + count + 1);
+ AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
+ HWY_ASSERT(pa && expected);
+
+ T* actual = pa.get() + misalign_a;
+
+ for (size_t i = 0; i < count; ++i) {
+ expected[i] = static_cast<T>(2 * i);
+ }
+
+ // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that
+ // the attribute also applies to lambdas? If so, remove HWY_ATTR.
+#if HWY_GENERIC_LAMBDA
+ const auto gen2 = [](const auto d, const auto vidx)
+ HWY_ATTR { return BitCast(d, Add(vidx, vidx)); };
+#else
+ const Gen2 gen2;
+#endif
+ actual[count] = T{0}; // sentinel
+ Generate(d, actual, count, gen2);
+ HWY_ASSERT_EQ(T{0}, actual[count]); // did not write past end
+
+ const auto info = hwy::detail::MakeTypeInfo<T>();
+ const char* target_name = hwy::TargetName(HWY_TARGET);
+ hwy::detail::AssertArrayEqual(info, expected.get(), actual, count,
+ target_name, __FILE__, __LINE__);
+ }
+};
+
+// Zero extra input arrays
+struct TestTransform {
+ template <class D>
+ void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
+ RandomState& rng) {
+ if (misalign_b != 0) return;
+ using T = TFromD<D>;
+ // Prevents error if size to allocate is zero.
+ AlignedFreeUniquePtr<T[]> pa =
+ AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+ AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
+ HWY_ASSERT(pa && expected);
+
+ T* a = pa.get() + misalign_a;
+ for (size_t i = 0; i < count; ++i) {
+ a[i] = Random<T>(rng);
+ }
+
+ SimpleSCAL(a, expected.get(), count);
+
+ // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that
+ // the attribute also applies to lambdas? If so, remove HWY_ATTR.
+#if HWY_GENERIC_LAMBDA
+ const auto scal = [](const auto d, const auto v)
+ HWY_ATTR { return Mul(Set(d, Alpha<T>()), v); };
+#else
+ const SCAL scal;
+#endif
+ Transform(d, a, count, scal);
+
+ const auto info = hwy::detail::MakeTypeInfo<T>();
+ const char* target_name = hwy::TargetName(HWY_TARGET);
+ hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
+ __FILE__, __LINE__);
+ }
+};
+
+// One extra input array
+struct TestTransform1 {
+ template <class D>
+ void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
+ RandomState& rng) {
+ using T = TFromD<D>;
+ // Prevents error if size to allocate is zero.
+ AlignedFreeUniquePtr<T[]> pa =
+ AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+ AlignedFreeUniquePtr<T[]> pb =
+ AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
+ AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
+ HWY_ASSERT(pa && pb && expected);
+ T* a = pa.get() + misalign_a;
+ T* b = pb.get() + misalign_b;
+ for (size_t i = 0; i < count; ++i) {
+ a[i] = Random<T>(rng);
+ b[i] = Random<T>(rng);
+ }
+
+ SimpleAXPY(a, b, expected.get(), count);
+
+#if HWY_GENERIC_LAMBDA
+ const auto axpy = [](const auto d, const auto v, const auto v1) HWY_ATTR {
+ return MulAdd(Set(d, Alpha<T>()), v, v1);
+ };
+#else
+ const AXPY axpy;
+#endif
+ Transform1(d, a, count, b, axpy);
+
+ const auto info = hwy::detail::MakeTypeInfo<T>();
+ const char* target_name = hwy::TargetName(HWY_TARGET);
+ hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
+ __FILE__, __LINE__);
+ }
+};
+
+// Two extra input arrays
+struct TestTransform2 {
+ template <class D>
+ void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
+ RandomState& rng) {
+ using T = TFromD<D>;
+ // Prevents error if size to allocate is zero.
+ AlignedFreeUniquePtr<T[]> pa =
+ AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+ AlignedFreeUniquePtr<T[]> pb =
+ AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
+ AlignedFreeUniquePtr<T[]> pc =
+ AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+ AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
+ HWY_ASSERT(pa && pb && pc && expected);
+ T* a = pa.get() + misalign_a;
+ T* b = pb.get() + misalign_b;
+ T* c = pc.get() + misalign_a;
+ for (size_t i = 0; i < count; ++i) {
+ a[i] = Random<T>(rng);
+ b[i] = Random<T>(rng);
+ c[i] = Random<T>(rng);
+ }
+
+ SimpleFMA4(a, b, c, expected.get(), count);
+
+#if HWY_GENERIC_LAMBDA
+ const auto fma4 = [](auto /*d*/, auto v, auto v1, auto v2)
+ HWY_ATTR { return MulAdd(v, v1, v2); };
+#else
+ const FMA4 fma4;
+#endif
+ Transform2(d, a, count, b, c, fma4);
+
+ const auto info = hwy::detail::MakeTypeInfo<T>();
+ const char* target_name = hwy::TargetName(HWY_TARGET);
+ hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
+ __FILE__, __LINE__);
+ }
+};
+
+template <typename T>
+class IfEq {
+ public:
+ IfEq(T val) : val_(val) {}
+
+ template <class D, class V>
+ Mask<D> operator()(D d, V v) const {
+ return Eq(v, Set(d, val_));
+ }
+
+ private:
+ T val_;
+};
+
+struct TestReplace {
+ template <class D>
+ void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
+ RandomState& rng) {
+ if (misalign_b != 0) return;
+ if (count == 0) return;
+ using T = TFromD<D>;
+ AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + count);
+ AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(count);
+ AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(count);
+ HWY_ASSERT(pa && pb && expected);
+
+ T* a = pa.get() + misalign_a;
+ for (size_t i = 0; i < count; ++i) {
+ a[i] = Random<T>(rng);
+ }
+
+ std::vector<size_t> positions(AdjustedReps(count));
+ for (size_t& pos : positions) {
+ pos = static_cast<size_t>(rng()) % count;
+ }
+
+ for (size_t pos = 0; pos < count; ++pos) {
+ const T old_t = a[pos];
+ const T new_t = Random<T>(rng);
+ for (size_t i = 0; i < count; ++i) {
+ expected[i] = IsEqual(a[i], old_t) ? new_t : a[i];
+ }
+
+ // Copy so ReplaceIf gets the same input (and thus also outputs expected)
+ memcpy(pb.get(), a, count * sizeof(T));
+
+ Replace(d, a, count, new_t, old_t);
+ HWY_ASSERT_ARRAY_EQ(expected.get(), a, count);
+
+ ReplaceIf(d, pb.get(), count, new_t, IfEq<T>(old_t));
+ HWY_ASSERT_ARRAY_EQ(expected.get(), pb.get(), count);
+ }
+ }
+};
+
+void TestAllGenerate() {
+ // The test BitCast-s the indices, which does not work for floats.
+ ForIntegerTypes(ForPartialVectors<ForeachCountAndMisalign<TestGenerate>>());
+}
+
+void TestAllTransform() {
+ ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform>>());
+}
+
+void TestAllTransform1() {
+ ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform1>>());
+}
+
+void TestAllTransform2() {
+ ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform2>>());
+}
+
+void TestAllReplace() {
+ ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestReplace>>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(TransformTest);
+HWY_EXPORT_AND_TEST_P(TransformTest, TestAllGenerate);
+HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform);
+HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform1);
+HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform2);
+HWY_EXPORT_AND_TEST_P(TransformTest, TestAllReplace);
+} // namespace hwy
+
+#endif