summaryrefslogtreecommitdiffstats
path: root/third_party/highway/hwy/examples
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 19:33:14 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 19:33:14 +0000
commit36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/highway/hwy/examples
parentInitial commit. (diff)
downloadfirefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz
firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip
Adding upstream version 115.7.0esr.upstream/115.7.0esr
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/highway/hwy/examples')
-rw-r--r--third_party/highway/hwy/examples/benchmark.cc255
-rw-r--r--third_party/highway/hwy/examples/skeleton-inl.h66
-rw-r--r--third_party/highway/hwy/examples/skeleton.cc122
-rw-r--r--third_party/highway/hwy/examples/skeleton.h36
-rw-r--r--third_party/highway/hwy/examples/skeleton_test.cc110
5 files changed, 589 insertions, 0 deletions
diff --git a/third_party/highway/hwy/examples/benchmark.cc b/third_party/highway/hwy/examples/benchmark.cc
new file mode 100644
index 0000000000..55afd3bcca
--- /dev/null
+++ b/third_party/highway/hwy/examples/benchmark.cc
@@ -0,0 +1,255 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS // before inttypes.h
+#endif
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <cmath> // std::abs
+#include <memory>
+#include <numeric> // std::iota, std::inner_product
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// Must come after foreach_target.h to avoid redefinition errors.
+#include "hwy/aligned_allocator.h"
+#include "hwy/highway.h"
+#include "hwy/nanobenchmark.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+#if HWY_TARGET != HWY_SCALAR
+using hwy::HWY_NAMESPACE::CombineShiftRightLanes;
+#endif
+
+// Base class for the benchmarks: owns a single aligned allocation that holds
+// two float arrays of NumItems() elements each (`a_` followed by `b_`). Both
+// arrays are filled with the ascending sequence init, init + 1, ...
+class TwoArray {
+ public:
+  // Elements per array. Must be a multiple of the vector lane count * 8.
+  static size_t NumItems() { return 3456; }
+
+  TwoArray()
+      : a_(AllocateAligned<float>(2 * NumItems())), b_(a_.get() + NumItems()) {
+    // Equals 1, but the compiler does not know that.
+    const float first = static_cast<float>(Unpredictable1());
+    float* const a_begin = a_.get();
+    float* const b_begin = b_;
+    std::iota(a_begin, a_begin + NumItems(), first);
+    std::iota(b_begin, b_begin + NumItems(), first);
+  }
+
+ protected:
+  AlignedFreeUniquePtr<float[]> a_;  // owns the storage of both arrays
+  float* b_;                         // points at the second half of a_
+};
+
+// Times Benchmark::operator() via MeasureClosure, lets the benchmark verify
+// its own output, then prints one line of timings per measured input.
+// `caption` is printed as a right-aligned label in front of the results.
+template <class Benchmark>
+void RunBenchmark(const char* caption) {
+  printf("%10s: ", caption);
+
+  const size_t kNumInputs = 1;
+  // Unpredictable1() is 1 at runtime, but the compiler cannot assume so.
+  const size_t num_items = Benchmark::NumItems() * size_t(Unpredictable1());
+  const FuncInput inputs[kNumInputs] = {num_items};
+  Result results[kNumInputs];
+
+  Benchmark benchmark;
+
+  Params params;
+  params.verbose = false;
+  params.max_evals = 7;
+  params.target_rel_mad = 0.002;
+
+  const auto closure = [&benchmark](const FuncInput input) {
+    return benchmark(input);
+  };
+  const size_t num_results =
+      MeasureClosure(closure, inputs, kNumInputs, results, params);
+  if (num_results != kNumInputs) {
+    fprintf(stderr, "MeasureClosure failed.\n");
+  }
+
+  benchmark.Verify(num_items);
+
+  for (size_t i = 0; i < num_results; ++i) {
+    const Result& r = results[i];
+    const double cycles_per_item = r.ticks / static_cast<double>(r.input);
+    const double mad = r.variability * cycles_per_item;
+    printf("%6" PRIu64 ": %6.3f (+/- %5.3f)\n", static_cast<uint64_t>(r.input),
+           cycles_per_item, mad);
+  }
+}
+
+// Minimal API demonstration: computes out[i] = 2 * in[i]^2 for a 16-element
+// array, one vector at a time, and prints the result for in[2].
+void Intro() {
+  constexpr size_t kCount = 16;
+  const float in[kCount] = {1, 2, 3, 4, 5, 6};
+  float out[kCount];
+  // Largest available vector, but capped at kCount lanes: with an uncapped
+  // tag, a target whose vectors hold more than 16 floats would load and store
+  // past the end of `in` and `out`.
+  // NOTE(review): relies on Lanes(d) dividing kCount, i.e. on lane counts
+  // being powers of two - confirm for any new target.
+  const CappedTag<float, kCount> d;
+  for (size_t i = 0; i < kCount; i += Lanes(d)) {
+    const auto vec = LoadU(d, in + i);  // no alignment requirement
+    auto result = Mul(vec, vec);
+    result = Add(result, result);  // can update if not const
+    StoreU(result, d, out + i);
+  }
+  printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]);
+}
+
+// BEGINNER: dot product
+// 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold!
+//
+// Benchmark functor: operator() computes the dot product of the first
+// `num_items` elements of a_ and b_ and stores it in dot_, which Verify() then
+// compares against std::inner_product.
+class BenchmarkDot : public TwoArray {
+ public:
+  // -1 is the "operator() has not run yet" marker checked by Verify().
+  BenchmarkDot() : dot_{-1.0f} {}
+
+  // Returns the dot product converted to FuncOutput. Each loop iteration
+  // consumes four whole vectors from each array, so `num_items` must be a
+  // multiple of 4 * Lanes(d).
+  FuncOutput operator()(const size_t num_items) {
+    const ScalableTag<float> d;
+    const size_t N = Lanes(d);
+    using V = decltype(Zero(d));
+    // Compiler doesn't make independent sum* accumulators, so unroll manually.
+    // We cannot use an array because V might be a sizeless type. For reasonable
+    // code, we unroll 4x, but 8x might help (2 FMA ports * 4 cycle latency).
+    V sum0 = Zero(d);
+    V sum1 = Zero(d);
+    V sum2 = Zero(d);
+    V sum3 = Zero(d);
+    const float* const HWY_RESTRICT pa = &a_[0];
+    const float* const HWY_RESTRICT pb = b_;
+    for (size_t i = 0; i < num_items; i += 4 * N) {
+      const auto a0 = Load(d, pa + i + 0 * N);
+      const auto b0 = Load(d, pb + i + 0 * N);
+      sum0 = MulAdd(a0, b0, sum0);
+      const auto a1 = Load(d, pa + i + 1 * N);
+      const auto b1 = Load(d, pb + i + 1 * N);
+      sum1 = MulAdd(a1, b1, sum1);
+      const auto a2 = Load(d, pa + i + 2 * N);
+      const auto b2 = Load(d, pb + i + 2 * N);
+      sum2 = MulAdd(a2, b2, sum2);
+      const auto a3 = Load(d, pa + i + 3 * N);
+      const auto b3 = Load(d, pb + i + 3 * N);
+      sum3 = MulAdd(a3, b3, sum3);
+    }
+    // Reduction tree: sum of all accumulators by pairs into sum0.
+    sum0 = Add(sum0, sum1);
+    sum2 = Add(sum2, sum3);
+    sum0 = Add(sum0, sum2);
+    dot_ = GetLane(SumOfLanes(d, sum0));
+    return static_cast<FuncOutput>(dot_);
+  }
+
+  // Aborts if operator() has not run yet, or if its result differs from the
+  // std::inner_product of the same `num_items` elements by a relative error
+  // above 1.1E-6.
+  void Verify(size_t num_items) {
+    if (dot_ == -1.0f) {
+      // Newline-terminated like the other diagnostics in this file.
+      fprintf(stderr, "Dot: must call Verify after benchmark\n");
+      abort();
+    }
+
+    const float expected =
+        std::inner_product(a_.get(), a_.get() + num_items, b_, 0.0f);
+    const float rel_err = std::abs(expected - dot_) / expected;
+    if (rel_err > 1.1E-6f) {
+      fprintf(stderr, "Dot: expected %e actual %e (%e)\n", expected, dot_,
+              rel_err);
+      abort();
+    }
+  }
+
+ private:
+  float dot_;  // for Verify
+};
+
+// INTERMEDIATE: delta coding
+// 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold!
+//
+// Benchmark functor: operator() writes b_[0] = a_[0] and
+// b_[i] = a_[i] - a_[i - 1] for 0 < i < num_items; Verify() recomputes the
+// same values in scalar code and reports mismatches.
+struct BenchmarkDelta : public TwoArray {
+  // Returns the last delta converted to FuncOutput. The first N elements are
+  // handled by scalar code; the vector loops then process whole vectors, so
+  // `num_items` must be a non-zero multiple of the lane count N.
+  FuncOutput operator()(const size_t num_items) const {
+#if HWY_TARGET == HWY_SCALAR
+    b_[0] = a_[0];
+    for (size_t i = 1; i < num_items; ++i) {
+      b_[i] = a_[i] - a_[i - 1];
+    }
+#elif HWY_CAP_GE256
+    // Larger vectors are split into 128-bit blocks, easiest to use the
+    // unaligned load support to shift between them.
+    const ScalableTag<float> df;
+    const size_t N = Lanes(df);
+    size_t i;
+    b_[0] = a_[0];
+    for (i = 1; i < N; ++i) {
+      b_[i] = a_[i] - a_[i - 1];
+    }
+    for (; i < num_items; i += N) {
+      const auto a = Load(df, &a_[i]);
+      // Same data, but starting one element earlier.
+      const auto shifted = LoadU(df, &a_[i - 1]);
+      // Named op rather than operator-, matching the 128-bit branch below.
+      Store(Sub(a, shifted), df, &b_[i]);
+    }
+#else  // 128-bit
+    // Slightly better than unaligned loads
+    const HWY_CAPPED(float, 4) df;
+    const size_t N = Lanes(df);
+    size_t i;
+    b_[0] = a_[0];
+    for (i = 1; i < N; ++i) {
+      b_[i] = a_[i] - a_[i - 1];
+    }
+    auto prev = Load(df, &a_[0]);
+    for (; i < num_items; i += N) {
+      const auto a = Load(df, &a_[i]);
+      // Combines the previous and current vectors so that each lane holds the
+      // element preceding the corresponding lane of `a`.
+      const auto shifted = CombineShiftRightLanes<3>(df, a, prev);
+      prev = a;
+      Store(Sub(a, shifted), df, &b_[i]);
+    }
+#endif
+    return static_cast<FuncOutput>(b_[num_items - 1]);
+  }
+
+  // Prints a message to stderr for every element of b_ that differs from the
+  // scalar reference by more than 1E-6. Does not abort.
+  void Verify(size_t num_items) {
+    for (size_t i = 0; i < num_items; ++i) {
+      const float expected = (i == 0) ? a_[0] : a_[i] - a_[i - 1];
+      const float err = std::abs(expected - b_[i]);
+      if (err > 1E-6f) {
+        fprintf(stderr, "Delta: expected %e, actual %e\n", expected, b_[i]);
+      }
+    }
+  }
+};
+
+// Per-target entry point: runs the intro demo, prints a separator with the
+// name of the current target, then runs both benchmarks.
+void RunBenchmarks() {
+  Intro();
+  const char* const target_name = TargetName(HWY_TARGET);
+  printf("------------------------ %s\n", target_name);
+  RunBenchmark<BenchmarkDot>("dot");
+  RunBenchmark<BenchmarkDelta>("delta");
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+HWY_EXPORT(RunBenchmarks);
+
+// Invokes RunBenchmarks through dynamic dispatch once for every target
+// returned by SupportedAndGeneratedTargets().
+void Run() {
+  const auto targets = SupportedAndGeneratedTargets();
+  for (const int64_t target : targets) {
+    // Presumably limits dispatch to `target` alone - confirm in hwy/targets.h.
+    SetSupportedTargetsForTest(target);
+    HWY_DYNAMIC_DISPATCH(RunBenchmarks)();
+  }
+  // Reset the mask afterwards.
+  SetSupportedTargetsForTest(0);
+}
+
+} // namespace hwy
+
+// Program entry point; command-line arguments are ignored.
+int main(int, char**) {
+  hwy::Run();
+  return 0;
+}
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/examples/skeleton-inl.h b/third_party/highway/hwy/examples/skeleton-inl.h
new file mode 100644
index 0000000000..8aec33e666
--- /dev/null
+++ b/third_party/highway/hwy/examples/skeleton-inl.h
@@ -0,0 +1,66 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Demo of functions that might be called from multiple SIMD modules (either
+// other -inl.h files, or a .cc file between begin/end_target-inl). This is
+// optional - all SIMD code can reside in .cc files. However, this allows
+// splitting code into different files while still inlining instead of requiring
+// calling through function pointers.
+
+// Per-target include guard. This is only required when using dynamic dispatch,
+// i.e. including foreach_target.h. For static dispatch, a normal include
+// guard would be fine because the header is only compiled once.
+#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#else
+#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#endif
+
+// It is fine to #include normal or *-inl headers.
+#include <stddef.h>
+
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace skeleton {
+namespace HWY_NAMESPACE {
+
+// Highway ops reside here; ADL does not find templates nor builtins.
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Example of a header-defined function that is type-agnostic (the caller
+// chooses the lane type T) and width-agnostic (uses the best available
+// instruction set).
+//
+// Computes x_array[i] = mul_array[i] * x_array[i] + add_array[i] for i < size,
+// updating x_array in place. Only whole vectors are loaded and stored, so
+// `size` must be a multiple of Lanes(d); otherwise the final iteration
+// accesses elements at or beyond index `size`.
+template <class D, typename T>
+HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array,
+                                 const T* HWY_RESTRICT add_array,
+                                 const size_t size, T* HWY_RESTRICT x_array) {
+  const size_t N = hn::Lanes(d);
+  size_t i = 0;
+  while (i < size) {
+    const auto mul = hn::Load(d, mul_array + i);
+    const auto add = hn::Load(d, add_array + i);
+    const auto x = hn::Load(d, x_array + i);
+    hn::Store(hn::MulAdd(mul, x, add), d, x_array + i);
+    i += N;
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace skeleton
+HWY_AFTER_NAMESPACE();
+
+#endif // include guard
diff --git a/third_party/highway/hwy/examples/skeleton.cc b/third_party/highway/hwy/examples/skeleton.cc
new file mode 100644
index 0000000000..778ba4ac0a
--- /dev/null
+++ b/third_party/highway/hwy/examples/skeleton.cc
@@ -0,0 +1,122 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/examples/skeleton.h"
+
+#include <stdio.h>
+
+// >>>> for dynamic dispatch only, skip if you want static dispatch
+
+// First undef to prevent error when re-included.
+#undef HWY_TARGET_INCLUDE
+// For dynamic dispatch, specify the name of the current file (unfortunately
+// __FILE__ is not reliable) so that foreach_target.h can re-include it.
+#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
+// Generates code for each enabled target by re-including this source file.
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// <<<< end of dynamic dispatch
+
+// Must come after foreach_target.h to avoid redefinition errors.
+#include "hwy/highway.h"
+
+// Optional, can instead add HWY_ATTR to all functions.
+HWY_BEFORE_NAMESPACE();
+
+namespace skeleton {
+// This namespace name is unique per target, which allows code for multiple
+// targets to co-exist in the same translation unit. Required when using dynamic
+// dispatch, otherwise optional.
+namespace HWY_NAMESPACE {
+
+// Highway ops reside here; ADL does not find templates nor builtins.
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Computes log2 by converting to a vector of floats. Compiled once per target.
+//
+// Processes Lanes(df) bytes: each byte of `values` is widened to int32 and
+// converted to float; the float's bit pattern is shifted right by 23 and 127
+// is subtracted to obtain the exponent, which is narrowed back to a byte and
+// written to `log2`.
+//
+// Unaligned loads and stores are used because the pointers originate from the
+// public CallFloorLog2 interface, which does not promise any alignment.
+template <class DF>
+HWY_ATTR_NO_MSAN void OneFloorLog2(const DF df,
+                                   const uint8_t* HWY_RESTRICT values,
+                                   uint8_t* HWY_RESTRICT log2) {
+  // Type tags for converting to other element types (Rebind = same count).
+  const hn::RebindToSigned<DF> d32;
+  const hn::Rebind<uint8_t, DF> d8;
+  using VI32 = hn::Vec<decltype(d32)>;
+
+  const VI32 vi32 = hn::PromoteTo(d32, hn::LoadU(d8, values));
+  const VI32 bits = hn::BitCast(d32, hn::ConvertTo(df, vi32));
+  const VI32 exponent = hn::Sub(hn::ShiftRight<23>(bits), hn::Set(d32, 127));
+  hn::StoreU(hn::DemoteTo(d8, exponent), d8, log2);
+}
+
+// Prints the current target's name together with a message that depends on
+// HWY_HAVE_INTEGER64, as an example of a compile-time per-target codepath.
+void CodepathDemo() {
+  // Highway defaults to portability, but per-target codepaths may be selected
+  // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros:
+#if HWY_HAVE_INTEGER64
+  const char* const int64_status = "Has int64";
+#else
+  const char* const int64_status = "No int64";
+#endif
+  printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), int64_status);
+}
+
+// Per-target implementation behind CallFloorLog2: applies OneFloorLog2 to
+// values[0, count) and writes the results to log2[0, count). Whole vectors are
+// processed first; any remaining elements are handled one lane at a time.
+void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
+               uint8_t* HWY_RESTRICT log2) {
+  CodepathDemo();
+
+  const hn::ScalableTag<float> df;
+  const hn::CappedTag<float, 1> d1;  // single-lane tag for the remainder
+  const size_t N = hn::Lanes(df);
+
+  size_t i = 0;
+  while (i + N <= count) {
+    OneFloorLog2(df, values + i, log2 + i);
+    i += N;
+  }
+  while (i < count) {
+    OneFloorLog2(d1, values + i, log2 + i);
+    ++i;
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace skeleton
+HWY_AFTER_NAMESPACE();
+
+// The table of pointers to the various implementations in HWY_NAMESPACE must
+// be compiled only once (foreach_target #includes this file multiple times).
+// HWY_ONCE is true for only one of these 'compilation passes'.
+#if HWY_ONCE
+
+namespace skeleton {
+
+// This macro declares a static array used for dynamic dispatch; it resides in
+// the same outer namespace that contains FloorLog2.
+HWY_EXPORT(FloorLog2);
+
+// Optional wrapper, needed only because the function is exposed in the header
+// file. Inside this module, writing HWY_DYNAMIC_DISPATCH(FloorLog2) directly
+// is equivalent to inlining this function.
+HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
+                                 const size_t count,
+                                 uint8_t* HWY_RESTRICT out) {
+  // Must reside outside of HWY_NAMESPACE because it references (calls the
+  // appropriate one from) the per-target implementations there.
+  // For static dispatch, use HWY_STATIC_DISPATCH.
+  HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
+}
+
+// Optional: anything to compile only once, e.g. non-SIMD implementations of
+// public functions provided by this module, can go inside #if HWY_ONCE.
+
+} // namespace skeleton
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/examples/skeleton.h b/third_party/highway/hwy/examples/skeleton.h
new file mode 100644
index 0000000000..381ac69af6
--- /dev/null
+++ b/third_party/highway/hwy/examples/skeleton.h
@@ -0,0 +1,36 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Demo interface to target-specific code in skeleton.cc
+
+// Normal header with include guard and namespace.
+#ifndef HIGHWAY_HWY_EXAMPLES_SKELETON_H_
+#define HIGHWAY_HWY_EXAMPLES_SKELETON_H_
+
+#include <stddef.h>
+
+// Platform-specific definitions used for declaring an interface, independent of
+// the SIMD instruction set.
+#include "hwy/base.h" // HWY_RESTRICT
+
+namespace skeleton {
+
+// Computes base-2 logarithm by converting to float. Supports dynamic dispatch.
+// Reads `count` bytes from `in` and writes `count` result bytes to `out`; the
+// definition is in skeleton.cc.
+HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
+ const size_t count, uint8_t* HWY_RESTRICT out);
+
+} // namespace skeleton
+
+#endif // HIGHWAY_HWY_EXAMPLES_SKELETON_H_
diff --git a/third_party/highway/hwy/examples/skeleton_test.cc b/third_party/highway/hwy/examples/skeleton_test.cc
new file mode 100644
index 0000000000..c7c26bf5b4
--- /dev/null
+++ b/third_party/highway/hwy/examples/skeleton_test.cc
@@ -0,0 +1,110 @@
+// Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Example of unit test for the "skeleton" library.
+
+#include "hwy/examples/skeleton.h"
+
+#include <stdio.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// Must come after foreach_target.h to avoid redefinition errors.
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+// Optional: factor out parts of the implementation into *-inl.h
+// (must also come after foreach_target.h to avoid redefinition errors)
+#include "hwy/examples/skeleton-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace skeleton {
+namespace HWY_NAMESPACE {
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Exercises CallFloorLog2, which is defined in skeleton.cc.
+struct TestFloorLog2 {
+  template <class T, class DF>
+  HWY_NOINLINE void operator()(T /*unused*/, DF df) {
+    const size_t count = 5 * hn::Lanes(df);
+    auto in = hwy::AllocateAligned<uint8_t>(count);
+    auto expected = hwy::AllocateAligned<uint8_t>(count);
+    auto out = hwy::AllocateAligned<uint8_t>(count);
+
+    // Every input is 1 << e with a random e in [0, 7], and e is the expected
+    // output.
+    hwy::RandomState rng;
+    for (size_t i = 0; i < count; ++i) {
+      const uint8_t exponent = static_cast<uint8_t>(Random32(&rng) & 7);
+      expected[i] = exponent;
+      in[i] = static_cast<uint8_t>(1u << exponent);
+    }
+
+    CallFloorLog2(in.get(), count, out.get());
+
+    int sum = 0;
+    for (size_t i = 0; i < count; ++i) {
+      HWY_ASSERT_EQ(expected[i], out[i]);
+      sum += out[i];
+    }
+    hwy::PreventElision(sum);
+  }
+};
+
+// Runs TestFloorLog2 with lane type float, via ForPartialVectors.
+HWY_NOINLINE void TestAllFloorLog2() {
+  hn::ForPartialVectors<TestFloorLog2> for_each_vector;
+  for_each_vector(float());
+}
+
+// Calls MulAddLoop (defined in skeleton-inl.h) and verifies its output.
+struct TestSumMulAdd {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    hwy::RandomState rng;
+    const size_t count = 4096;
+    // MulAddLoop only loads and stores whole vectors, so continuing after a
+    // failed check would access memory out of bounds; hence a fatal assert.
+    HWY_ASSERT(count % hn::Lanes(d) == 0);
+    auto mul = hwy::AllocateAligned<T>(count);
+    auto x = hwy::AllocateAligned<T>(count);
+    auto add = hwy::AllocateAligned<T>(count);
+    for (size_t i = 0; i < count; ++i) {
+      mul[i] = static_cast<T>(Random32(&rng) & 0xF);
+      x[i] = static_cast<T>(Random32(&rng) & 0xFF);
+      add[i] = static_cast<T>(Random32(&rng) & 0xFF);
+    }
+    // Scalar reference, computed before MulAddLoop overwrites x.
+    double expected_sum = 0.0;
+    for (size_t i = 0; i < count; ++i) {
+      expected_sum += mul[i] * x[i] + add[i];
+    }
+
+    MulAddLoop(d, mul.get(), add.get(), count, x.get());
+    HWY_ASSERT_EQ(4344240.0, expected_sum);
+
+    // x now holds the vector results. All inputs are small non-negative
+    // integers (at most 15 * 255 + 255 per element), so the scalar and vector
+    // results must agree exactly.
+    double actual_sum = 0.0;
+    for (size_t i = 0; i < count; ++i) {
+      actual_sum += x[i];
+    }
+    HWY_ASSERT_EQ(expected_sum, actual_sum);
+  }
+};
+
+// Runs TestSumMulAdd for every lane type that ForFloatTypes enumerates.
+HWY_NOINLINE void TestAllSumMulAdd() {
+  const auto per_vector_test = hn::ForPartialVectors<TestSumMulAdd>();
+  hn::ForFloatTypes(per_vector_test);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace skeleton
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace skeleton {
+HWY_BEFORE_TEST(SkeletonTest);
+HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2);
+HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd);
+} // namespace skeleton
+
+#endif