From 6bf0a5cb5034a7e684dcc3500e841785237ce2dd Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 19:32:43 +0200 Subject: Adding upstream version 1:115.7.0. Signed-off-by: Daniel Baumann --- third_party/highway/hwy/examples/benchmark.cc | 255 ++++++++++++++++++++++ third_party/highway/hwy/examples/skeleton-inl.h | 66 ++++++ third_party/highway/hwy/examples/skeleton.cc | 122 +++++++++++ third_party/highway/hwy/examples/skeleton.h | 36 +++ third_party/highway/hwy/examples/skeleton_test.cc | 110 ++++++++++ 5 files changed, 589 insertions(+) create mode 100644 third_party/highway/hwy/examples/benchmark.cc create mode 100644 third_party/highway/hwy/examples/skeleton-inl.h create mode 100644 third_party/highway/hwy/examples/skeleton.cc create mode 100644 third_party/highway/hwy/examples/skeleton.h create mode 100644 third_party/highway/hwy/examples/skeleton_test.cc (limited to 'third_party/highway/hwy/examples') diff --git a/third_party/highway/hwy/examples/benchmark.cc b/third_party/highway/hwy/examples/benchmark.cc new file mode 100644 index 0000000000..55afd3bcca --- /dev/null +++ b/third_party/highway/hwy/examples/benchmark.cc @@ -0,0 +1,255 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS // before inttypes.h +#endif +#include +#include +#include +#include + +#include // std::abs +#include +#include // std::iota, std::inner_product + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// Must come after foreach_target.h to avoid redefinition errors. +#include "hwy/aligned_allocator.h" +#include "hwy/highway.h" +#include "hwy/nanobenchmark.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +#if HWY_TARGET != HWY_SCALAR +using hwy::HWY_NAMESPACE::CombineShiftRightLanes; +#endif + +class TwoArray { + public: + // Must be a multiple of the vector lane count * 8. + static size_t NumItems() { return 3456; } + + TwoArray() + : a_(AllocateAligned(NumItems() * 2)), b_(a_.get() + NumItems()) { + // = 1, but compiler doesn't know + const float init = static_cast(Unpredictable1()); + std::iota(a_.get(), a_.get() + NumItems(), init); + std::iota(b_, b_ + NumItems(), init); + } + + protected: + AlignedFreeUniquePtr a_; + float* b_; +}; + +// Measures durations, verifies results, prints timings. +template +void RunBenchmark(const char* caption) { + printf("%10s: ", caption); + const size_t kNumInputs = 1; + const size_t num_items = Benchmark::NumItems() * size_t(Unpredictable1()); + const FuncInput inputs[kNumInputs] = {num_items}; + Result results[kNumInputs]; + + Benchmark benchmark; + + Params p; + p.verbose = false; + p.max_evals = 7; + p.target_rel_mad = 0.002; + const size_t num_results = MeasureClosure( + [&benchmark](const FuncInput input) { return benchmark(input); }, inputs, + kNumInputs, results, p); + if (num_results != kNumInputs) { + fprintf(stderr, "MeasureClosure failed.\n"); + } + + benchmark.Verify(num_items); + + for (size_t i = 0; i < num_results; ++i) { + const double cycles_per_item = + results[i].ticks / static_cast(results[i].input); + const double mad = results[i].variability * cycles_per_item; + printf("%6" PRIu64 ": %6.3f (+/- %5.3f)\n", + static_cast(results[i].input), cycles_per_item, mad); + } +} + +void Intro() { + const float in[16] = {1, 2, 3, 4, 5, 6}; + float out[16]; + const ScalableTag d; // largest possible vector + for (size_t i = 0; i < 16; i += Lanes(d)) { + const auto vec = LoadU(d, in + i); // no alignment requirement + auto result = Mul(vec, vec); + result = Add(result, result); // can update if not const + StoreU(result, d, out + i); + } + printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]); +} + +// BEGINNER: dot product +// 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold! +class BenchmarkDot : public TwoArray { + public: + BenchmarkDot() : dot_{-1.0f} {} + + FuncOutput operator()(const size_t num_items) { + const ScalableTag d; + const size_t N = Lanes(d); + using V = decltype(Zero(d)); + // Compiler doesn't make independent sum* accumulators, so unroll manually. + // We cannot use an array because V might be a sizeless type. For reasonable + // code, we unroll 4x, but 8x might help (2 FMA ports * 4 cycle latency). + V sum0 = Zero(d); + V sum1 = Zero(d); + V sum2 = Zero(d); + V sum3 = Zero(d); + const float* const HWY_RESTRICT pa = &a_[0]; + const float* const HWY_RESTRICT pb = b_; + for (size_t i = 0; i < num_items; i += 4 * N) { + const auto a0 = Load(d, pa + i + 0 * N); + const auto b0 = Load(d, pb + i + 0 * N); + sum0 = MulAdd(a0, b0, sum0); + const auto a1 = Load(d, pa + i + 1 * N); + const auto b1 = Load(d, pb + i + 1 * N); + sum1 = MulAdd(a1, b1, sum1); + const auto a2 = Load(d, pa + i + 2 * N); + const auto b2 = Load(d, pb + i + 2 * N); + sum2 = MulAdd(a2, b2, sum2); + const auto a3 = Load(d, pa + i + 3 * N); + const auto b3 = Load(d, pb + i + 3 * N); + sum3 = MulAdd(a3, b3, sum3); + } + // Reduction tree: sum of all accumulators by pairs into sum0. + sum0 = Add(sum0, sum1); + sum2 = Add(sum2, sum3); + sum0 = Add(sum0, sum2); + dot_ = GetLane(SumOfLanes(d, sum0)); + return static_cast(dot_); + } + void Verify(size_t num_items) { + if (dot_ == -1.0f) { + fprintf(stderr, "Dot: must call Verify after benchmark"); + abort(); + } + + const float expected = + std::inner_product(a_.get(), a_.get() + num_items, b_, 0.0f); + const float rel_err = std::abs(expected - dot_) / expected; + if (rel_err > 1.1E-6f) { + fprintf(stderr, "Dot: expected %e actual %e (%e)\n", expected, dot_, + rel_err); + abort(); + } + } + + private: + float dot_; // for Verify +}; + +// INTERMEDIATE: delta coding +// 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold! +struct BenchmarkDelta : public TwoArray { + FuncOutput operator()(const size_t num_items) const { +#if HWY_TARGET == HWY_SCALAR + b_[0] = a_[0]; + for (size_t i = 1; i < num_items; ++i) { + b_[i] = a_[i] - a_[i - 1]; + } +#elif HWY_CAP_GE256 + // Larger vectors are split into 128-bit blocks, easiest to use the + // unaligned load support to shift between them. + const ScalableTag df; + const size_t N = Lanes(df); + size_t i; + b_[0] = a_[0]; + for (i = 1; i < N; ++i) { + b_[i] = a_[i] - a_[i - 1]; + } + for (; i < num_items; i += N) { + const auto a = Load(df, &a_[i]); + const auto shifted = LoadU(df, &a_[i - 1]); + Store(a - shifted, df, &b_[i]); + } +#else // 128-bit + // Slightly better than unaligned loads + const HWY_CAPPED(float, 4) df; + const size_t N = Lanes(df); + size_t i; + b_[0] = a_[0]; + for (i = 1; i < N; ++i) { + b_[i] = a_[i] - a_[i - 1]; + } + auto prev = Load(df, &a_[0]); + for (; i < num_items; i += Lanes(df)) { + const auto a = Load(df, &a_[i]); + const auto shifted = CombineShiftRightLanes<3>(df, a, prev); + prev = a; + Store(Sub(a, shifted), df, &b_[i]); + } +#endif + return static_cast(b_[num_items - 1]); + } + + void Verify(size_t num_items) { + for (size_t i = 0; i < num_items; ++i) { + const float expected = (i == 0) ? a_[0] : a_[i] - a_[i - 1]; + const float err = std::abs(expected - b_[i]); + if (err > 1E-6f) { + fprintf(stderr, "Delta: expected %e, actual %e\n", expected, b_[i]); + } + } + } +}; + +void RunBenchmarks() { + Intro(); + printf("------------------------ %s\n", TargetName(HWY_TARGET)); + RunBenchmark("dot"); + RunBenchmark("delta"); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +HWY_EXPORT(RunBenchmarks); + +void Run() { + for (int64_t target : SupportedAndGeneratedTargets()) { + SetSupportedTargetsForTest(target); + HWY_DYNAMIC_DISPATCH(RunBenchmarks)(); + } + SetSupportedTargetsForTest(0); // Reset the mask afterwards. +} + +} // namespace hwy + +int main(int /*argc*/, char** /*argv*/) { + hwy::Run(); + return 0; +} +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/examples/skeleton-inl.h b/third_party/highway/hwy/examples/skeleton-inl.h new file mode 100644 index 0000000000..8aec33e666 --- /dev/null +++ b/third_party/highway/hwy/examples/skeleton-inl.h @@ -0,0 +1,66 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Demo of functions that might be called from multiple SIMD modules (either +// other -inl.h files, or a .cc file between begin/end_target-inl). This is +// optional - all SIMD code can reside in .cc files. However, this allows +// splitting code into different files while still inlining instead of requiring +// calling through function pointers. + +// Per-target include guard. This is only required when using dynamic dispatch, +// i.e. including foreach_target.h. For static dispatch, a normal include +// guard would be fine because the header is only compiled once. +#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_ +#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_ +#else +#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_ +#endif + +// It is fine to #include normal or *-inl headers. +#include + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace skeleton { +namespace HWY_NAMESPACE { + +// Highway ops reside here; ADL does not find templates nor builtins. +namespace hn = hwy::HWY_NAMESPACE; + +// Example of a type-agnostic (caller-specified lane type) and width-agnostic +// (uses best available instruction set) function in a header. +// +// Computes x[i] = mul_array[i] * x_array[i] + add_array[i] for i < size. +template +HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array, + const T* HWY_RESTRICT add_array, + const size_t size, T* HWY_RESTRICT x_array) { + for (size_t i = 0; i < size; i += hn::Lanes(d)) { + const auto mul = hn::Load(d, mul_array + i); + const auto add = hn::Load(d, add_array + i); + auto x = hn::Load(d, x_array + i); + x = hn::MulAdd(mul, x, add); + hn::Store(x, d, x_array + i); + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace skeleton +HWY_AFTER_NAMESPACE(); + +#endif // include guard diff --git a/third_party/highway/hwy/examples/skeleton.cc b/third_party/highway/hwy/examples/skeleton.cc new file mode 100644 index 0000000000..778ba4ac0a --- /dev/null +++ b/third_party/highway/hwy/examples/skeleton.cc @@ -0,0 +1,122 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/examples/skeleton.h" + +#include + +// >>>> for dynamic dispatch only, skip if you want static dispatch + +// First undef to prevent error when re-included. +#undef HWY_TARGET_INCLUDE +// For dynamic dispatch, specify the name of the current file (unfortunately +// __FILE__ is not reliable) so that foreach_target.h can re-include it. +#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc" +// Generates code for each enabled target by re-including this source file. +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// <<<< end of dynamic dispatch + +// Must come after foreach_target.h to avoid redefinition errors. +#include "hwy/highway.h" + +// Optional, can instead add HWY_ATTR to all functions. +HWY_BEFORE_NAMESPACE(); + +namespace skeleton { +// This namespace name is unique per target, which allows code for multiple +// targets to co-exist in the same translation unit. Required when using dynamic +// dispatch, otherwise optional. +namespace HWY_NAMESPACE { + +// Highway ops reside here; ADL does not find templates nor builtins. +namespace hn = hwy::HWY_NAMESPACE; + +// Computes log2 by converting to a vector of floats. Compiled once per target. +template +HWY_ATTR_NO_MSAN void OneFloorLog2(const DF df, + const uint8_t* HWY_RESTRICT values, + uint8_t* HWY_RESTRICT log2) { + // Type tags for converting to other element types (Rebind = same count). + const hn::RebindToSigned d32; + const hn::Rebind d8; + using VI32 = hn::Vec; + + const VI32 vi32 = hn::PromoteTo(d32, hn::Load(d8, values)); + const VI32 bits = hn::BitCast(d32, hn::ConvertTo(df, vi32)); + const VI32 exponent = hn::Sub(hn::ShiftRight<23>(bits), hn::Set(d32, 127)); + hn::Store(hn::DemoteTo(d8, exponent), d8, log2); +} + +void CodepathDemo() { + // Highway defaults to portability, but per-target codepaths may be selected + // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros: +#if HWY_HAVE_INTEGER64 + const char* gather = "Has int64"; +#else + const char* gather = "No int64"; +#endif + printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), gather); +} + +void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count, + uint8_t* HWY_RESTRICT log2) { + CodepathDemo(); + + const hn::ScalableTag df; + const size_t N = hn::Lanes(df); + size_t i = 0; + for (; i + N <= count; i += N) { + OneFloorLog2(df, values + i, log2 + i); + } + for (; i < count; ++i) { + hn::CappedTag d1; + OneFloorLog2(d1, values + i, log2 + i); + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace skeleton +HWY_AFTER_NAMESPACE(); + +// The table of pointers to the various implementations in HWY_NAMESPACE must +// be compiled only once (foreach_target #includes this file multiple times). +// HWY_ONCE is true for only one of these 'compilation passes'. +#if HWY_ONCE + +namespace skeleton { + +// This macro declares a static array used for dynamic dispatch; it resides in +// the same outer namespace that contains FloorLog2. +HWY_EXPORT(FloorLog2); + +// This function is optional and only needed in the case of exposing it in the +// header file. Otherwise using HWY_DYNAMIC_DISPATCH(FloorLog2) in this module +// is equivalent to inlining this function. +HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in, + const size_t count, + uint8_t* HWY_RESTRICT out) { + // This must reside outside of HWY_NAMESPACE because it references (calls the + // appropriate one from) the per-target implementations there. + // For static dispatch, use HWY_STATIC_DISPATCH. + return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out); +} + +// Optional: anything to compile only once, e.g. non-SIMD implementations of +// public functions provided by this module, can go inside #if HWY_ONCE. + +} // namespace skeleton +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/examples/skeleton.h b/third_party/highway/hwy/examples/skeleton.h new file mode 100644 index 0000000000..381ac69af6 --- /dev/null +++ b/third_party/highway/hwy/examples/skeleton.h @@ -0,0 +1,36 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Demo interface to target-specific code in skeleton.cc + +// Normal header with include guard and namespace. +#ifndef HIGHWAY_HWY_EXAMPLES_SKELETON_H_ +#define HIGHWAY_HWY_EXAMPLES_SKELETON_H_ + +#include + +// Platform-specific definitions used for declaring an interface, independent of +// the SIMD instruction set. +#include "hwy/base.h" // HWY_RESTRICT + +namespace skeleton { + +// Computes base-2 logarithm by converting to float. Supports dynamic dispatch. +HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in, + const size_t count, uint8_t* HWY_RESTRICT out); + +} // namespace skeleton + +#endif // HIGHWAY_HWY_EXAMPLES_SKELETON_H_ diff --git a/third_party/highway/hwy/examples/skeleton_test.cc b/third_party/highway/hwy/examples/skeleton_test.cc new file mode 100644 index 0000000000..c7c26bf5b4 --- /dev/null +++ b/third_party/highway/hwy/examples/skeleton_test.cc @@ -0,0 +1,110 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Example of unit test for the "skeleton" library. + +#include "hwy/examples/skeleton.h" + +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// Must come after foreach_target.h to avoid redefinition errors. +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +// Optional: factor out parts of the implementation into *-inl.h +// (must also come after foreach_target.h to avoid redefinition errors) +#include "hwy/examples/skeleton-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace skeleton { +namespace HWY_NAMESPACE { + +namespace hn = hwy::HWY_NAMESPACE; + +// Calls function defined in skeleton.cc. +struct TestFloorLog2 { + template + HWY_NOINLINE void operator()(T /*unused*/, DF df) { + const size_t count = 5 * hn::Lanes(df); + auto in = hwy::AllocateAligned(count); + auto expected = hwy::AllocateAligned(count); + + hwy::RandomState rng; + for (size_t i = 0; i < count; ++i) { + expected[i] = Random32(&rng) & 7; + in[i] = static_cast(1u << expected[i]); + } + auto out = hwy::AllocateAligned(count); + CallFloorLog2(in.get(), count, out.get()); + int sum = 0; + for (size_t i = 0; i < count; ++i) { + HWY_ASSERT_EQ(expected[i], out[i]); + sum += out[i]; + } + hwy::PreventElision(sum); + } +}; + +HWY_NOINLINE void TestAllFloorLog2() { + hn::ForPartialVectors()(float()); +} + +// Calls function defined in skeleton-inl.h. +struct TestSumMulAdd { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + hwy::RandomState rng; + const size_t count = 4096; + EXPECT_EQ(0, count % hn::Lanes(d)); + auto mul = hwy::AllocateAligned(count); + auto x = hwy::AllocateAligned(count); + auto add = hwy::AllocateAligned(count); + for (size_t i = 0; i < count; ++i) { + mul[i] = static_cast(Random32(&rng) & 0xF); + x[i] = static_cast(Random32(&rng) & 0xFF); + add[i] = static_cast(Random32(&rng) & 0xFF); + } + double expected_sum = 0.0; + for (size_t i = 0; i < count; ++i) { + expected_sum += mul[i] * x[i] + add[i]; + } + + MulAddLoop(d, mul.get(), add.get(), count, x.get()); + HWY_ASSERT_EQ(4344240.0, expected_sum); + } +}; + +HWY_NOINLINE void TestAllSumMulAdd() { + hn::ForFloatTypes(hn::ForPartialVectors()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace skeleton +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace skeleton { +HWY_BEFORE_TEST(SkeletonTest); +HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2); +HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd); +} // namespace skeleton + +#endif -- cgit v1.2.3