diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 17:32:43 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 17:32:43 +0000 |
commit | 6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch) | |
tree | a68f146d7fa01f0134297619fbe7e33db084e0aa /third_party/highway/hwy/contrib/sort/bench_sort.cc | |
parent | Initial commit. (diff) | |
download | thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.tar.xz thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.zip |
Adding upstream version 1:115.7.0.upstream/1%115.7.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/highway/hwy/contrib/sort/bench_sort.cc')
-rw-r--r-- | third_party/highway/hwy/contrib/sort/bench_sort.cc | 310 |
1 files changed, 310 insertions, 0 deletions
diff --git a/third_party/highway/hwy/contrib/sort/bench_sort.cc b/third_party/highway/hwy/contrib/sort/bench_sort.cc new file mode 100644 index 0000000000..a668fde907 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/bench_sort.cc @@ -0,0 +1,310 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stdint.h> +#include <stdio.h> + +#include <vector> + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/algo-inl.h" +#include "hwy/contrib/sort/result-inl.h" +#include "hwy/contrib/sort/sorting_networks-inl.h" // SharedTraits +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/traits128-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +// Mode for larger sorts because M1 is able to access more than the per-core +// share of L2, so 1M elements might still be in cache. +#define SORT_100M 0 + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +// Defined within HWY_ONCE, used by BenchAllSort. +extern int64_t first_sort_target; + +namespace HWY_NAMESPACE { +namespace { +using detail::TraitsLane; +using detail::OrderAscending; +using detail::OrderDescending; +using detail::SharedTraits; + +#if VQSORT_ENABLED || HWY_IDE +using detail::OrderAscending128; +using detail::OrderAscendingKV128; +using detail::Traits128; + +template <class Traits> +HWY_NOINLINE void BenchPartition() { + using LaneType = typename Traits::LaneType; + using KeyType = typename Traits::KeyType; + const SortTag<LaneType> d; + detail::SharedTraits<Traits> st; + const Dist dist = Dist::kUniform8; + double sum = 0.0; + + detail::Generator rng(&sum, 123); // for ChoosePivot + + const size_t max_log2 = AdjustedLog2Reps(20); + for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) { + const size_t num_lanes = 1ull << log2; + const size_t num_keys = num_lanes / st.LanesPerKey(); + auto aligned = hwy::AllocateAligned<LaneType>(num_lanes); + auto buf = hwy::AllocateAligned<LaneType>( + HWY_MAX(hwy::SortConstants::PartitionBufNum(Lanes(d)), + hwy::SortConstants::PivotBufNum(sizeof(LaneType), Lanes(d)))); + + std::vector<double> seconds; + const size_t num_reps = (1ull << (14 - log2 / 2)) * 30; + for (size_t rep = 0; rep < num_reps; ++rep) { + (void)GenerateInput(dist, aligned.get(), num_lanes); + + // The pivot value can influence performance. Do exactly what vqsort will + // do so that the performance (influenced by prefetching and branch + // prediction) is likely to predict the actual performance inside vqsort. + detail::DrawSamples(d, st, aligned.get(), num_lanes, buf.get(), rng); + detail::SortSamples(d, st, buf.get()); + auto pivot = detail::ChoosePivotByRank(d, st, buf.get()); + + const Timestamp t0; + detail::Partition(d, st, aligned.get(), num_lanes - 1, pivot, buf.get()); + seconds.push_back(SecondsSince(t0)); + // 'Use' the result to prevent optimizing out the partition. + sum += static_cast<double>(aligned.get()[num_lanes / 2]); + } + + Result(Algo::kVQSort, dist, num_keys, 1, SummarizeMeasurements(seconds), + sizeof(KeyType), st.KeyString()) + .Print(); + } + HWY_ASSERT(sum != 999999); // Prevent optimizing out +} + +HWY_NOINLINE void BenchAllPartition() { + // Not interested in benchmark results for these targets + if (HWY_TARGET == HWY_SSSE3) { + return; + } + + BenchPartition<TraitsLane<OrderDescending<float>>>(); + BenchPartition<TraitsLane<OrderDescending<int32_t>>>(); + BenchPartition<TraitsLane<OrderDescending<int64_t>>>(); + BenchPartition<Traits128<OrderAscending128>>(); + // BenchPartition<Traits128<OrderDescending128>>(); + BenchPartition<Traits128<OrderAscendingKV128>>(); +} + +template <class Traits> +HWY_NOINLINE void BenchBase(std::vector<Result>& results) { + // Not interested in benchmark results for these targets + if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) { + return; + } + + using LaneType = typename Traits::LaneType; + using KeyType = typename Traits::KeyType; + const SortTag<LaneType> d; + detail::SharedTraits<Traits> st; + const Dist dist = Dist::kUniform32; + + const size_t N = Lanes(d); + const size_t num_lanes = SortConstants::BaseCaseNum(N); + const size_t num_keys = num_lanes / st.LanesPerKey(); + auto keys = hwy::AllocateAligned<LaneType>(num_lanes); + auto buf = hwy::AllocateAligned<LaneType>(num_lanes + N); + + std::vector<double> seconds; + double sum = 0; // prevents elision + constexpr size_t kMul = AdjustedReps(600); // ensures long enough to measure + + for (size_t rep = 0; rep < 30; ++rep) { + InputStats<LaneType> input_stats = + GenerateInput(dist, keys.get(), num_lanes); + + const Timestamp t0; + for (size_t i = 0; i < kMul; ++i) { + detail::BaseCase(d, st, keys.get(), keys.get() + num_lanes, num_lanes, + buf.get()); + sum += static_cast<double>(keys[0]); + } + seconds.push_back(SecondsSince(t0)); + // printf("%f\n", seconds.back()); + + HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num_lanes, "BenchBase")); + } + HWY_ASSERT(sum < 1E99); + results.emplace_back(Algo::kVQSort, dist, num_keys * kMul, 1, + SummarizeMeasurements(seconds), sizeof(KeyType), + st.KeyString()); +} + +HWY_NOINLINE void BenchAllBase() { + // Not interested in benchmark results for these targets + if (HWY_TARGET == HWY_SSSE3) { + return; + } + + std::vector<Result> results; + BenchBase<TraitsLane<OrderAscending<float>>>(results); + BenchBase<TraitsLane<OrderDescending<int64_t>>>(results); + BenchBase<Traits128<OrderAscending128>>(results); + for (const Result& r : results) { + r.Print(); + } +} + +#else +void BenchAllPartition() {} +void BenchAllBase() {} +#endif // VQSORT_ENABLED + +std::vector<Algo> AlgoForBench() { + return { +#if HAVE_AVX2SORT + Algo::kSEA, +#endif +#if HAVE_PARALLEL_IPS4O + Algo::kParallelIPS4O, +#elif HAVE_IPS4O + Algo::kIPS4O, +#endif +#if HAVE_PDQSORT + Algo::kPDQ, +#endif +#if HAVE_SORT512 + Algo::kSort512, +#endif +// Only include if we're compiling for the target it supports. +#if HAVE_VXSORT && ((VXSORT_AVX3 && HWY_TARGET == HWY_AVX3) || \ + (!VXSORT_AVX3 && HWY_TARGET == HWY_AVX2)) + Algo::kVXSort, +#endif + +#if !HAVE_PARALLEL_IPS4O +#if !SORT_100M + // These are 10-20x slower, but that's OK for the default size when we + // are not testing the parallel nor 100M modes. + Algo::kStd, Algo::kHeap, +#endif + + Algo::kVQSort, // only ~4x slower, but not required for Table 1a +#endif + }; +} + +template <class Traits> +HWY_NOINLINE void BenchSort(size_t num_keys) { + if (first_sort_target == 0) first_sort_target = HWY_TARGET; + + SharedState shared; + detail::SharedTraits<Traits> st; + using Order = typename Traits::Order; + using LaneType = typename Traits::LaneType; + using KeyType = typename Traits::KeyType; + const size_t num_lanes = num_keys * st.LanesPerKey(); + auto aligned = hwy::AllocateAligned<LaneType>(num_lanes); + + const size_t reps = num_keys > 1000 * 1000 ? 10 : 30; + + for (Algo algo : AlgoForBench()) { + // Other algorithms don't depend on the vector instructions, so only run + // them for the first target. +#if !HAVE_VXSORT + if (algo != Algo::kVQSort && HWY_TARGET != first_sort_target) { + continue; + } +#endif + + for (Dist dist : AllDist()) { + std::vector<double> seconds; + for (size_t rep = 0; rep < reps; ++rep) { + InputStats<LaneType> input_stats = + GenerateInput(dist, aligned.get(), num_lanes); + + const Timestamp t0; + Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys, + shared, /*thread=*/0); + seconds.push_back(SecondsSince(t0)); + // printf("%f\n", seconds.back()); + + HWY_ASSERT( + VerifySort(st, input_stats, aligned.get(), num_lanes, "BenchSort")); + } + Result(algo, dist, num_keys, 1, SummarizeMeasurements(seconds), + sizeof(KeyType), st.KeyString()) + .Print(); + } // dist + } // algo +} + +HWY_NOINLINE void BenchAllSort() { + // Not interested in benchmark results for these targets + if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) { + return; + } + + constexpr size_t K = 1000; + constexpr size_t M = K * K; + (void)K; + (void)M; + for (size_t num_keys : { +#if HAVE_PARALLEL_IPS4O || SORT_100M + 100 * M, +#else + 1 * M, +#endif + }) { + BenchSort<TraitsLane<OrderAscending<float>>>(num_keys); + // BenchSort<TraitsLane<OrderDescending<double>>>(num_keys); + // BenchSort<TraitsLane<OrderAscending<int16_t>>>(num_keys); + BenchSort<TraitsLane<OrderDescending<int32_t>>>(num_keys); + BenchSort<TraitsLane<OrderAscending<int64_t>>>(num_keys); + // BenchSort<TraitsLane<OrderDescending<uint16_t>>>(num_keys); + // BenchSort<TraitsLane<OrderDescending<uint32_t>>>(num_keys); + // BenchSort<TraitsLane<OrderAscending<uint64_t>>>(num_keys); + +#if !HAVE_VXSORT && VQSORT_ENABLED + BenchSort<Traits128<OrderAscending128>>(num_keys); + BenchSort<Traits128<OrderAscendingKV128>>(num_keys); +#endif + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +int64_t first_sort_target = 0; // none run yet +namespace { +HWY_BEFORE_TEST(BenchSort); +HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition); +HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllBase); +HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllSort); +} // namespace +} // namespace hwy + +#endif // HWY_ONCE |