diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
commit | 26a029d407be480d791972afb5975cf62c9360a6 (patch) | |
tree | f435a8308119effd964b339f76abb83a57c29483 /third_party/highway/hwy/contrib/sort | |
parent | Initial commit. (diff) | |
download | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz firefox-26a029d407be480d791972afb5975cf62c9360a6.zip |
Adding upstream version 124.0.1. (tag: upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/highway/hwy/contrib/sort')
37 files changed, 7781 insertions, 0 deletions
diff --git a/third_party/highway/hwy/contrib/sort/BUILD b/third_party/highway/hwy/contrib/sort/BUILD new file mode 100644 index 0000000000..dc15341908 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/BUILD @@ -0,0 +1,210 @@ +package( + default_applicable_licenses = ["//:license"], + default_visibility = ["//visibility:public"], +) + +licenses(["notice"]) + +# Unused on Bazel builds, where this is not defined/known; Copybara replaces +# usages with an empty list. +COMPAT = [ + "//buildenv/target:non_prod", # includes mobile/vendor. +] + +cc_library( + name = "intel", + # hdrs = select({ + # "//third_party/bazel_platforms/cpu:x86_64": [ + # "avx512-16bit-common.h", + # "avx512-16bit-qsort.hpp", + # "avx512-32bit-qsort.hpp", + # "avx512-64bit-common.h", + # "avx512-64bit-qsort.hpp", + # "avx512-common-qsort.h", + # ], + # "//conditions:default": [], + # }), + compatible_with = [], +) + +cc_library( + name = "vxsort", + srcs = [ + # "vxsort/isa_detection.cpp", + # "vxsort/isa_detection_msvc.cpp", + # "vxsort/isa_detection_sane.cpp", + # "vxsort/machine_traits.avx2.cpp", + # "vxsort/smallsort/avx2_load_mask_tables.cpp", + # "vxsort/smallsort/bitonic_sort.AVX2.double.generated.cpp", + # "vxsort/smallsort/bitonic_sort.AVX2.float.generated.cpp", + # "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp", + # "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp", + # "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.cpp", + # "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.cpp", + # "vxsort/smallsort/bitonic_sort.AVX512.double.generated.cpp", + # "vxsort/smallsort/bitonic_sort.AVX512.float.generated.cpp", + # "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp", + # "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp", + # "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.cpp", + # "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.cpp", + # "vxsort/vxsort_stats.cpp", + ], + hdrs = [ + # "vxsort/alignment.h", + # 
"vxsort/defs.h", + # "vxsort/isa_detection.h", + # "vxsort/machine_traits.avx2.h", + # "vxsort/machine_traits.avx512.h", + # "vxsort/machine_traits.h", + # "vxsort/packer.h", + # "vxsort/smallsort/bitonic_sort.AVX2.double.generated.h", + # "vxsort/smallsort/bitonic_sort.AVX2.float.generated.h", + # "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h", + # "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h", + # "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.h", + # "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.h", + # "vxsort/smallsort/bitonic_sort.AVX512.double.generated.h", + # "vxsort/smallsort/bitonic_sort.AVX512.float.generated.h", + # "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h", + # "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h", + # "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.h", + # "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.h", + # "vxsort/smallsort/bitonic_sort.h", + # "vxsort/vxsort.h", + # "vxsort/vxsort_stats.h", + ], + compatible_with = [], + textual_hdrs = [ + # "vxsort/vxsort_targets_disable.h", + # "vxsort/vxsort_targets_enable_avx2.h", + # "vxsort/vxsort_targets_enable_avx512.h", + ], +) + +cc_library( + name = "vqsort", + srcs = [ + # Split into separate files to reduce MSVC build time. 
+ "vqsort.cc", + "vqsort_128a.cc", + "vqsort_128d.cc", + "vqsort_f32a.cc", + "vqsort_f32d.cc", + "vqsort_f64a.cc", + "vqsort_f64d.cc", + "vqsort_i16a.cc", + "vqsort_i16d.cc", + "vqsort_i32a.cc", + "vqsort_i32d.cc", + "vqsort_i64a.cc", + "vqsort_i64d.cc", + "vqsort_kv64a.cc", + "vqsort_kv64d.cc", + "vqsort_kv128a.cc", + "vqsort_kv128d.cc", + "vqsort_u16a.cc", + "vqsort_u16d.cc", + "vqsort_u32a.cc", + "vqsort_u32d.cc", + "vqsort_u64a.cc", + "vqsort_u64d.cc", + ], + hdrs = [ + "vqsort.h", # public interface + ], + compatible_with = [], + local_defines = ["hwy_contrib_EXPORTS"], + textual_hdrs = [ + "shared-inl.h", + "sorting_networks-inl.h", + "traits-inl.h", + "traits128-inl.h", + "vqsort-inl.h", + # Placeholder for internal instrumentation. Do not remove. + ], + deps = [ + ":intel", # required if HAVE_INTEL + ":vxsort", # required if HAVE_VXSORT + "//:hwy", + ], +) + +# ----------------------------------------------------------------------------- +# Internal-only targets + +cc_library( + name = "helpers", + testonly = 1, + textual_hdrs = [ + "algo-inl.h", + "result-inl.h", + ], + deps = [ + ":vqsort", + "//:nanobenchmark", + # Required for HAVE_PDQSORT, but that is unused and this is + # unavailable to Bazel builds, hence commented out. + # "//third_party/boost/allowed", + # Avoid ips4o and thus TBB to work around hwloc build failure. + ], +) + +cc_binary( + name = "print_network", + testonly = 1, + srcs = ["print_network.cc"], + deps = [ + ":helpers", + ":vqsort", + "//:hwy", + ], +) + +cc_test( + name = "sort_test", + size = "medium", + srcs = ["sort_test.cc"], + # Do not enable fully_static_link (pthread crash on bazel) + local_defines = ["HWY_IS_TEST"], + # for test_suite. 
+ tags = ["hwy_ops_test"], + deps = [ + ":helpers", + ":vqsort", + "@com_google_googletest//:gtest_main", + "//:hwy", + "//:hwy_test_util", + ], +) + +cc_test( + name = "bench_sort", + size = "medium", + srcs = ["bench_sort.cc"], + # Do not enable fully_static_link (pthread crash on bazel) + local_defines = ["HWY_IS_TEST"], + # for test_suite. + tags = ["hwy_ops_test"], + deps = [ + ":helpers", + ":vqsort", + "@com_google_googletest//:gtest_main", + "//:hwy", + "//:hwy_test_util", + ], +) + +cc_binary( + name = "bench_parallel", + testonly = 1, + srcs = ["bench_parallel.cc"], + # Do not enable fully_static_link (pthread crash on bazel) + local_defines = ["HWY_IS_TEST"], + deps = [ + ":helpers", + ":vqsort", + "@com_google_googletest//:gtest_main", + "//:hwy", + "//:hwy_test_util", + ], +) diff --git a/third_party/highway/hwy/contrib/sort/README.md b/third_party/highway/hwy/contrib/sort/README.md new file mode 100644 index 0000000000..46047e6359 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/README.md @@ -0,0 +1,109 @@ +# Vectorized and performance-portable Quicksort + +## Introduction + +As of 2022-06-07 this sorts large arrays of built-in types about ten times as +fast as `std::sort`. See also our +[blog post](https://opensource.googleblog.com/2022/06/Vectorized%20and%20performance%20portable%20Quicksort.html) +and [paper](https://arxiv.org/abs/2205.05982). + +## Instructions + +Here are instructions for reproducing our results with cross-platform CMake, +Linux, or AWS (SVE, NEON). + +### CMake, any platform + +Please first ensure that Clang (tested with 13.0.1 and 15.0.6) is installed, and +if it is not the default compiler, point the CC and CXX environment variables to +it, e.g. + +``` +export CC=clang-15 +export CXX=clang++-15 +``` + +Then run the usual CMake workflow, also documented in the Highway README, e.g.: + +``` +mkdir -p build && cd build && cmake .. 
&& make -j +taskset -c 2 tests/bench_sort +``` + +The optional `taskset -c 2` part reduces the variability of measurements by +preventing the OS from migrating the benchmark between cores. + +### Linux + +Please first ensure that golang and Clang (tested with 13.0.1) are installed via +your system's package manager. + +``` +go install github.com/bazelbuild/bazelisk@latest +git clone https://github.com/google/highway +cd highway +CC=clang CXX=clang++ ~/go/bin/bazelisk build -c opt hwy/contrib/sort:all +bazel-bin/hwy/contrib/sort/sort_test +bazel-bin/hwy/contrib/sort/bench_sort +``` + +### AWS Graviton3 + +Instance config: amazon linux 5.10 arm64, c7g.8xlarge (largest allowed config is +32 vCPU). Initial launch will fail. Wait a few minutes for an email saying the +config is verified, then re-launch. See IPv4 hostname in list of instances. + +`ssh -i /path/key.pem ec2-user@hostname` + +Note that the AWS CMake package is too old for llvm, so we build it first: +``` +wget https://cmake.org/files/v3.23/cmake-3.23.2.tar.gz +tar -xvzf cmake-3.23.2.tar.gz && cd cmake-3.23.2/ +./bootstrap -- -DCMAKE_USE_OPENSSL=OFF +make -j8 && sudo make install +cd .. +``` + +AWS clang is at version 11.1, which generates unnecessary `AND` instructions +that slow down the sort by 1.15x. We tested with clang trunk as of June 13 +(which reports Git hash 8f6512fea000c3a0d394864bb94e524bee375069). 
To build: + +``` +git clone --depth 1 https://github.com/llvm/llvm-project.git +cd llvm-project +mkdir -p build && cd build +/usr/local/bin/cmake ../llvm -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" -DCMAKE_BUILD_TYPE=Release +make -j32 && sudo make install +``` + +``` +sudo yum install go +go install github.com/bazelbuild/bazelisk@latest +git clone https://github.com/google/highway +cd highway +CC=/usr/local/bin/clang CXX=/usr/local/bin/clang++ ~/go/bin/bazelisk build -c opt --copt=-march=armv8.2-a+sve hwy/contrib/sort:all +bazel-bin/hwy/contrib/sort/sort_test +bazel-bin/hwy/contrib/sort/bench_sort +``` + +The above command line enables SVE, which is currently only available on +Graviton 3. You can also test NEON on the same processor, or other Arm CPUs, by +changing the `-march=` option to `--copt=-march=armv8.2-a+crypto`. Note that +such flags will be unnecessary once Clang supports `#pragma target` for NEON and +SVE intrinsics, as it does for x86. + +## Results + +`bench_sort` outputs the instruction set (AVX3 refers to AVX-512), the sort +algorithm (std for `std::sort`, vq for our vqsort), the type of keys being +sorted (f32 is float), the distribution of keys (uniform32 for uniform random +with range 0-2^32), the number of keys, then the throughput of sorted keys (i.e. +number of key bytes output per second). 
+ +Example excerpt from Xeon 6154 (Skylake-X) CPU clocked at 3 GHz: + +``` +[ RUN ] BenchSortGroup/BenchSort.BenchAllSort/AVX3 + AVX3: std: f32: uniform32: 1.00E+06 54 MB/s ( 1 threads) + AVX3: vq: f32: uniform32: 1.00E+06 1143 MB/s ( 1 threads) +``` diff --git a/third_party/highway/hwy/contrib/sort/algo-inl.h b/third_party/highway/hwy/contrib/sort/algo-inl.h new file mode 100644 index 0000000000..546843e101 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/algo-inl.h @@ -0,0 +1,553 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Normal include guard for target-independent parts +#ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_ +#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_ + +#include <stdint.h> +#include <string.h> // memcpy + +#include <algorithm> // std::sort, std::min, std::max +#include <functional> // std::less, std::greater +#include <vector> + +#include "hwy/base.h" +#include "hwy/contrib/sort/vqsort.h" +#include "hwy/print.h" + +// Third-party algorithms +#define HAVE_AVX2SORT 0 +#define HAVE_IPS4O 0 +// When enabling, consider changing max_threads (required for Table 1a) +#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1) +#define HAVE_PDQSORT 0 +#define HAVE_SORT512 0 +#define HAVE_VXSORT 0 +#if HWY_ARCH_X86 +#define HAVE_INTEL 0 +#else +#define HAVE_INTEL 0 +#endif + +#if HAVE_PARALLEL_IPS4O +#include <thread> // NOLINT +#endif + +#if HAVE_AVX2SORT +HWY_PUSH_ATTRIBUTES("avx2,avx") +#include "avx2sort.h" //NOLINT +HWY_POP_ATTRIBUTES +#endif +#if HAVE_IPS4O || HAVE_PARALLEL_IPS4O +#include "third_party/ips4o/include/ips4o.hpp" +#include "third_party/ips4o/include/ips4o/thread_pool.hpp" +#endif +#if HAVE_PDQSORT +#include "third_party/boost/allowed/sort/sort.hpp" +#endif +#if HAVE_SORT512 +#include "sort512.h" //NOLINT +#endif + +// vxsort is difficult to compile for multiple targets because it also uses +// .cpp files, and we'd also have to #undef its include guards. Instead, compile +// only for AVX2 or AVX3 depending on this macro. 
+#define VXSORT_AVX3 1 +#if HAVE_VXSORT +// inlined from vxsort_targets_enable_avx512 (must close before end of header) +#ifdef __GNUC__ +#ifdef __clang__ +#if VXSORT_AVX3 +#pragma clang attribute push(__attribute__((target("avx512f,avx512dq"))), \ + apply_to = any(function)) +#else +#pragma clang attribute push(__attribute__((target("avx2"))), \ + apply_to = any(function)) +#endif // VXSORT_AVX3 + +#else +#pragma GCC push_options +#if VXSORT_AVX3 +#pragma GCC target("avx512f,avx512dq") +#else +#pragma GCC target("avx2") +#endif // VXSORT_AVX3 +#endif +#endif + +#if VXSORT_AVX3 +#include "vxsort/machine_traits.avx512.h" +#else +#include "vxsort/machine_traits.avx2.h" +#endif // VXSORT_AVX3 +#include "vxsort/vxsort.h" +#ifdef __GNUC__ +#ifdef __clang__ +#pragma clang attribute pop +#else +#pragma GCC pop_options +#endif +#endif +#endif // HAVE_VXSORT + +namespace hwy { + +enum class Dist { kUniform8, kUniform16, kUniform32 }; + +static inline std::vector<Dist> AllDist() { + return {/*Dist::kUniform8, Dist::kUniform16,*/ Dist::kUniform32}; +} + +static inline const char* DistName(Dist dist) { + switch (dist) { + case Dist::kUniform8: + return "uniform8"; + case Dist::kUniform16: + return "uniform16"; + case Dist::kUniform32: + return "uniform32"; + } + return "unreachable"; +} + +template <typename T> +class InputStats { + public: + void Notify(T value) { + min_ = std::min(min_, value); + max_ = std::max(max_, value); + // Converting to integer would truncate floats, multiplying to save digits + // risks overflow especially when casting, so instead take the sum of the + // bit representations as the checksum. 
+ uint64_t bits = 0; + static_assert(sizeof(T) <= 8, "Expected a built-in type"); + CopyBytes<sizeof(T)>(&value, &bits); // not same size + sum_ += bits; + count_ += 1; + } + + bool operator==(const InputStats& other) const { + char type_name[100]; + detail::TypeName(hwy::detail::MakeTypeInfo<T>(), 1, type_name); + + if (count_ != other.count_) { + HWY_ABORT("Sort %s: count %d vs %d\n", type_name, + static_cast<int>(count_), static_cast<int>(other.count_)); + } + + if (min_ != other.min_ || max_ != other.max_) { + HWY_ABORT("Sort %s: minmax %f/%f vs %f/%f\n", type_name, + static_cast<double>(min_), static_cast<double>(max_), + static_cast<double>(other.min_), + static_cast<double>(other.max_)); + } + + // Sum helps detect duplicated/lost values + if (sum_ != other.sum_) { + HWY_ABORT("Sort %s: Sum mismatch %g %g; min %g max %g\n", type_name, + static_cast<double>(sum_), static_cast<double>(other.sum_), + static_cast<double>(min_), static_cast<double>(max_)); + } + + return true; + } + + private: + T min_ = hwy::HighestValue<T>(); + T max_ = hwy::LowestValue<T>(); + uint64_t sum_ = 0; + size_t count_ = 0; +}; + +enum class Algo { +#if HAVE_INTEL + kIntel, +#endif +#if HAVE_AVX2SORT + kSEA, +#endif +#if HAVE_IPS4O + kIPS4O, +#endif +#if HAVE_PARALLEL_IPS4O + kParallelIPS4O, +#endif +#if HAVE_PDQSORT + kPDQ, +#endif +#if HAVE_SORT512 + kSort512, +#endif +#if HAVE_VXSORT + kVXSort, +#endif + kStd, + kVQSort, + kHeap, +}; + +static inline const char* AlgoName(Algo algo) { + switch (algo) { +#if HAVE_INTEL + case Algo::kIntel: + return "intel"; +#endif +#if HAVE_AVX2SORT + case Algo::kSEA: + return "sea"; +#endif +#if HAVE_IPS4O + case Algo::kIPS4O: + return "ips4o"; +#endif +#if HAVE_PARALLEL_IPS4O + case Algo::kParallelIPS4O: + return "par_ips4o"; +#endif +#if HAVE_PDQSORT + case Algo::kPDQ: + return "pdq"; +#endif +#if HAVE_SORT512 + case Algo::kSort512: + return "sort512"; +#endif +#if HAVE_VXSORT + case Algo::kVXSort: + return "vxsort"; +#endif + case Algo::kStd: + 
return "std"; + case Algo::kVQSort: + return "vq"; + case Algo::kHeap: + return "heap"; + } + return "unreachable"; +} + +} // namespace hwy +#endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_ + +// Per-target +#if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE +#undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE +#else +#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE +#endif + +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/traits128-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" // HeapSort +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); + +// Requires target pragma set by HWY_BEFORE_NAMESPACE +#if HAVE_INTEL && HWY_TARGET <= HWY_AVX3 +// #include "avx512-16bit-qsort.hpp" // requires vbmi2 +#include "avx512-32bit-qsort.hpp" +#include "avx512-64bit-qsort.hpp" +#endif + +namespace hwy { +namespace HWY_NAMESPACE { + +#if HAVE_INTEL // only supports ascending order +template <typename T> +using OtherOrder = detail::OrderAscending<T>; +#else +template <typename T> +using OtherOrder = detail::OrderDescending<T>; +#endif + +class Xorshift128Plus { + static HWY_INLINE uint64_t SplitMix64(uint64_t z) { + z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull; + z = (z ^ (z >> 27)) * 0x94D049BB133111EBull; + return z ^ (z >> 31); + } + + public: + // Generates two vectors of 64-bit seeds via SplitMix64 and stores into + // `seeds`. Generating these afresh in each ChoosePivot is too expensive. + template <class DU64> + static void GenerateSeeds(DU64 du64, TFromD<DU64>* HWY_RESTRICT seeds) { + seeds[0] = SplitMix64(0x9E3779B97F4A7C15ull); + for (size_t i = 1; i < 2 * Lanes(du64); ++i) { + seeds[i] = SplitMix64(seeds[i - 1]); + } + } + + // Need to pass in the state because vector cannot be class members. 
+ template <class VU64> + static VU64 RandomBits(VU64& state0, VU64& state1) { + VU64 s1 = state0; + VU64 s0 = state1; + const VU64 bits = Add(s1, s0); + state0 = s0; + s1 = Xor(s1, ShiftLeft<23>(s1)); + state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0)))); + return bits; + } +}; + +template <class D, class VU64, HWY_IF_NOT_FLOAT_D(D)> +Vec<D> RandomValues(D d, VU64& s0, VU64& s1, const VU64 mask) { + const VU64 bits = Xorshift128Plus::RandomBits(s0, s1); + return BitCast(d, And(bits, mask)); +} + +// It is important to avoid denormals, which are flushed to zero by SIMD but not +// scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD. +template <class DF, class VU64, HWY_IF_FLOAT_D(DF)> +Vec<DF> RandomValues(DF df, VU64& s0, VU64& s1, const VU64 mask) { + using TF = TFromD<DF>; + const RebindToUnsigned<decltype(df)> du; + using VU = Vec<decltype(du)>; + + const VU64 bits64 = And(Xorshift128Plus::RandomBits(s0, s1), mask); + +#if HWY_TARGET == HWY_SCALAR // Cannot repartition u64 to smaller types + using TU = MakeUnsigned<TF>; + const VU bits = Set(du, static_cast<TU>(GetLane(bits64) & LimitsMax<TU>())); +#else + const VU bits = BitCast(du, bits64); +#endif + // Avoid NaN/denormal by only generating values in [1, 2), i.e. random + // mantissas with the exponent taken from the representation of 1.0. + const VU k1 = BitCast(du, Set(df, TF{1.0})); + const VU mantissa_mask = Set(du, MantissaMask<TF>()); + const VU representation = OrAnd(k1, bits, mantissa_mask); + return BitCast(df, representation); +} + +template <class DU64> +Vec<DU64> MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t) { + switch (sizeof_t) { + case 2: + return Set(du64, (dist == Dist::kUniform8) ? 0x00FF00FF00FF00FFull + : 0xFFFFFFFFFFFFFFFFull); + case 4: + return Set(du64, (dist == Dist::kUniform8) ? 0x000000FF000000FFull + : (dist == Dist::kUniform16) ? 
0x0000FFFF0000FFFFull + : 0xFFFFFFFFFFFFFFFFull); + case 8: + return Set(du64, (dist == Dist::kUniform8) ? 0x00000000000000FFull + : (dist == Dist::kUniform16) ? 0x000000000000FFFFull + : 0x00000000FFFFFFFFull); + default: + HWY_ABORT("Logic error"); + return Zero(du64); + } +} + +template <typename T> +InputStats<T> GenerateInput(const Dist dist, T* v, size_t num) { + SortTag<uint64_t> du64; + using VU64 = Vec<decltype(du64)>; + const size_t N64 = Lanes(du64); + auto seeds = hwy::AllocateAligned<uint64_t>(2 * N64); + Xorshift128Plus::GenerateSeeds(du64, seeds.get()); + VU64 s0 = Load(du64, seeds.get()); + VU64 s1 = Load(du64, seeds.get() + N64); + +#if HWY_TARGET == HWY_SCALAR + const Sisd<T> d; +#else + const Repartition<T, decltype(du64)> d; +#endif + using V = Vec<decltype(d)>; + const size_t N = Lanes(d); + const VU64 mask = MaskForDist(du64, dist, sizeof(T)); + auto buf = hwy::AllocateAligned<T>(N); + + size_t i = 0; + for (; i + N <= num; i += N) { + const V values = RandomValues(d, s0, s1, mask); + StoreU(values, d, v + i); + } + if (i < num) { + const V values = RandomValues(d, s0, s1, mask); + StoreU(values, d, buf.get()); + memcpy(v + i, buf.get(), (num - i) * sizeof(T)); + } + + InputStats<T> input_stats; + for (size_t i = 0; i < num; ++i) { + input_stats.Notify(v[i]); + } + return input_stats; +} + +struct SharedState { +#if HAVE_PARALLEL_IPS4O + const unsigned max_threads = hwy::LimitsMax<unsigned>(); // 16 for Table 1a + ips4o::StdThreadPool pool{static_cast<int>( + HWY_MIN(max_threads, std::thread::hardware_concurrency() / 2))}; +#endif +}; + +// Bridge from keys (passed to Run) to lanes as expected by HeapSort. 
For +// non-128-bit keys they are the same: +template <class Order, typename KeyType, HWY_IF_NOT_T_SIZE(KeyType, 16)> +void CallHeapSort(KeyType* HWY_RESTRICT keys, const size_t num_keys) { + using detail::TraitsLane; + using detail::SharedTraits; + if (Order().IsAscending()) { + const SharedTraits<TraitsLane<detail::OrderAscending<KeyType>>> st; + return detail::HeapSort(st, keys, num_keys); + } else { + const SharedTraits<TraitsLane<detail::OrderDescending<KeyType>>> st; + return detail::HeapSort(st, keys, num_keys); + } +} + +#if VQSORT_ENABLED +template <class Order> +void CallHeapSort(hwy::uint128_t* HWY_RESTRICT keys, const size_t num_keys) { + using detail::SharedTraits; + using detail::Traits128; + uint64_t* lanes = reinterpret_cast<uint64_t*>(keys); + const size_t num_lanes = num_keys * 2; + if (Order().IsAscending()) { + const SharedTraits<Traits128<detail::OrderAscending128>> st; + return detail::HeapSort(st, lanes, num_lanes); + } else { + const SharedTraits<Traits128<detail::OrderDescending128>> st; + return detail::HeapSort(st, lanes, num_lanes); + } +} + +template <class Order> +void CallHeapSort(K64V64* HWY_RESTRICT keys, const size_t num_keys) { + using detail::SharedTraits; + using detail::Traits128; + uint64_t* lanes = reinterpret_cast<uint64_t*>(keys); + const size_t num_lanes = num_keys * 2; + if (Order().IsAscending()) { + const SharedTraits<Traits128<detail::OrderAscendingKV128>> st; + return detail::HeapSort(st, lanes, num_lanes); + } else { + const SharedTraits<Traits128<detail::OrderDescendingKV128>> st; + return detail::HeapSort(st, lanes, num_lanes); + } +} +#endif // VQSORT_ENABLED + +template <class Order, typename KeyType> +void Run(Algo algo, KeyType* HWY_RESTRICT inout, size_t num, + SharedState& shared, size_t /*thread*/) { + const std::less<KeyType> less; + const std::greater<KeyType> greater; + +#if !HAVE_PARALLEL_IPS4O + (void)shared; +#endif + + switch (algo) { +#if HAVE_INTEL && HWY_TARGET <= HWY_AVX3 + case Algo::kIntel: + 
return avx512_qsort<KeyType>(inout, static_cast<int64_t>(num)); +#endif + +#if HAVE_AVX2SORT + case Algo::kSEA: + return avx2::quicksort(inout, static_cast<int>(num)); +#endif + +#if HAVE_IPS4O + case Algo::kIPS4O: + if (Order().IsAscending()) { + return ips4o::sort(inout, inout + num, less); + } else { + return ips4o::sort(inout, inout + num, greater); + } +#endif + +#if HAVE_PARALLEL_IPS4O + case Algo::kParallelIPS4O: + if (Order().IsAscending()) { + return ips4o::parallel::sort(inout, inout + num, less, shared.pool); + } else { + return ips4o::parallel::sort(inout, inout + num, greater, shared.pool); + } +#endif + +#if HAVE_SORT512 + case Algo::kSort512: + HWY_ABORT("not supported"); + // return Sort512::Sort(inout, num); +#endif + +#if HAVE_PDQSORT + case Algo::kPDQ: + if (Order().IsAscending()) { + return boost::sort::pdqsort_branchless(inout, inout + num, less); + } else { + return boost::sort::pdqsort_branchless(inout, inout + num, greater); + } +#endif + +#if HAVE_VXSORT + case Algo::kVXSort: { +#if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \ + (!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2) + fprintf(stderr, "Do not call for target %s\n", + hwy::TargetName(HWY_TARGET)); + return; +#else +#if VXSORT_AVX3 + vxsort::vxsort<KeyType, vxsort::AVX512> vx; +#else + vxsort::vxsort<KeyType, vxsort::AVX2> vx; +#endif + if (Order().IsAscending()) { + return vx.sort(inout, inout + num - 1); + } else { + fprintf(stderr, "Skipping VX - does not support descending order\n"); + return; + } +#endif // enabled for this target + } +#endif // HAVE_VXSORT + + case Algo::kStd: + if (Order().IsAscending()) { + return std::sort(inout, inout + num, less); + } else { + return std::sort(inout, inout + num, greater); + } + + case Algo::kVQSort: + return VQSort(inout, num, Order()); + + case Algo::kHeap: + return CallHeapSort<Order>(inout, num); + + default: + HWY_ABORT("Not implemented"); + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // 
namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE diff --git a/third_party/highway/hwy/contrib/sort/bench_parallel.cc b/third_party/highway/hwy/contrib/sort/bench_parallel.cc new file mode 100644 index 0000000000..113061bab3 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/bench_parallel.cc @@ -0,0 +1,238 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Concurrent, independent sorts for generating more memory traffic and testing +// scalability. + +#include <stdint.h> +#include <stdio.h> + +#include <condition_variable> //NOLINT +#include <functional> +#include <memory> +#include <mutex> //NOLINT +#include <thread> //NOLINT +#include <utility> +#include <vector> + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc" //NOLINT +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/algo-inl.h" +#include "hwy/contrib/sort/result-inl.h" +#include "hwy/aligned_allocator.h" +// Last +#include "hwy/tests/test_util-inl.h" +// clang-format on + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace { + +class ThreadPool { + public: + // Starts the given number of worker threads and blocks until they are ready. 
+ explicit ThreadPool( + const size_t num_threads = std::thread::hardware_concurrency()) + : num_threads_(num_threads) { + HWY_ASSERT(num_threads_ > 0); + threads_.reserve(num_threads_); + for (size_t i = 0; i < num_threads_; ++i) { + threads_.emplace_back(ThreadFunc, this, i); + } + + WorkersReadyBarrier(); + } + + // Non-copyable: the pool owns its worker threads and synchronization state. + ThreadPool(const ThreadPool&) = delete; + ThreadPool& operator=(const ThreadPool&) = delete; + + // Waits for all threads to exit. + ~ThreadPool() { + StartWorkers(kWorkerExit); + + for (std::thread& thread : threads_) { + thread.join(); + } + } + + size_t NumThreads() const { return threads_.size(); } + + // Calls func(thread) on workers with index < max_threads and blocks until + // all workers report ready again. + template <class Func> + void RunOnThreads(size_t max_threads, const Func& func) { + task_ = &CallClosure<Func>; + data_ = &func; + StartWorkers(max_threads); + WorkersReadyBarrier(); + } + + private: + // After construction and between calls to Run, workers are "ready", i.e. + // waiting on worker_start_cv_. They are "started" by sending a "command" + // and notifying all worker_start_cv_ waiters. (That is why all workers + // must be ready/waiting - otherwise, the notification will not reach all of + // them and the main thread waits in vain for them to report readiness.) + using WorkerCommand = uint64_t; + + static constexpr WorkerCommand kWorkerWait = ~1ULL; + static constexpr WorkerCommand kWorkerExit = ~2ULL; + + // Calls a closure (lambda with captures). + template <class Closure> + static void CallClosure(const void* f, size_t thread) { + (*reinterpret_cast<const Closure*>(f))(thread); + } + + void WorkersReadyBarrier() { + std::unique_lock<std::mutex> lock(mutex_); + // Typically only a single iteration. + while (workers_ready_ != threads_.size()) { + workers_ready_cv_.wait(lock); + } + workers_ready_ = 0; + + // Safely handle spurious worker wakeups. + worker_start_command_ = kWorkerWait; + } + + // Precondition: all workers are ready. 
+ void StartWorkers(const WorkerCommand worker_command) { + std::unique_lock<std::mutex> lock(mutex_); + worker_start_command_ = worker_command; + // Workers will need this lock, so release it before they wake up. + lock.unlock(); + worker_start_cv_.notify_all(); + } + + static void ThreadFunc(ThreadPool* self, size_t thread) { + // Until kWorkerExit command received: + for (;;) { + std::unique_lock<std::mutex> lock(self->mutex_); + // Notify main thread that this thread is ready. + if (++self->workers_ready_ == self->num_threads_) { + self->workers_ready_cv_.notify_one(); + } + RESUME_WAIT: + // Wait for a command. + self->worker_start_cv_.wait(lock); + const WorkerCommand command = self->worker_start_command_; + switch (command) { + case kWorkerWait: // spurious wakeup: + goto RESUME_WAIT; // lock still held, avoid incrementing ready. + case kWorkerExit: + return; // exits thread + default: + break; + } + + lock.unlock(); + // Command is the maximum number of threads that should run the task. + HWY_ASSERT(command < self->NumThreads()); + if (thread < command) { + self->task_(self->data_, thread); + } + } + } + + const size_t num_threads_; + + // Unmodified after ctor, but cannot be const because we call thread::join(). + std::vector<std::thread> threads_; + + std::mutex mutex_; // guards both cv and their variables. + std::condition_variable workers_ready_cv_; + size_t workers_ready_ = 0; + std::condition_variable worker_start_cv_; + // Starts as kWorkerWait so that a spurious wakeup before the first + // WorkersReadyBarrier() completes is ignored instead of reading an + // indeterminate value. + WorkerCommand worker_start_command_ = kWorkerWait; + + // Written by main thread, read by workers (after mutex lock/unlock). 
+ std::function<void(const void*, size_t)> task_; // points to CallClosure + const void* data_; // points to caller's Func +}; + +template <class Traits> +void RunWithoutVerify(Traits st, const Dist dist, const size_t num_keys, + const Algo algo, SharedState& shared, size_t thread) { + using LaneType = typename Traits::LaneType; + using KeyType = typename Traits::KeyType; + using Order = typename Traits::Order; + const size_t num_lanes = num_keys * st.LanesPerKey(); + auto aligned = hwy::AllocateAligned<LaneType>(num_lanes); + + (void)GenerateInput(dist, aligned.get(), num_lanes); + + const Timestamp t0; + Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys, shared, + thread); + HWY_ASSERT(aligned[0] < aligned[num_lanes - 1]); +} + +void BenchParallel() { + // Not interested in benchmark results for other targets on x86 + if (HWY_ARCH_X86 && (HWY_TARGET != HWY_AVX2 && HWY_TARGET != HWY_AVX3 && + HWY_TARGET != HWY_AVX3_ZEN4)) { + return; + } + + ThreadPool pool; + const size_t NT = pool.NumThreads(); + + detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st; + using KeyType = typename decltype(st)::KeyType; + const size_t num_keys = size_t{100} * 1000 * 1000; + +#if HAVE_IPS4O + const Algo algo = Algo::kIPS4O; +#else + const Algo algo = Algo::kVQSort; +#endif + const Dist dist = Dist::kUniform32; + + SharedState shared; + + std::vector<Result> results; + for (size_t nt = 1; nt < NT; nt += HWY_MAX(1, NT / 16)) { + Timestamp t0; + // Default capture because MSVC wants algo/dist but clang does not. 
+ pool.RunOnThreads(nt, [=, &shared](size_t thread) { + RunWithoutVerify(st, dist, num_keys, algo, shared, thread); + }); + const double sec = SecondsSince(t0); + results.emplace_back(algo, dist, num_keys, nt, sec, sizeof(KeyType), + st.KeyString()); + results.back().Print(); + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +namespace { +HWY_BEFORE_TEST(BenchParallel); +HWY_EXPORT_AND_TEST_P(BenchParallel, BenchParallel); +} // namespace +} // namespace hwy + +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/bench_sort.cc b/third_party/highway/hwy/contrib/sort/bench_sort.cc new file mode 100644 index 0000000000..13025aa26b --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/bench_sort.cc @@ -0,0 +1,367 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <stdint.h> +#include <stdio.h> + +#include <vector> + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/algo-inl.h" +#include "hwy/contrib/sort/vqsort.h" +#include "hwy/contrib/sort/result-inl.h" +#include "hwy/contrib/sort/sorting_networks-inl.h" // SharedTraits +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/traits128-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +// Mode for larger sorts because M1 is able to access more than the per-core +// share of L2, so 1M elements might still be in cache. +#define SORT_100M 0 + +#define SORT_BENCH_BASE_AND_PARTITION 0 + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +// Defined within HWY_ONCE, used by BenchAllSort. +extern int64_t first_sort_target; + +namespace HWY_NAMESPACE { +namespace { +using detail::TraitsLane; +using detail::OrderAscending; +using detail::OrderDescending; +using detail::SharedTraits; + +#if VQSORT_ENABLED +using detail::OrderAscending128; +using detail::OrderAscendingKV128; +using detail::Traits128; +#endif + +#if (VQSORT_ENABLED && SORT_BENCH_BASE_AND_PARTITION) || HWY_IDE + +template <class Traits> +HWY_NOINLINE void BenchPartition() { + using LaneType = typename Traits::LaneType; + using KeyType = typename Traits::KeyType; + const SortTag<LaneType> d; + detail::SharedTraits<Traits> st; + const Dist dist = Dist::kUniform8; + double sum = 0.0; + + constexpr size_t kLPK = st.LanesPerKey(); + HWY_ALIGN LaneType + buf[SortConstants::BufBytes<LaneType>(HWY_MAX_BYTES, kLPK) / + sizeof(LaneType)]; + uint64_t* HWY_RESTRICT state = GetGeneratorState(); + + const size_t max_log2 = AdjustedLog2Reps(20); + for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) { + const size_t num_lanes = 1ull << log2; + const size_t num_keys = num_lanes / kLPK; + auto aligned = 
hwy::AllocateAligned<LaneType>(num_lanes); + + std::vector<double> seconds; + const size_t num_reps = (1ull << (14 - log2 / 2)) * 30; + for (size_t rep = 0; rep < num_reps; ++rep) { + (void)GenerateInput(dist, aligned.get(), num_lanes); + + // The pivot value can influence performance. Do exactly what vqsort will + // do so that the performance (influenced by prefetching and branch + // prediction) is likely to predict the actual performance inside vqsort. + detail::DrawSamples(d, st, aligned.get(), num_lanes, buf, state); + detail::SortSamples(d, st, buf); + auto pivot = detail::ChoosePivotByRank(d, st, buf); + + const Timestamp t0; + detail::Partition(d, st, aligned.get(), num_lanes - 1, pivot, buf); + seconds.push_back(SecondsSince(t0)); + // 'Use' the result to prevent optimizing out the partition. + sum += static_cast<double>(aligned.get()[num_lanes / 2]); + } + + Result(Algo::kVQSort, dist, num_keys, 1, SummarizeMeasurements(seconds), + sizeof(KeyType), st.KeyString()) + .Print(); + } + HWY_ASSERT(sum != 999999); // Prevent optimizing out +} + +HWY_NOINLINE void BenchAllPartition() { + // Not interested in benchmark results for these targets + if (HWY_TARGET == HWY_SSSE3) { + return; + } + + BenchPartition<TraitsLane<OrderDescending<float>>>(); + BenchPartition<TraitsLane<OrderDescending<int32_t>>>(); + BenchPartition<TraitsLane<OrderDescending<int64_t>>>(); + BenchPartition<Traits128<OrderAscending128>>(); + // BenchPartition<Traits128<OrderDescending128>>(); + BenchPartition<Traits128<OrderAscendingKV128>>(); +} + +template <class Traits> +HWY_NOINLINE void BenchBase(std::vector<Result>& results) { + // Not interested in benchmark results for these targets + if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) { + return; + } + + using LaneType = typename Traits::LaneType; + using KeyType = typename Traits::KeyType; + const SortTag<LaneType> d; + detail::SharedTraits<Traits> st; + const Dist dist = Dist::kUniform32; + + const size_t N = Lanes(d); + 
// Returns the algorithms that BenchSort should measure. The list is assembled
// entirely at preprocessing time from the HAVE_* feature macros, the current
// HWY_TARGET and the SORT_100M mode switch.
std::vector<Algo> AlgoForBench() {
  return {
#if HAVE_AVX2SORT
    Algo::kSEA,
#endif
// The parallel IPS4O variant takes precedence over the sequential one.
#if HAVE_PARALLEL_IPS4O
    Algo::kParallelIPS4O,
#elif HAVE_IPS4O
    Algo::kIPS4O,
#endif
#if HAVE_PDQSORT
    Algo::kPDQ,
#endif
#if HAVE_SORT512
    Algo::kSort512,
#endif
// Only include if we're compiling for the target it supports.
#if HAVE_VXSORT && ((VXSORT_AVX3 && HWY_TARGET == HWY_AVX3) || \
                    (!VXSORT_AVX3 && HWY_TARGET == HWY_AVX2))
    Algo::kVXSort,
#endif
// Only include if we're compiling for the target it supports.
#if HAVE_INTEL && HWY_TARGET <= HWY_AVX3
    Algo::kIntel,
#endif

// kStd and kVQSort are both omitted when parallel IPS4O is available.
#if !HAVE_PARALLEL_IPS4O
#if !SORT_100M
    // 10-20x slower, but that's OK for the default size when we are not
    // testing the parallel nor 100M modes.
    Algo::kStd,
#endif

    Algo::kVQSort,  // only ~4x slower, but not required for Table 1a
#endif  // !HAVE_PARALLEL_IPS4O
  };
}
// Selects which family of input sizes (in keys) a benchmark run covers.
enum class BenchmarkModes {
  kDefault,
  k1M,
  kAllSmall,
  kSmallPow2,
  kPow4,
  kPow10
};

// Returns the key counts to benchmark for `mode`, in increasing order.
// Unrecognized modes are treated like kDefault, whose contents depend on
// HAVE_PARALLEL_IPS4O / SORT_100M.
std::vector<size_t> SizesToBenchmark(BenchmarkModes mode) {
  // Yields first, first*factor, first*factor^2, ... for all values <= last.
  const auto geometric = [](size_t first, size_t last, size_t factor) {
    std::vector<size_t> progression;
    for (size_t n = first; n <= last; n *= factor) {
      progression.push_back(n);
    }
    return progression;
  };

  switch (mode) {
    case BenchmarkModes::k1M:
      return std::vector<size_t>(1, size_t{1000} * 1000);

    case BenchmarkModes::kAllSmall: {
      // Every size from 1 to 128 inclusive.
      std::vector<size_t> every_size;
      every_size.reserve(128);
      for (size_t n = 1; n <= 128; ++n) {
        every_size.push_back(n);
      }
      return every_size;
    }

    case BenchmarkModes::kSmallPow2:
      return geometric(2, 128, 2);

    case BenchmarkModes::kPow4:
      return geometric(4, size_t{256} * 1024, 4);

    case BenchmarkModes::kPow10:
      return geometric(10, size_t{100} * 1000, 10);

    case BenchmarkModes::kDefault:
    default: {
      std::vector<size_t> defaults;
#if HAVE_PARALLEL_IPS4O || SORT_100M
      defaults.push_back(100 * 1000 * size_t{1000});
#else
      defaults.push_back(100);
      defaults.push_back(100 * 1000);
#endif
      return defaults;
    }
  }
}
+ if (HWY_SSE4 <= HWY_TARGET && HWY_TARGET <= HWY_SSE2) { + return; + } +#if HAVE_INTEL + if (HWY_TARGET > HWY_AVX3) return; +#endif + + for (size_t num_keys : SizesToBenchmark(BenchmarkModes::kSmallPow2)) { +#if !HAVE_INTEL + BenchSort<TraitsLane<OrderAscending<float>>>(num_keys); +#endif + // BenchSort<TraitsLane<OtherOrder<double>>>(num_keys); + // BenchSort<TraitsLane<OrderAscending<int16_t>>>(num_keys); + BenchSort<TraitsLane<OtherOrder<int32_t>>>(num_keys); + BenchSort<TraitsLane<OrderAscending<int64_t>>>(num_keys); + // BenchSort<TraitsLane<OtherOrder<uint16_t>>>(num_keys); + // BenchSort<TraitsLane<OtherOrder<uint32_t>>>(num_keys); + // BenchSort<TraitsLane<OrderAscending<uint64_t>>>(num_keys); + +#if !HAVE_VXSORT && !HAVE_INTEL && VQSORT_ENABLED + BenchSort<Traits128<OrderAscending128>>(num_keys); + BenchSort<Traits128<OrderAscendingKV128>>(num_keys); +#endif + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +int64_t first_sort_target = 0; // none run yet +namespace { +HWY_BEFORE_TEST(BenchSort); +#if SORT_BENCH_BASE_AND_PARTITION +HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition); +HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllBase); +#endif +HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllSort); +} // namespace +} // namespace hwy + +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/print_network.cc b/third_party/highway/hwy/contrib/sort/print_network.cc new file mode 100644 index 0000000000..0760696e79 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/print_network.cc @@ -0,0 +1,90 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stdio.h> + +#include <vector> + +#include "hwy/base.h" + +// Based on A.7 in "Entwurf und Implementierung vektorisierter +// Sortieralgorithmen" and code by Mark Blacher. +void PrintMergeNetwork(int rows, int cols) { + printf("\n%d x %d:\n", rows, cols); + // Powers of two + HWY_ASSERT(rows != 0 && (rows & (rows - 1)) == 0); + HWY_ASSERT(cols != 0 && (cols & (cols - 1)) == 0); + HWY_ASSERT(rows >= 4); + HWY_ASSERT(cols >= 2); // otherwise no cross-column merging required + HWY_ASSERT(cols <= 16); // SortTraits lacks Reverse32 + + // Log(rows) times: sort half of the vectors with reversed groups of the + // other half. Group size halves until we are sorting adjacent vectors. + int group_size = rows; + int num_groups = 1; + for (; group_size >= 2; group_size /= 2, num_groups *= 2) { + // All vectors except those being reversed. Allows us to group the + // ReverseKeys and Sort2 operations, which is easier to read and may help + // in-order machines with high-latency ReverseKeys. + std::vector<int> all_vi; + for (int group = 0; group < num_groups; ++group) { + for (int i = 0; i < group_size / 2; ++i) { + all_vi.push_back(group * group_size + i); + } + } + for (int vi : all_vi) { + const int vr = vi ^ (group_size - 1); + printf("v%x = st.ReverseKeys%d(d, v%x);\n", vr, cols, vr); + } + for (int vi : all_vi) { + const int vr = vi ^ (group_size - 1); + printf("st.Sort2(d, v%x, v%x);\n", vi, vr); + } + printf("\n"); + } + + // Now merge across columns in all vectors. 
+ if (cols > 2) { + for (int i = 0; i < rows; ++i) { + printf("v%x = st.SortPairsReverse%d(d, v%x);\n", i, cols, i); + } + printf("\n"); + } + if (cols >= 16) { + for (int i = 0; i < rows; ++i) { + printf("v%x = st.SortPairsDistance4(d, v%x);\n", i, i); + } + printf("\n"); + } + if (cols >= 8) { + for (int i = 0; i < rows; ++i) { + printf("v%x = st.SortPairsDistance2(d, v%x);\n", i, i); + } + printf("\n"); + } + for (int i = 0; i < rows; ++i) { + printf("v%x = st.SortPairsDistance1(d, v%x);\n", i, i); + } + printf("\n"); +} + +int main(int argc, char** argv) { + PrintMergeNetwork(8, 2); + PrintMergeNetwork(8, 4); + PrintMergeNetwork(16, 4); + PrintMergeNetwork(16, 8); + PrintMergeNetwork(16, 16); + return 0; +} diff --git a/third_party/highway/hwy/contrib/sort/result-inl.h b/third_party/highway/hwy/contrib/sort/result-inl.h new file mode 100644 index 0000000000..34365a1669 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/result-inl.h @@ -0,0 +1,140 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
// Summarizes repeated timings with a trimmed mean: sorts `seconds` in place
// (ascending) and returns the average of the elements at indices
// [num / 4, num / 2), i.e. the second-fastest quarter. (We don't want to run
// an out-of-L3-cache sort often enough for the mode to be reliable.)
//
// With a single measurement that index range would be empty, so the one
// measurement is returned as-is. Returns 0.0 if `seconds` is empty.
static inline double SummarizeMeasurements(std::vector<double>& seconds) {
  const size_t num = seconds.size();
  if (num == 0) return 0.0;

  std::sort(seconds.begin(), seconds.end());

  const size_t begin = num / 4;
  size_t end = num / 2;
  // Only happens for num == 1; guarantees a nonzero divisor below.
  if (end <= begin) end = begin + 1;

  double sum = 0.0;
  for (size_t i = begin; i < end; ++i) {
    sum += seconds[i];
  }
  return sum / static_cast<double>(end - begin);
}
static_cast<double>(num_keys), bytes * 1E-6 / sec, + num_threads); + } + + int64_t target; + Algo algo; + Dist dist; + size_t num_keys = 0; + size_t num_threads = 0; + double sec = 0.0; + size_t sizeof_key = 0; + std::string key_name; +}; + +template <class Traits, typename LaneType> +bool VerifySort(Traits st, const InputStats<LaneType>& input_stats, + const LaneType* out, size_t num_lanes, const char* caller) { + constexpr size_t N1 = st.LanesPerKey(); + HWY_ASSERT(num_lanes >= N1); + + InputStats<LaneType> output_stats; + // Ensure it matches the sort order + for (size_t i = 0; i < num_lanes - N1; i += N1) { + output_stats.Notify(out[i]); + if (N1 == 2) output_stats.Notify(out[i + 1]); + // Reverse order instead of checking !Compare1 so we accept equal keys. + if (st.Compare1(out + i + N1, out + i)) { + fprintf(stderr, "%s: i=%d of %d lanes: N1=%d", caller, + static_cast<int>(i), static_cast<int>(num_lanes), + static_cast<int>(N1)); + fprintf(stderr, "%5.0f %5.0f vs. %5.0f %5.0f\n\n", + static_cast<double>(out[i + 1]), static_cast<double>(out[i + 0]), + static_cast<double>(out[i + N1 + 1]), + static_cast<double>(out[i + N1])); + HWY_ABORT("%d-bit sort is incorrect\n", + static_cast<int>(sizeof(LaneType) * 8 * N1)); + } + } + output_stats.Notify(out[num_lanes - N1]); + if (N1 == 2) output_stats.Notify(out[num_lanes - N1 + 1]); + + return input_stats == output_stats; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE diff --git a/third_party/highway/hwy/contrib/sort/shared-inl.h b/third_party/highway/hwy/contrib/sort/shared-inl.h new file mode 100644 index 0000000000..18cb58d78b --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/shared-inl.h @@ -0,0 +1,154 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this 
// Internal constants - these are to avoid magic numbers/literals and cannot be
// changed without also changing the associated code.
struct SortConstants {
  // SortingNetwork reshapes its input into a matrix. This is the maximum number
  // of *lanes* per vector. Must be at least 8 because SortSamples assumes the
  // sorting network can handle 128 bytes with 8 rows, so 16 bytes per vector,
  // which means 8 lanes for 16-bit types.
#if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
  static constexpr size_t kMaxCols = 8;  // avoid build timeout/stack overflow
#else
  static constexpr size_t kMaxCols = 16;  // enough for u32 in 512-bit vector
#endif

  // 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers,
  // fitting within 16 AVX2 registers with only a few spills, keeping BaseCase
  // code size reasonable, and minimizing the extra logN factor for larger
  // networks (for which only loose upper bounds on size are known).
  static constexpr size_t kMaxRows = 16;

  // Returns the number of lanes handled by the base case for vectors of `N`
  // lanes: rows (kMaxRows or 8) times columns (N, capped at kMaxCols).
  // Template argument ensures there is no actual division instruction.
  template <size_t kLPK>
  static constexpr HWY_INLINE size_t BaseCaseNumLanes(size_t N) {
    // We use 8, 8x2, 8x4, and 16x{4..} networks, in units of keys. For N/kLPK
    // < 4, we cannot use the 16-row networks.
    return (((N / kLPK) >= 4) ? kMaxRows : 8) * HWY_MIN(N, kMaxCols);
  }

  // Unrolling is important (pipelining and amortizing branch mispredictions);
  // 2x is sufficient to reach full memory bandwidth on SKX in Partition, but
  // somewhat slower for sorting than 4x.
  //
  // To change, must also update left + 3 * N etc. in the loop.
  static constexpr size_t kPartitionUnroll = 4;

  // Chunk := group of keys loaded for sampling a pivot. Matches the typical
  // cache line size of 64 bytes to get maximum benefit per L2 miss. Sort()
  // ensures vectors are no larger than that, so this can be independent of the
  // vector size and thus constexpr.
  // Returns the number of lanes of size `sizeof_t` bytes in one 64-byte chunk.
  static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t) {
    return 64 / sizeof_t;
  }

  // Returns the number of sample lanes stored for type T: two chunks' worth.
  template <typename T>
  static constexpr HWY_INLINE size_t SampleLanes() {
    return 2 * LanesPerChunk(sizeof(T));  // Stored samples
  }

  // Returns the number of buffer lanes Partition needs for vectors of `N`
  // lanes: (2 * kPartitionUnroll + 1) vectors.
  static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) {
    // The main loop reads kPartitionUnroll vectors, and first loads from
    // both left and right beforehand, so it requires min = 2 *
    // kPartitionUnroll vectors. To handle smaller amounts (only guaranteed
    // >= BaseCaseNumLanes), we partition the right side into a buffer. We need
    // another vector at the end so CompressStore does not overwrite anything.
    return (2 * kPartitionUnroll + 1) * N;
  }

  // Returns the buffer size in lanes for vectors of `N` lanes.
  // Max across the three buffer usages.
  template <typename T, size_t kLPK>
  static constexpr HWY_INLINE size_t BufNum(size_t N) {
    // BaseCase may write one padding vector, and SortSamples uses the space
    // after samples as the buffer.
    return HWY_MAX(SampleLanes<T>() + BaseCaseNumLanes<kLPK>(N) + N,
                   PartitionBufNum(N));
  }

  // Translates vector_size to lanes and returns size in bytes.
  // `vector_size` is in bytes; T and kLPK (lanes per key) are template
  // arguments, not function arguments.
  template <typename T, size_t kLPK>
  static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) {
    return BufNum<T, kLPK>(vector_size / sizeof(T)) * sizeof(T);
  }

  // Returns max for any type.
  // For kLPK != 2 this is the maximum of BufBytes over 16-, 32- and 64-bit
  // lanes with one lane per key.
  template <size_t kLPK>
  static constexpr HWY_INLINE size_t MaxBufBytes(size_t vector_size) {
    // If 2 lanes per key, it's a 128-bit key with u64 lanes.
    return kLPK == 2 ? BufBytes<uint64_t, 2>(vector_size)
                     : HWY_MAX((BufBytes<uint16_t, 1>(vector_size)),
                               HWY_MAX((BufBytes<uint32_t, 1>(vector_size)),
                                       (BufBytes<uint64_t, 1>(vector_size))));
  }
};
+template <typename T> +using SortTag = ScalableTag<T, -1>; +#else +template <typename T> +using SortTag = ScalableTag<T>; +#endif + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy + +#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE diff --git a/third_party/highway/hwy/contrib/sort/sort_test.cc b/third_party/highway/hwy/contrib/sort/sort_test.cc new file mode 100644 index 0000000000..b38a42d214 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/sort_test.cc @@ -0,0 +1,650 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS // before inttypes.h +#endif +#include <inttypes.h> // IWYU pragma: keep +#include <stdio.h> +#include <string.h> // memcpy + +#include <unordered_map> +#include <vector> + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/contrib/sort/vqsort.h" +// After foreach_target +#include "hwy/contrib/sort/algo-inl.h" +#include "hwy/contrib/sort/traits128-inl.h" +#include "hwy/contrib/sort/result-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" // BaseCase +#include "hwy/tests/test_util-inl.h" +// clang-format on + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace { + +using detail::OrderAscending; +using detail::SharedTraits; +using detail::TraitsLane; +#if VQSORT_ENABLED || HWY_IDE +#if !HAVE_INTEL +using detail::OrderAscending128; +using detail::OrderAscendingKV128; +using detail::OrderAscendingKV64; +using detail::OrderDescending128; +using detail::OrderDescendingKV128; +using detail::OrderDescendingKV64; +using detail::Traits128; +#endif + +template <class Traits> +static HWY_NOINLINE void TestMedian3() { + using LaneType = typename Traits::LaneType; + using D = CappedTag<LaneType, 1>; + SharedTraits<Traits> st; + const D d; + using V = Vec<D>; + for (uint32_t bits = 0; bits < 8; ++bits) { + const V v0 = Set(d, LaneType{(bits & (1u << 0)) ? 1u : 0u}); + const V v1 = Set(d, LaneType{(bits & (1u << 1)) ? 1u : 0u}); + const V v2 = Set(d, LaneType{(bits & (1u << 2)) ? 1u : 0u}); + const LaneType m = GetLane(detail::MedianOf3(st, v0, v1, v2)); + // If at least half(rounded up) of bits are 1, so is the median. + const size_t count = PopCount(bits); + HWY_ASSERT_EQ((count >= 2) ? 
static_cast<LaneType>(1) : 0, m); + } +} + +HWY_NOINLINE void TestAllMedian() { + TestMedian3<TraitsLane<OrderAscending<uint64_t> > >(); +} + +template <class Traits> +static HWY_NOINLINE void TestBaseCaseAscDesc() { + using LaneType = typename Traits::LaneType; + SharedTraits<Traits> st; + const SortTag<LaneType> d; + const size_t N = Lanes(d); + constexpr size_t N1 = st.LanesPerKey(); + const size_t base_case_num = SortConstants::BaseCaseNumLanes<N1>(N); + + constexpr int kDebug = 0; + auto aligned_lanes = hwy::AllocateAligned<LaneType>(N + base_case_num + N); + auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N); + HWY_ASSERT(aligned_lanes && buf); + + std::vector<size_t> lengths; + lengths.push_back(HWY_MAX(1, N1)); + lengths.push_back(3 * N1); + lengths.push_back(base_case_num / 2); + lengths.push_back(base_case_num / 2 + N1); + lengths.push_back(base_case_num - N1); + lengths.push_back(base_case_num); + + std::vector<size_t> misalignments; + misalignments.push_back(0); + misalignments.push_back(1); + if (N >= 6) misalignments.push_back(N / 2 - 1); + misalignments.push_back(N / 2); + misalignments.push_back(N / 2 + 1); + misalignments.push_back(HWY_MIN(2 * N / 3 + 3, size_t{N - 1})); + + for (bool asc : {false, true}) { + for (size_t len : lengths) { + for (size_t misalign : misalignments) { + LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign; + if (kDebug) { + printf("============%s asc %d N1 %d len %d misalign %d\n", + st.KeyString(), asc, static_cast<int>(N1), + static_cast<int>(len), static_cast<int>(misalign)); + } + + for (size_t i = 0; i < misalign; ++i) { + aligned_lanes[i] = hwy::LowestValue<LaneType>(); + } + InputStats<LaneType> input_stats; + for (size_t i = 0; i < len; ++i) { + lanes[i] = asc ? 
static_cast<LaneType>(LaneType(i) + 1) + : static_cast<LaneType>(LaneType(len) - LaneType(i)); + input_stats.Notify(lanes[i]); + if (kDebug >= 2) { + printf("%3zu: %f\n", i, static_cast<double>(lanes[i])); + } + } + for (size_t i = len; i < base_case_num + N; ++i) { + lanes[i] = hwy::LowestValue<LaneType>(); + } + + detail::BaseCase(d, st, lanes, len, buf.get()); + + if (kDebug >= 2) { + printf("out>>>>>>\n"); + for (size_t i = 0; i < len; ++i) { + printf("%3zu: %f\n", i, static_cast<double>(lanes[i])); + } + } + + HWY_ASSERT(VerifySort(st, input_stats, lanes, len, "BaseAscDesc")); + for (size_t i = 0; i < misalign; ++i) { + if (aligned_lanes[i] != hwy::LowestValue<LaneType>()) + HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i)); + } + for (size_t i = len; i < base_case_num + N; ++i) { + if (lanes[i] != hwy::LowestValue<LaneType>()) + HWY_ABORT("Overrun right at %d\n", static_cast<int>(i)); + } + } // misalign + } // len + } // asc +} + +template <class Traits> +static HWY_NOINLINE void TestBaseCase01() { + using LaneType = typename Traits::LaneType; + SharedTraits<Traits> st; + const SortTag<LaneType> d; + const size_t N = Lanes(d); + constexpr size_t N1 = st.LanesPerKey(); + const size_t base_case_num = SortConstants::BaseCaseNumLanes<N1>(N); + + constexpr int kDebug = 0; + auto lanes = hwy::AllocateAligned<LaneType>(base_case_num + N); + auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N); + HWY_ASSERT(lanes && buf); + + std::vector<size_t> lengths; + lengths.push_back(HWY_MAX(1, N1)); + lengths.push_back(3 * N1); + lengths.push_back(base_case_num / 2); + lengths.push_back(base_case_num / 2 + N1); + lengths.push_back(base_case_num - N1); + lengths.push_back(base_case_num); + + for (size_t len : lengths) { + if (kDebug) { + printf("============%s 01 N1 %d len %d\n", st.KeyString(), + static_cast<int>(N1), static_cast<int>(len)); + } + const uint64_t kMaxBits = AdjustedLog2Reps(HWY_MIN(len, size_t{14})); + for (uint64_t bits = 0; bits < 
((1ull << kMaxBits) - 1); ++bits) { + InputStats<LaneType> input_stats; + for (size_t i = 0; i < len; ++i) { + lanes[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0; + input_stats.Notify(lanes[i]); + if (kDebug >= 2) { + printf("%3zu: %f\n", i, static_cast<double>(lanes[i])); + } + } + for (size_t i = len; i < base_case_num + N; ++i) { + lanes[i] = hwy::LowestValue<LaneType>(); + } + + detail::BaseCase(d, st, lanes.get(), len, buf.get()); + + if (kDebug >= 2) { + printf("out>>>>>>\n"); + for (size_t i = 0; i < len; ++i) { + printf("%3zu: %f\n", i, static_cast<double>(lanes[i])); + } + } + + HWY_ASSERT(VerifySort(st, input_stats, lanes.get(), len, "Base01")); + for (size_t i = len; i < base_case_num + N; ++i) { + if (lanes[i] != hwy::LowestValue<LaneType>()) + HWY_ABORT("Overrun right at %d\n", static_cast<int>(i)); + } + } // bits + } // len +} + +template <class Traits> +static HWY_NOINLINE void TestBaseCase() { + TestBaseCaseAscDesc<Traits>(); + TestBaseCase01<Traits>(); +} + +HWY_NOINLINE void TestAllBaseCase() { + // Workaround for stack overflow on MSVC debug. 
+#if defined(_MSC_VER) + return; +#endif + TestBaseCase<TraitsLane<OrderAscending<int32_t> > >(); + TestBaseCase<TraitsLane<OtherOrder<int64_t> > >(); +#if !HAVE_INTEL + TestBaseCase<Traits128<OrderAscending128> >(); + TestBaseCase<Traits128<OrderDescending128> >(); +#endif +} + +template <class Traits> +static HWY_NOINLINE void VerifyPartition( + Traits st, typename Traits::LaneType* HWY_RESTRICT lanes, size_t left, + size_t border, size_t right, const size_t N1, + const typename Traits::LaneType* pivot) { + /* for (size_t i = left; i < right; ++i) { + if (i == border) printf("--\n"); + printf("%4zu: %3d\n", i, lanes[i]); + }*/ + + HWY_ASSERT(left % N1 == 0); + HWY_ASSERT(border % N1 == 0); + HWY_ASSERT(right % N1 == 0); + const bool asc = typename Traits::Order().IsAscending(); + for (size_t i = left; i < border; i += N1) { + if (st.Compare1(pivot, lanes + i)) { + HWY_ABORT( + "%s: asc %d left[%d] piv %.0f %.0f compares before %.0f %.0f " + "border %d", + st.KeyString(), asc, static_cast<int>(i), + static_cast<double>(pivot[1]), static_cast<double>(pivot[0]), + static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i + 0]), + static_cast<int>(border)); + } + } + for (size_t i = border; i < right; i += N1) { + if (!st.Compare1(pivot, lanes + i)) { + HWY_ABORT( + "%s: asc %d right[%d] piv %.0f %.0f compares after %.0f %.0f " + "border %d", + st.KeyString(), asc, static_cast<int>(i), + static_cast<double>(pivot[1]), static_cast<double>(pivot[0]), + static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i]), + static_cast<int>(border)); + } + } +} + +template <class Traits> +static HWY_NOINLINE void TestPartition() { + using LaneType = typename Traits::LaneType; + const SortTag<LaneType> d; + SharedTraits<Traits> st; + const bool asc = typename Traits::Order().IsAscending(); + const size_t N = Lanes(d); + constexpr int kDebug = 0; + constexpr size_t N1 = st.LanesPerKey(); + const size_t base_case_num = SortConstants::BaseCaseNumLanes<N1>(N); + // left + 
len + align + const size_t total = 32 + (base_case_num + 4 * HWY_MAX(N, 4)) + 2 * N; + auto aligned_lanes = hwy::AllocateAligned<LaneType>(total); + HWY_ALIGN LaneType buf[SortConstants::BufBytes<LaneType, N1>(HWY_MAX_BYTES) / + sizeof(LaneType)]; + + for (bool in_asc : {false, true}) { + for (int left_i : {0, 1, 7, 8, 30, 31}) { + const size_t left = static_cast<size_t>(left_i) & ~(N1 - 1); + for (size_t ofs : + {N, N + 3, 2 * N, 2 * N + 2, 2 * N + 3, 3 * N - 1, 4 * N - 2}) { + const size_t len = (base_case_num + ofs) & ~(N1 - 1); + for (LaneType pivot1 : {LaneType(0), LaneType(len / 3), + LaneType(2 * len / 3), LaneType(len)}) { + const LaneType pivot2[2] = {pivot1, 0}; + const auto pivot = st.SetKey(d, pivot2); + for (size_t misalign = 0; misalign < N; + misalign += st.LanesPerKey()) { + LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign; + const size_t right = left + len; + if (kDebug) { + printf( + "=========%s asc %d left %d len %d right %d piv %.0f %.0f\n", + st.KeyString(), asc, static_cast<int>(left), + static_cast<int>(len), static_cast<int>(right), + static_cast<double>(pivot2[1]), + static_cast<double>(pivot2[0])); + } + + for (size_t i = 0; i < misalign; ++i) { + aligned_lanes[i] = hwy::LowestValue<LaneType>(); + } + for (size_t i = 0; i < left; ++i) { + lanes[i] = hwy::LowestValue<LaneType>(); + } + std::unordered_map<LaneType, int> counts; + for (size_t i = left; i < right; ++i) { + lanes[i] = static_cast<LaneType>( + in_asc ? 
LaneType(i + 1) - static_cast<LaneType>(left) + : static_cast<LaneType>(right) - LaneType(i)); + ++counts[lanes[i]]; + if (kDebug >= 2) { + printf("%3zu: %f\n", i, static_cast<double>(lanes[i])); + } + } + for (size_t i = right; i < total - misalign; ++i) { + lanes[i] = hwy::LowestValue<LaneType>(); + } + + size_t border = left + detail::Partition(d, st, lanes + left, + right - left, pivot, buf); + + if (kDebug >= 2) { + printf("out>>>>>>\n"); + for (size_t i = left; i < right; ++i) { + printf("%3zu: %f\n", i, static_cast<double>(lanes[i])); + } + for (size_t i = right; i < total - misalign; ++i) { + printf("%3zu: sentinel %f\n", i, static_cast<double>(lanes[i])); + } + } + for (size_t i = left; i < right; ++i) { + --counts[lanes[i]]; + } + for (auto kv : counts) { + if (kv.second != 0) { + PrintValue(kv.first); + HWY_ABORT("Incorrect count %d\n", kv.second); + } + } + VerifyPartition(st, lanes, left, border, right, N1, pivot2); + for (size_t i = 0; i < misalign; ++i) { + if (aligned_lanes[i] != hwy::LowestValue<LaneType>()) + HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i)); + } + for (size_t i = 0; i < left; ++i) { + if (lanes[i] != hwy::LowestValue<LaneType>()) + HWY_ABORT("Overrun left at %d\n", static_cast<int>(i)); + } + for (size_t i = right; i < total - misalign; ++i) { + if (lanes[i] != hwy::LowestValue<LaneType>()) + HWY_ABORT("Overrun right at %d\n", static_cast<int>(i)); + } + } // misalign + } // pivot + } // len + } // left + } // asc +} + +HWY_NOINLINE void TestAllPartition() { + TestPartition<TraitsLane<OtherOrder<int32_t> > >(); +#if !HAVE_INTEL + TestPartition<Traits128<OrderAscending128> >(); +#endif + +#if !HWY_IS_DEBUG_BUILD + TestPartition<TraitsLane<OrderAscending<int16_t> > >(); + TestPartition<TraitsLane<OrderAscending<int64_t> > >(); + TestPartition<TraitsLane<OtherOrder<float> > >(); +#if HWY_HAVE_FLOAT64 + TestPartition<TraitsLane<OtherOrder<double> > >(); +#endif +#if !HAVE_INTEL + TestPartition<Traits128<OrderDescending128> 
>(); +#endif +#endif +} + +// (used for sample selection for choosing a pivot) +template <typename TU> +static HWY_NOINLINE void TestRandomGenerator() { + static_assert(!hwy::IsSigned<TU>(), ""); + SortTag<TU> du; + const size_t N = Lanes(du); + + uint64_t* state = GetGeneratorState(); + + // Ensure lower and upper 32 bits are uniformly distributed. + uint64_t sum_lo = 0, sum_hi = 0; + for (size_t i = 0; i < 1000; ++i) { + const uint64_t bits = detail::RandomBits(state); + sum_lo += bits & 0xFFFFFFFF; + sum_hi += bits >> 32; + } + const double expected = 1000 * (1ULL << 31); + HWY_ASSERT(0.9 * expected <= static_cast<double>(sum_lo) && + static_cast<double>(sum_lo) <= 1.1 * expected); + HWY_ASSERT(0.9 * expected <= static_cast<double>(sum_hi) && + static_cast<double>(sum_hi) <= 1.1 * expected); + + const size_t lanes_per_block = HWY_MAX(64 / sizeof(TU), N); // power of two + + for (uint32_t num_blocks = 2; num_blocks < 100000; + num_blocks = 3 * num_blocks / 2) { + // Generate some numbers and ensure all are in range + uint64_t sum = 0; + constexpr size_t kReps = 10000; + for (size_t rep = 0; rep < kReps; ++rep) { + const uint32_t bits = detail::RandomBits(state) & 0xFFFFFFFF; + const size_t index = detail::RandomChunkIndex(num_blocks, bits); + HWY_ASSERT(((index + 1) * lanes_per_block) <= + num_blocks * lanes_per_block); + + sum += index; + } + + // Also ensure the mean is near the middle of the range + const double expected = (num_blocks - 1) / 2.0; + const double actual = static_cast<double>(sum) / kReps; + HWY_ASSERT(0.9 * expected <= actual && actual <= 1.1 * expected); + } +} + +HWY_NOINLINE void TestAllGenerator() { + TestRandomGenerator<uint32_t>(); + TestRandomGenerator<uint64_t>(); +} + +#else +static void TestAllMedian() {} +static void TestAllBaseCase() {} +static void TestAllPartition() {} +static void TestAllGenerator() {} +#endif // VQSORT_ENABLED + +// Remembers input, and compares results to that of a reference algorithm. 
+template <class Traits> +class CompareResults { + using LaneType = typename Traits::LaneType; + using KeyType = typename Traits::KeyType; + + public: + CompareResults(const LaneType* in, size_t num_lanes) { + copy_.resize(num_lanes); + memcpy(copy_.data(), in, num_lanes * sizeof(LaneType)); + } + + bool Verify(const LaneType* output) { +#if HAVE_PDQSORT + const Algo reference = Algo::kPDQ; +#else + const Algo reference = Algo::kStd; +#endif + SharedState shared; + using Order = typename Traits::Order; + const Traits st; + const size_t num_keys = copy_.size() / st.LanesPerKey(); + Run<Order>(reference, reinterpret_cast<KeyType*>(copy_.data()), num_keys, + shared, /*thread=*/0); +#if VQSORT_PRINT >= 3 + fprintf(stderr, "\nExpected:\n"); + for (size_t i = 0; i < copy_.size(); ++i) { + PrintValue(copy_[i]); + } + fprintf(stderr, "\n"); +#endif + for (size_t i = 0; i < copy_.size(); ++i) { + if (copy_[i] != output[i]) { + if (sizeof(KeyType) == 16) { + fprintf(stderr, + "%s Asc %d mismatch at %d of %d: %" PRIu64 " %" PRIu64 "\n", + st.KeyString(), Order().IsAscending(), static_cast<int>(i), + static_cast<int>(copy_.size()), + static_cast<uint64_t>(copy_[i]), + static_cast<uint64_t>(output[i])); + } else { + fprintf(stderr, + "Type %s Asc %d mismatch at %d of %d: ", st.KeyString(), + Order().IsAscending(), static_cast<int>(i), + static_cast<int>(copy_.size())); + PrintValue(copy_[i]); + PrintValue(output[i]); + fprintf(stderr, "\n"); + } + return false; + } + } + return true; + } + + private: + std::vector<LaneType> copy_; +}; + +std::vector<Algo> AlgoForTest() { + return { +#if HAVE_AVX2SORT + Algo::kSEA, +#endif +#if HAVE_IPS4O + Algo::kIPS4O, +#endif +#if HAVE_PDQSORT + Algo::kPDQ, +#endif +#if HAVE_SORT512 + Algo::kSort512, +#endif + Algo::kHeap, Algo::kVQSort, + }; +} + +template <class Traits> +void TestSort(size_t num_lanes) { +// Workaround for stack overflow on clang-cl (/F 8388608 does not help). 
+#if defined(_MSC_VER) + return; +#endif + using Order = typename Traits::Order; + using LaneType = typename Traits::LaneType; + using KeyType = typename Traits::KeyType; + SharedState shared; + SharedTraits<Traits> st; + + // Round up to a whole number of keys. + num_lanes += (st.Is128() && (num_lanes & 1)); + const size_t num_keys = num_lanes / st.LanesPerKey(); + + constexpr size_t kMaxMisalign = 16; + auto aligned = + hwy::AllocateAligned<LaneType>(kMaxMisalign + num_lanes + kMaxMisalign); + HWY_ASSERT(aligned); + for (Algo algo : AlgoForTest()) { + for (Dist dist : AllDist()) { + for (size_t misalign : {size_t{0}, size_t{st.LanesPerKey()}, + size_t{3 * st.LanesPerKey()}, kMaxMisalign / 2}) { + LaneType* lanes = aligned.get() + misalign; + + // Set up red zones before/after the keys to sort + for (size_t i = 0; i < misalign; ++i) { + aligned[i] = hwy::LowestValue<LaneType>(); + } + for (size_t i = 0; i < kMaxMisalign; ++i) { + lanes[num_lanes + i] = hwy::HighestValue<LaneType>(); + } +#if HWY_IS_MSAN + __msan_poison(aligned.get(), misalign * sizeof(LaneType)); + __msan_poison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType)); +#endif + InputStats<LaneType> input_stats = + GenerateInput(dist, lanes, num_lanes); + + CompareResults<Traits> compare(lanes, num_lanes); + Run<Order>(algo, reinterpret_cast<KeyType*>(lanes), num_keys, shared, + /*thread=*/0); + HWY_ASSERT(compare.Verify(lanes)); + HWY_ASSERT(VerifySort(st, input_stats, lanes, num_lanes, "TestSort")); + + // Check red zones +#if HWY_IS_MSAN + __msan_unpoison(aligned.get(), misalign * sizeof(LaneType)); + __msan_unpoison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType)); +#endif + for (size_t i = 0; i < misalign; ++i) { + if (aligned[i] != hwy::LowestValue<LaneType>()) + HWY_ABORT("Overrun left at %d\n", static_cast<int>(i)); + } + for (size_t i = num_lanes; i < num_lanes + kMaxMisalign; ++i) { + if (lanes[i] != hwy::HighestValue<LaneType>()) + HWY_ABORT("Overrun right at %d\n", static_cast<int>(i)); 
+ } + } // misalign + } // dist + } // algo +} + +void TestAllSort() { + for (int num : {129, 504, 3 * 1000, 34567}) { + const size_t num_lanes = AdjustedReps(static_cast<size_t>(num)); +#if !HAVE_INTEL + TestSort<TraitsLane<OrderAscending<int16_t> > >(num_lanes); + TestSort<TraitsLane<OtherOrder<uint16_t> > >(num_lanes); +#endif + + TestSort<TraitsLane<OtherOrder<int32_t> > >(num_lanes); + TestSort<TraitsLane<OtherOrder<uint32_t> > >(num_lanes); + + TestSort<TraitsLane<OrderAscending<int64_t> > >(num_lanes); + TestSort<TraitsLane<OrderAscending<uint64_t> > >(num_lanes); + + // WARNING: for float types, SIMD comparisons will flush denormals to + // zero, causing mismatches with scalar sorts. In this test, we avoid + // generating denormal inputs. + TestSort<TraitsLane<OrderAscending<float> > >(num_lanes); +#if HWY_HAVE_FLOAT64 // protects algo-inl's GenerateRandom + if (HWY_HAVE_FLOAT64) { + TestSort<TraitsLane<OtherOrder<double> > >(num_lanes); + } +#endif + +// Other algorithms do not support 128-bit keys. 
+#if !HAVE_VXSORT && !HAVE_INTEL && VQSORT_ENABLED + TestSort<Traits128<OrderAscending128> >(num_lanes); + TestSort<Traits128<OrderDescending128> >(num_lanes); + + TestSort<TraitsLane<OrderAscendingKV64> >(num_lanes); + TestSort<TraitsLane<OrderDescendingKV64> >(num_lanes); + + TestSort<Traits128<OrderAscendingKV128> >(num_lanes); + TestSort<Traits128<OrderDescendingKV128> >(num_lanes); +#endif + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +namespace { +HWY_BEFORE_TEST(SortTest); +HWY_EXPORT_AND_TEST_P(SortTest, TestAllMedian); +HWY_EXPORT_AND_TEST_P(SortTest, TestAllBaseCase); +HWY_EXPORT_AND_TEST_P(SortTest, TestAllPartition); +HWY_EXPORT_AND_TEST_P(SortTest, TestAllGenerator); +HWY_EXPORT_AND_TEST_P(SortTest, TestAllSort); +} // namespace +} // namespace hwy + +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/sorting_networks-inl.h b/third_party/highway/hwy/contrib/sort/sorting_networks-inl.h new file mode 100644 index 0000000000..c47fd8da7d --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/sorting_networks-inl.h @@ -0,0 +1,898 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Per-target +#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE +#undef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE +#else +#define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE +#endif + +#include "hwy/contrib/sort/shared-inl.h" // SortConstants +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace detail { + +#if VQSORT_ENABLED + +using Constants = hwy::SortConstants; + +// ------------------------------ SharedTraits + +// Code shared between all traits. It's unclear whether these can profitably be +// specialized for Lane vs Block, or optimized like SortPairsDistance1 using +// Compare/DupOdd. +template <class Base> +struct SharedTraits : public Base { + // Conditionally swaps lane 0 with 2, 1 with 3 etc. + template <class D> + HWY_INLINE Vec<D> SortPairsDistance2(D d, Vec<D> v) const { + const Base* base = static_cast<const Base*>(this); + Vec<D> swapped = base->SwapAdjacentPairs(d, v); + base->Sort2(d, v, swapped); + return base->OddEvenPairs(d, swapped, v); + } + + // Swaps with the vector formed by reversing contiguous groups of 8 keys. + template <class D> + HWY_INLINE Vec<D> SortPairsReverse8(D d, Vec<D> v) const { + const Base* base = static_cast<const Base*>(this); + Vec<D> swapped = base->ReverseKeys8(d, v); + base->Sort2(d, v, swapped); + return base->OddEvenQuads(d, swapped, v); + } + + // Swaps with the vector formed by reversing contiguous groups of 16 keys (uses ReverseKeys; the static_assert below bounds kMaxCols to 16). 
+ template <class D> + HWY_INLINE Vec<D> SortPairsReverse16(D d, Vec<D> v) const { + const Base* base = static_cast<const Base*>(this); + static_assert(Constants::kMaxCols <= 16, "Need actual Reverse16"); + Vec<D> swapped = base->ReverseKeys(d, v); + base->Sort2(d, v, swapped); + return ConcatUpperLower(d, swapped, v); // 8 = half of the vector + } +}; + +// ------------------------------ Sorting network + +// Sorting networks for independent columns in 2, 4 and 8 vectors from +// https://bertdobbelaere.github.io/sorting_networks.html. + +template <class D, class Traits, class V = Vec<D>> +HWY_INLINE void Sort2(D d, Traits st, V& v0, V& v1) { + st.Sort2(d, v0, v1); +} + +template <class D, class Traits, class V = Vec<D>> +HWY_INLINE void Sort4(D d, Traits st, V& v0, V& v1, V& v2, V& v3) { + st.Sort2(d, v0, v2); + st.Sort2(d, v1, v3); + st.Sort2(d, v0, v1); + st.Sort2(d, v2, v3); + st.Sort2(d, v1, v2); +} + +template <class D, class Traits, class V = Vec<D>> +HWY_INLINE void Sort8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5, + V& v6, V& v7) { + st.Sort2(d, v0, v2); + st.Sort2(d, v1, v3); + st.Sort2(d, v4, v6); + st.Sort2(d, v5, v7); + + st.Sort2(d, v0, v4); + st.Sort2(d, v1, v5); + st.Sort2(d, v2, v6); + st.Sort2(d, v3, v7); + + st.Sort2(d, v0, v1); + st.Sort2(d, v2, v3); + st.Sort2(d, v4, v5); + st.Sort2(d, v6, v7); + + st.Sort2(d, v2, v4); + st.Sort2(d, v3, v5); + + st.Sort2(d, v1, v4); + st.Sort2(d, v3, v6); + + st.Sort2(d, v1, v2); + st.Sort2(d, v3, v4); + st.Sort2(d, v5, v6); +} + +// (Green's irregular) sorting network for independent columns in 16 vectors. 
+template <class D, class Traits, class V = Vec<D>> +HWY_INLINE void Sort16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5, + V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd, + V& ve, V& vf) { + st.Sort2(d, v0, v1); + st.Sort2(d, v2, v3); + st.Sort2(d, v4, v5); + st.Sort2(d, v6, v7); + st.Sort2(d, v8, v9); + st.Sort2(d, va, vb); + st.Sort2(d, vc, vd); + st.Sort2(d, ve, vf); + st.Sort2(d, v0, v2); + st.Sort2(d, v1, v3); + st.Sort2(d, v4, v6); + st.Sort2(d, v5, v7); + st.Sort2(d, v8, va); + st.Sort2(d, v9, vb); + st.Sort2(d, vc, ve); + st.Sort2(d, vd, vf); + st.Sort2(d, v0, v4); + st.Sort2(d, v1, v5); + st.Sort2(d, v2, v6); + st.Sort2(d, v3, v7); + st.Sort2(d, v8, vc); + st.Sort2(d, v9, vd); + st.Sort2(d, va, ve); + st.Sort2(d, vb, vf); + st.Sort2(d, v0, v8); + st.Sort2(d, v1, v9); + st.Sort2(d, v2, va); + st.Sort2(d, v3, vb); + st.Sort2(d, v4, vc); + st.Sort2(d, v5, vd); + st.Sort2(d, v6, ve); + st.Sort2(d, v7, vf); + st.Sort2(d, v5, va); + st.Sort2(d, v6, v9); + st.Sort2(d, v3, vc); + st.Sort2(d, v7, vb); + st.Sort2(d, vd, ve); + st.Sort2(d, v4, v8); + st.Sort2(d, v1, v2); + st.Sort2(d, v1, v4); + st.Sort2(d, v7, vd); + st.Sort2(d, v2, v8); + st.Sort2(d, vb, ve); + st.Sort2(d, v2, v4); + st.Sort2(d, v5, v6); + st.Sort2(d, v9, va); + st.Sort2(d, vb, vd); + st.Sort2(d, v3, v8); + st.Sort2(d, v7, vc); + st.Sort2(d, v3, v5); + st.Sort2(d, v6, v8); + st.Sort2(d, v7, v9); + st.Sort2(d, va, vc); + st.Sort2(d, v3, v4); + st.Sort2(d, v5, v6); + st.Sort2(d, v7, v8); + st.Sort2(d, v9, va); + st.Sort2(d, vb, vc); + st.Sort2(d, v6, v7); + st.Sort2(d, v8, v9); +} + +// ------------------------------ Merging networks + +// Blacher's hybrid bitonic/odd-even networks, generated by print_network.cc. +// For acceptable performance, these must be inlined, otherwise vectors are +// loaded from the stack. The kKeysPerVector allows calling from generic code +// but skipping the functions when vectors have too few lanes for +// st.SortPairsDistance1 to compile. 
`if constexpr` in the caller would also +// work, but is not available in C++11. We write out the (unused) argument types +// rather than `...` because GCC 9 (but not 10) fails to compile with `...`. + +template <size_t kKeysPerVector, class D, class Traits, class V, + HWY_IF_LANES_LE(kKeysPerVector, 1)> +HWY_INLINE void Merge8x2(D, Traits, V, V, V, V, V, V, V, V) {} +template <size_t kKeysPerVector, class D, class Traits, class V, + HWY_IF_LANES_LE(kKeysPerVector, 2)> +HWY_INLINE void Merge8x4(D, Traits, V, V, V, V, V, V, V, V) {} + +template <size_t kKeysPerVector, class D, class Traits, class V, + HWY_IF_LANES_LE(kKeysPerVector, 1)> +HWY_INLINE void Merge16x2(D, Traits, V, V, V, V, V, V, V, V, V, V, V, V, V, V, + V, V) {} +template <size_t kKeysPerVector, class D, class Traits, class V, + HWY_IF_LANES_LE(kKeysPerVector, 2)> +HWY_INLINE void Merge16x4(D, Traits, V, V, V, V, V, V, V, V, V, V, V, V, V, V, + V, V) {} +template <size_t kKeysPerVector, class D, class Traits, class V, + HWY_IF_LANES_LE(kKeysPerVector, 4)> +HWY_INLINE void Merge16x8(D, Traits, V, V, V, V, V, V, V, V, V, V, V, V, V, V, + V, V) {} +template <size_t kKeysPerVector, class D, class Traits, class V, + HWY_IF_LANES_LE(kKeysPerVector, 8)> +HWY_INLINE void Merge16x16(D, Traits, V, V, V, V, V, V, V, V, V, V, V, V, V, V, + V, V) {} + +template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>, + HWY_IF_LANES_GT(kKeysPerVector, 1)> +HWY_INLINE void Merge8x2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, + V& v5, V& v6, V& v7) { + v7 = st.ReverseKeys2(d, v7); + v6 = st.ReverseKeys2(d, v6); + v5 = st.ReverseKeys2(d, v5); + v4 = st.ReverseKeys2(d, v4); + st.Sort2(d, v0, v7); + st.Sort2(d, v1, v6); + st.Sort2(d, v2, v5); + st.Sort2(d, v3, v4); + + v3 = st.ReverseKeys2(d, v3); + v2 = st.ReverseKeys2(d, v2); + v7 = st.ReverseKeys2(d, v7); + v6 = st.ReverseKeys2(d, v6); + st.Sort2(d, v0, v3); + st.Sort2(d, v1, v2); + st.Sort2(d, v4, v7); + st.Sort2(d, v5, v6); + + v1 = st.ReverseKeys2(d, 
v1); + v3 = st.ReverseKeys2(d, v3); + v5 = st.ReverseKeys2(d, v5); + v7 = st.ReverseKeys2(d, v7); + st.Sort2(d, v0, v1); + st.Sort2(d, v2, v3); + st.Sort2(d, v4, v5); + st.Sort2(d, v6, v7); + + v0 = st.SortPairsDistance1(d, v0); + v1 = st.SortPairsDistance1(d, v1); + v2 = st.SortPairsDistance1(d, v2); + v3 = st.SortPairsDistance1(d, v3); + v4 = st.SortPairsDistance1(d, v4); + v5 = st.SortPairsDistance1(d, v5); + v6 = st.SortPairsDistance1(d, v6); + v7 = st.SortPairsDistance1(d, v7); +} + +template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>, + HWY_IF_LANES_GT(kKeysPerVector, 2)> +HWY_INLINE void Merge8x4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, + V& v5, V& v6, V& v7) { + v7 = st.ReverseKeys4(d, v7); + v6 = st.ReverseKeys4(d, v6); + v5 = st.ReverseKeys4(d, v5); + v4 = st.ReverseKeys4(d, v4); + st.Sort2(d, v0, v7); + st.Sort2(d, v1, v6); + st.Sort2(d, v2, v5); + st.Sort2(d, v3, v4); + + v3 = st.ReverseKeys4(d, v3); + v2 = st.ReverseKeys4(d, v2); + v7 = st.ReverseKeys4(d, v7); + v6 = st.ReverseKeys4(d, v6); + st.Sort2(d, v0, v3); + st.Sort2(d, v1, v2); + st.Sort2(d, v4, v7); + st.Sort2(d, v5, v6); + + v1 = st.ReverseKeys4(d, v1); + v3 = st.ReverseKeys4(d, v3); + v5 = st.ReverseKeys4(d, v5); + v7 = st.ReverseKeys4(d, v7); + st.Sort2(d, v0, v1); + st.Sort2(d, v2, v3); + st.Sort2(d, v4, v5); + st.Sort2(d, v6, v7); + + v0 = st.SortPairsReverse4(d, v0); + v1 = st.SortPairsReverse4(d, v1); + v2 = st.SortPairsReverse4(d, v2); + v3 = st.SortPairsReverse4(d, v3); + v4 = st.SortPairsReverse4(d, v4); + v5 = st.SortPairsReverse4(d, v5); + v6 = st.SortPairsReverse4(d, v6); + v7 = st.SortPairsReverse4(d, v7); + + v0 = st.SortPairsDistance1(d, v0); + v1 = st.SortPairsDistance1(d, v1); + v2 = st.SortPairsDistance1(d, v2); + v3 = st.SortPairsDistance1(d, v3); + v4 = st.SortPairsDistance1(d, v4); + v5 = st.SortPairsDistance1(d, v5); + v6 = st.SortPairsDistance1(d, v6); + v7 = st.SortPairsDistance1(d, v7); +} + +template <size_t kKeysPerVector, class D, 
class Traits, class V = Vec<D>, + HWY_IF_LANES_GT(kKeysPerVector, 1)> +HWY_INLINE void Merge16x2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, + V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb, + V& vc, V& vd, V& ve, V& vf) { + vf = st.ReverseKeys2(d, vf); + ve = st.ReverseKeys2(d, ve); + vd = st.ReverseKeys2(d, vd); + vc = st.ReverseKeys2(d, vc); + vb = st.ReverseKeys2(d, vb); + va = st.ReverseKeys2(d, va); + v9 = st.ReverseKeys2(d, v9); + v8 = st.ReverseKeys2(d, v8); + st.Sort2(d, v0, vf); + st.Sort2(d, v1, ve); + st.Sort2(d, v2, vd); + st.Sort2(d, v3, vc); + st.Sort2(d, v4, vb); + st.Sort2(d, v5, va); + st.Sort2(d, v6, v9); + st.Sort2(d, v7, v8); + + v7 = st.ReverseKeys2(d, v7); + v6 = st.ReverseKeys2(d, v6); + v5 = st.ReverseKeys2(d, v5); + v4 = st.ReverseKeys2(d, v4); + vf = st.ReverseKeys2(d, vf); + ve = st.ReverseKeys2(d, ve); + vd = st.ReverseKeys2(d, vd); + vc = st.ReverseKeys2(d, vc); + st.Sort2(d, v0, v7); + st.Sort2(d, v1, v6); + st.Sort2(d, v2, v5); + st.Sort2(d, v3, v4); + st.Sort2(d, v8, vf); + st.Sort2(d, v9, ve); + st.Sort2(d, va, vd); + st.Sort2(d, vb, vc); + + v3 = st.ReverseKeys2(d, v3); + v2 = st.ReverseKeys2(d, v2); + v7 = st.ReverseKeys2(d, v7); + v6 = st.ReverseKeys2(d, v6); + vb = st.ReverseKeys2(d, vb); + va = st.ReverseKeys2(d, va); + vf = st.ReverseKeys2(d, vf); + ve = st.ReverseKeys2(d, ve); + st.Sort2(d, v0, v3); + st.Sort2(d, v1, v2); + st.Sort2(d, v4, v7); + st.Sort2(d, v5, v6); + st.Sort2(d, v8, vb); + st.Sort2(d, v9, va); + st.Sort2(d, vc, vf); + st.Sort2(d, vd, ve); + + v1 = st.ReverseKeys2(d, v1); + v3 = st.ReverseKeys2(d, v3); + v5 = st.ReverseKeys2(d, v5); + v7 = st.ReverseKeys2(d, v7); + v9 = st.ReverseKeys2(d, v9); + vb = st.ReverseKeys2(d, vb); + vd = st.ReverseKeys2(d, vd); + vf = st.ReverseKeys2(d, vf); + st.Sort2(d, v0, v1); + st.Sort2(d, v2, v3); + st.Sort2(d, v4, v5); + st.Sort2(d, v6, v7); + st.Sort2(d, v8, v9); + st.Sort2(d, va, vb); + st.Sort2(d, vc, vd); + st.Sort2(d, ve, vf); + + v0 = st.SortPairsDistance1(d, 
v0); + v1 = st.SortPairsDistance1(d, v1); + v2 = st.SortPairsDistance1(d, v2); + v3 = st.SortPairsDistance1(d, v3); + v4 = st.SortPairsDistance1(d, v4); + v5 = st.SortPairsDistance1(d, v5); + v6 = st.SortPairsDistance1(d, v6); + v7 = st.SortPairsDistance1(d, v7); + v8 = st.SortPairsDistance1(d, v8); + v9 = st.SortPairsDistance1(d, v9); + va = st.SortPairsDistance1(d, va); + vb = st.SortPairsDistance1(d, vb); + vc = st.SortPairsDistance1(d, vc); + vd = st.SortPairsDistance1(d, vd); + ve = st.SortPairsDistance1(d, ve); + vf = st.SortPairsDistance1(d, vf); +} + +template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>, + HWY_IF_LANES_GT(kKeysPerVector, 2)> +HWY_INLINE void Merge16x4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, + V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb, + V& vc, V& vd, V& ve, V& vf) { + vf = st.ReverseKeys4(d, vf); + ve = st.ReverseKeys4(d, ve); + vd = st.ReverseKeys4(d, vd); + vc = st.ReverseKeys4(d, vc); + vb = st.ReverseKeys4(d, vb); + va = st.ReverseKeys4(d, va); + v9 = st.ReverseKeys4(d, v9); + v8 = st.ReverseKeys4(d, v8); + st.Sort2(d, v0, vf); + st.Sort2(d, v1, ve); + st.Sort2(d, v2, vd); + st.Sort2(d, v3, vc); + st.Sort2(d, v4, vb); + st.Sort2(d, v5, va); + st.Sort2(d, v6, v9); + st.Sort2(d, v7, v8); + + v7 = st.ReverseKeys4(d, v7); + v6 = st.ReverseKeys4(d, v6); + v5 = st.ReverseKeys4(d, v5); + v4 = st.ReverseKeys4(d, v4); + vf = st.ReverseKeys4(d, vf); + ve = st.ReverseKeys4(d, ve); + vd = st.ReverseKeys4(d, vd); + vc = st.ReverseKeys4(d, vc); + st.Sort2(d, v0, v7); + st.Sort2(d, v1, v6); + st.Sort2(d, v2, v5); + st.Sort2(d, v3, v4); + st.Sort2(d, v8, vf); + st.Sort2(d, v9, ve); + st.Sort2(d, va, vd); + st.Sort2(d, vb, vc); + + v3 = st.ReverseKeys4(d, v3); + v2 = st.ReverseKeys4(d, v2); + v7 = st.ReverseKeys4(d, v7); + v6 = st.ReverseKeys4(d, v6); + vb = st.ReverseKeys4(d, vb); + va = st.ReverseKeys4(d, va); + vf = st.ReverseKeys4(d, vf); + ve = st.ReverseKeys4(d, ve); + st.Sort2(d, v0, v3); + st.Sort2(d, v1, 
v2); + st.Sort2(d, v4, v7); + st.Sort2(d, v5, v6); + st.Sort2(d, v8, vb); + st.Sort2(d, v9, va); + st.Sort2(d, vc, vf); + st.Sort2(d, vd, ve); + + v1 = st.ReverseKeys4(d, v1); + v3 = st.ReverseKeys4(d, v3); + v5 = st.ReverseKeys4(d, v5); + v7 = st.ReverseKeys4(d, v7); + v9 = st.ReverseKeys4(d, v9); + vb = st.ReverseKeys4(d, vb); + vd = st.ReverseKeys4(d, vd); + vf = st.ReverseKeys4(d, vf); + st.Sort2(d, v0, v1); + st.Sort2(d, v2, v3); + st.Sort2(d, v4, v5); + st.Sort2(d, v6, v7); + st.Sort2(d, v8, v9); + st.Sort2(d, va, vb); + st.Sort2(d, vc, vd); + st.Sort2(d, ve, vf); + + v0 = st.SortPairsReverse4(d, v0); + v1 = st.SortPairsReverse4(d, v1); + v2 = st.SortPairsReverse4(d, v2); + v3 = st.SortPairsReverse4(d, v3); + v4 = st.SortPairsReverse4(d, v4); + v5 = st.SortPairsReverse4(d, v5); + v6 = st.SortPairsReverse4(d, v6); + v7 = st.SortPairsReverse4(d, v7); + v8 = st.SortPairsReverse4(d, v8); + v9 = st.SortPairsReverse4(d, v9); + va = st.SortPairsReverse4(d, va); + vb = st.SortPairsReverse4(d, vb); + vc = st.SortPairsReverse4(d, vc); + vd = st.SortPairsReverse4(d, vd); + ve = st.SortPairsReverse4(d, ve); + vf = st.SortPairsReverse4(d, vf); + + v0 = st.SortPairsDistance1(d, v0); + v1 = st.SortPairsDistance1(d, v1); + v2 = st.SortPairsDistance1(d, v2); + v3 = st.SortPairsDistance1(d, v3); + v4 = st.SortPairsDistance1(d, v4); + v5 = st.SortPairsDistance1(d, v5); + v6 = st.SortPairsDistance1(d, v6); + v7 = st.SortPairsDistance1(d, v7); + v8 = st.SortPairsDistance1(d, v8); + v9 = st.SortPairsDistance1(d, v9); + va = st.SortPairsDistance1(d, va); + vb = st.SortPairsDistance1(d, vb); + vc = st.SortPairsDistance1(d, vc); + vd = st.SortPairsDistance1(d, vd); + ve = st.SortPairsDistance1(d, ve); + vf = st.SortPairsDistance1(d, vf); +} + +template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>, + HWY_IF_LANES_GT(kKeysPerVector, 4)> +HWY_INLINE void Merge16x8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, + V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb, + 
V& vc, V& vd, V& ve, V& vf) { + vf = st.ReverseKeys8(d, vf); + ve = st.ReverseKeys8(d, ve); + vd = st.ReverseKeys8(d, vd); + vc = st.ReverseKeys8(d, vc); + vb = st.ReverseKeys8(d, vb); + va = st.ReverseKeys8(d, va); + v9 = st.ReverseKeys8(d, v9); + v8 = st.ReverseKeys8(d, v8); + st.Sort2(d, v0, vf); + st.Sort2(d, v1, ve); + st.Sort2(d, v2, vd); + st.Sort2(d, v3, vc); + st.Sort2(d, v4, vb); + st.Sort2(d, v5, va); + st.Sort2(d, v6, v9); + st.Sort2(d, v7, v8); + + v7 = st.ReverseKeys8(d, v7); + v6 = st.ReverseKeys8(d, v6); + v5 = st.ReverseKeys8(d, v5); + v4 = st.ReverseKeys8(d, v4); + vf = st.ReverseKeys8(d, vf); + ve = st.ReverseKeys8(d, ve); + vd = st.ReverseKeys8(d, vd); + vc = st.ReverseKeys8(d, vc); + st.Sort2(d, v0, v7); + st.Sort2(d, v1, v6); + st.Sort2(d, v2, v5); + st.Sort2(d, v3, v4); + st.Sort2(d, v8, vf); + st.Sort2(d, v9, ve); + st.Sort2(d, va, vd); + st.Sort2(d, vb, vc); + + v3 = st.ReverseKeys8(d, v3); + v2 = st.ReverseKeys8(d, v2); + v7 = st.ReverseKeys8(d, v7); + v6 = st.ReverseKeys8(d, v6); + vb = st.ReverseKeys8(d, vb); + va = st.ReverseKeys8(d, va); + vf = st.ReverseKeys8(d, vf); + ve = st.ReverseKeys8(d, ve); + st.Sort2(d, v0, v3); + st.Sort2(d, v1, v2); + st.Sort2(d, v4, v7); + st.Sort2(d, v5, v6); + st.Sort2(d, v8, vb); + st.Sort2(d, v9, va); + st.Sort2(d, vc, vf); + st.Sort2(d, vd, ve); + + v1 = st.ReverseKeys8(d, v1); + v3 = st.ReverseKeys8(d, v3); + v5 = st.ReverseKeys8(d, v5); + v7 = st.ReverseKeys8(d, v7); + v9 = st.ReverseKeys8(d, v9); + vb = st.ReverseKeys8(d, vb); + vd = st.ReverseKeys8(d, vd); + vf = st.ReverseKeys8(d, vf); + st.Sort2(d, v0, v1); + st.Sort2(d, v2, v3); + st.Sort2(d, v4, v5); + st.Sort2(d, v6, v7); + st.Sort2(d, v8, v9); + st.Sort2(d, va, vb); + st.Sort2(d, vc, vd); + st.Sort2(d, ve, vf); + + v0 = st.SortPairsReverse8(d, v0); + v1 = st.SortPairsReverse8(d, v1); + v2 = st.SortPairsReverse8(d, v2); + v3 = st.SortPairsReverse8(d, v3); + v4 = st.SortPairsReverse8(d, v4); + v5 = st.SortPairsReverse8(d, v5); + v6 = 
st.SortPairsReverse8(d, v6); + v7 = st.SortPairsReverse8(d, v7); + v8 = st.SortPairsReverse8(d, v8); + v9 = st.SortPairsReverse8(d, v9); + va = st.SortPairsReverse8(d, va); + vb = st.SortPairsReverse8(d, vb); + vc = st.SortPairsReverse8(d, vc); + vd = st.SortPairsReverse8(d, vd); + ve = st.SortPairsReverse8(d, ve); + vf = st.SortPairsReverse8(d, vf); + + v0 = st.SortPairsDistance2(d, v0); + v1 = st.SortPairsDistance2(d, v1); + v2 = st.SortPairsDistance2(d, v2); + v3 = st.SortPairsDistance2(d, v3); + v4 = st.SortPairsDistance2(d, v4); + v5 = st.SortPairsDistance2(d, v5); + v6 = st.SortPairsDistance2(d, v6); + v7 = st.SortPairsDistance2(d, v7); + v8 = st.SortPairsDistance2(d, v8); + v9 = st.SortPairsDistance2(d, v9); + va = st.SortPairsDistance2(d, va); + vb = st.SortPairsDistance2(d, vb); + vc = st.SortPairsDistance2(d, vc); + vd = st.SortPairsDistance2(d, vd); + ve = st.SortPairsDistance2(d, ve); + vf = st.SortPairsDistance2(d, vf); + + v0 = st.SortPairsDistance1(d, v0); + v1 = st.SortPairsDistance1(d, v1); + v2 = st.SortPairsDistance1(d, v2); + v3 = st.SortPairsDistance1(d, v3); + v4 = st.SortPairsDistance1(d, v4); + v5 = st.SortPairsDistance1(d, v5); + v6 = st.SortPairsDistance1(d, v6); + v7 = st.SortPairsDistance1(d, v7); + v8 = st.SortPairsDistance1(d, v8); + v9 = st.SortPairsDistance1(d, v9); + va = st.SortPairsDistance1(d, va); + vb = st.SortPairsDistance1(d, vb); + vc = st.SortPairsDistance1(d, vc); + vd = st.SortPairsDistance1(d, vd); + ve = st.SortPairsDistance1(d, ve); + vf = st.SortPairsDistance1(d, vf); +} + +// Unused on MSVC, see below +#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD + +template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>, + HWY_IF_LANES_GT(kKeysPerVector, 8)> +HWY_INLINE void Merge16x16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, + V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb, + V& vc, V& vd, V& ve, V& vf) { + vf = st.ReverseKeys16(d, vf); + ve = st.ReverseKeys16(d, ve); + vd = st.ReverseKeys16(d, vd); + 
vc = st.ReverseKeys16(d, vc); + vb = st.ReverseKeys16(d, vb); + va = st.ReverseKeys16(d, va); + v9 = st.ReverseKeys16(d, v9); + v8 = st.ReverseKeys16(d, v8); + st.Sort2(d, v0, vf); + st.Sort2(d, v1, ve); + st.Sort2(d, v2, vd); + st.Sort2(d, v3, vc); + st.Sort2(d, v4, vb); + st.Sort2(d, v5, va); + st.Sort2(d, v6, v9); + st.Sort2(d, v7, v8); + + v7 = st.ReverseKeys16(d, v7); + v6 = st.ReverseKeys16(d, v6); + v5 = st.ReverseKeys16(d, v5); + v4 = st.ReverseKeys16(d, v4); + vf = st.ReverseKeys16(d, vf); + ve = st.ReverseKeys16(d, ve); + vd = st.ReverseKeys16(d, vd); + vc = st.ReverseKeys16(d, vc); + st.Sort2(d, v0, v7); + st.Sort2(d, v1, v6); + st.Sort2(d, v2, v5); + st.Sort2(d, v3, v4); + st.Sort2(d, v8, vf); + st.Sort2(d, v9, ve); + st.Sort2(d, va, vd); + st.Sort2(d, vb, vc); + + v3 = st.ReverseKeys16(d, v3); + v2 = st.ReverseKeys16(d, v2); + v7 = st.ReverseKeys16(d, v7); + v6 = st.ReverseKeys16(d, v6); + vb = st.ReverseKeys16(d, vb); + va = st.ReverseKeys16(d, va); + vf = st.ReverseKeys16(d, vf); + ve = st.ReverseKeys16(d, ve); + st.Sort2(d, v0, v3); + st.Sort2(d, v1, v2); + st.Sort2(d, v4, v7); + st.Sort2(d, v5, v6); + st.Sort2(d, v8, vb); + st.Sort2(d, v9, va); + st.Sort2(d, vc, vf); + st.Sort2(d, vd, ve); + + v1 = st.ReverseKeys16(d, v1); + v3 = st.ReverseKeys16(d, v3); + v5 = st.ReverseKeys16(d, v5); + v7 = st.ReverseKeys16(d, v7); + v9 = st.ReverseKeys16(d, v9); + vb = st.ReverseKeys16(d, vb); + vd = st.ReverseKeys16(d, vd); + vf = st.ReverseKeys16(d, vf); + st.Sort2(d, v0, v1); + st.Sort2(d, v2, v3); + st.Sort2(d, v4, v5); + st.Sort2(d, v6, v7); + st.Sort2(d, v8, v9); + st.Sort2(d, va, vb); + st.Sort2(d, vc, vd); + st.Sort2(d, ve, vf); + + v0 = st.SortPairsReverse16(d, v0); + v1 = st.SortPairsReverse16(d, v1); + v2 = st.SortPairsReverse16(d, v2); + v3 = st.SortPairsReverse16(d, v3); + v4 = st.SortPairsReverse16(d, v4); + v5 = st.SortPairsReverse16(d, v5); + v6 = st.SortPairsReverse16(d, v6); + v7 = st.SortPairsReverse16(d, v7); + v8 = st.SortPairsReverse16(d, 
v8); + v9 = st.SortPairsReverse16(d, v9); + va = st.SortPairsReverse16(d, va); + vb = st.SortPairsReverse16(d, vb); + vc = st.SortPairsReverse16(d, vc); + vd = st.SortPairsReverse16(d, vd); + ve = st.SortPairsReverse16(d, ve); + vf = st.SortPairsReverse16(d, vf); + + v0 = st.SortPairsDistance4(d, v0); + v1 = st.SortPairsDistance4(d, v1); + v2 = st.SortPairsDistance4(d, v2); + v3 = st.SortPairsDistance4(d, v3); + v4 = st.SortPairsDistance4(d, v4); + v5 = st.SortPairsDistance4(d, v5); + v6 = st.SortPairsDistance4(d, v6); + v7 = st.SortPairsDistance4(d, v7); + v8 = st.SortPairsDistance4(d, v8); + v9 = st.SortPairsDistance4(d, v9); + va = st.SortPairsDistance4(d, va); + vb = st.SortPairsDistance4(d, vb); + vc = st.SortPairsDistance4(d, vc); + vd = st.SortPairsDistance4(d, vd); + ve = st.SortPairsDistance4(d, ve); + vf = st.SortPairsDistance4(d, vf); + + v0 = st.SortPairsDistance2(d, v0); + v1 = st.SortPairsDistance2(d, v1); + v2 = st.SortPairsDistance2(d, v2); + v3 = st.SortPairsDistance2(d, v3); + v4 = st.SortPairsDistance2(d, v4); + v5 = st.SortPairsDistance2(d, v5); + v6 = st.SortPairsDistance2(d, v6); + v7 = st.SortPairsDistance2(d, v7); + v8 = st.SortPairsDistance2(d, v8); + v9 = st.SortPairsDistance2(d, v9); + va = st.SortPairsDistance2(d, va); + vb = st.SortPairsDistance2(d, vb); + vc = st.SortPairsDistance2(d, vc); + vd = st.SortPairsDistance2(d, vd); + ve = st.SortPairsDistance2(d, ve); + vf = st.SortPairsDistance2(d, vf); + + v0 = st.SortPairsDistance1(d, v0); + v1 = st.SortPairsDistance1(d, v1); + v2 = st.SortPairsDistance1(d, v2); + v3 = st.SortPairsDistance1(d, v3); + v4 = st.SortPairsDistance1(d, v4); + v5 = st.SortPairsDistance1(d, v5); + v6 = st.SortPairsDistance1(d, v6); + v7 = st.SortPairsDistance1(d, v7); + v8 = st.SortPairsDistance1(d, v8); + v9 = st.SortPairsDistance1(d, v9); + va = st.SortPairsDistance1(d, va); + vb = st.SortPairsDistance1(d, vb); + vc = st.SortPairsDistance1(d, vc); + vd = st.SortPairsDistance1(d, vd); + ve = 
st.SortPairsDistance1(d, ve); + vf = st.SortPairsDistance1(d, vf); +} + +#endif // !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD + +// Reshapes `buf` into a matrix, sorts columns independently, and then merges +// into a sorted 1D array without transposing. +// +// DEPRECATED, use BaseCase() instead. +template <class Traits, class V> +HWY_INLINE void SortingNetwork(Traits st, size_t cols, V& v0, V& v1, V& v2, + V& v3, V& v4, V& v5, V& v6, V& v7, V& v8, V& v9, + V& va, V& vb, V& vc, V& vd, V& ve, V& vf) { + // traits*-inl assume 'full' vectors (but still capped to kMaxCols). + const CappedTag<typename Traits::LaneType, Constants::kMaxCols> d; + + HWY_DASSERT(cols <= Constants::kMaxCols); + + // The network width depends on the number of keys, not lanes. + constexpr size_t kLanesPerKey = st.LanesPerKey(); + const size_t keys = cols / kLanesPerKey; + constexpr size_t kMaxKeys = MaxLanes(d) / kLanesPerKey; + + Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, vf); + + // Checking MaxLanes avoids generating HWY_ASSERT code for the unreachable + // code paths: if MaxLanes < 2, then keys <= cols < 2. + if (HWY_LIKELY(keys >= 2 && kMaxKeys >= 2)) { + Merge16x2<kMaxKeys>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, + vc, vd, ve, vf); + + if (HWY_LIKELY(keys >= 4 && kMaxKeys >= 4)) { + Merge16x4<kMaxKeys>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, + vc, vd, ve, vf); + + if (HWY_LIKELY(keys >= 8 && kMaxKeys >= 8)) { + Merge16x8<kMaxKeys>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, + vb, vc, vd, ve, vf); + + // Avoids build timeout. Must match #if condition in kMaxCols. +#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD + if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) { + Merge16x16<kMaxKeys>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, + va, vb, vc, vd, ve, vf); + + static_assert(Constants::kMaxCols <= 16, "Add more branches"); + } +#endif + } + } + } +} + +// As above, but loads from/stores to `buf`. 
This ensures full vectors are +// aligned, and enables loads/stores without bounds checks. +// +// DEPRECATED, use BaseCase() instead. +template <class Traits, typename T> +HWY_NOINLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) { + // traits*-inl assume 'full' vectors (but still capped to kMaxCols). + // However, for smaller arrays and sub-maximal `cols` we have overlapping + // loads where only the lowest `cols` are valid, and we skip Merge16 etc. + const CappedTag<T, Constants::kMaxCols> d; + using V = decltype(Zero(d)); + + HWY_DASSERT(cols <= Constants::kMaxCols); + + // These are aligned iff cols == Lanes(d). We prefer unaligned/non-constexpr + // offsets to duplicating this code for every value of cols. + static_assert(Constants::kMaxRows == 16, "Update loads/stores/args"); + V v0 = LoadU(d, buf + 0x0 * cols); + V v1 = LoadU(d, buf + 0x1 * cols); + V v2 = LoadU(d, buf + 0x2 * cols); + V v3 = LoadU(d, buf + 0x3 * cols); + V v4 = LoadU(d, buf + 0x4 * cols); + V v5 = LoadU(d, buf + 0x5 * cols); + V v6 = LoadU(d, buf + 0x6 * cols); + V v7 = LoadU(d, buf + 0x7 * cols); + V v8 = LoadU(d, buf + 0x8 * cols); + V v9 = LoadU(d, buf + 0x9 * cols); + V va = LoadU(d, buf + 0xa * cols); + V vb = LoadU(d, buf + 0xb * cols); + V vc = LoadU(d, buf + 0xc * cols); + V vd = LoadU(d, buf + 0xd * cols); + V ve = LoadU(d, buf + 0xe * cols); + V vf = LoadU(d, buf + 0xf * cols); + + SortingNetwork(st, cols, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, + vd, ve, vf); + + StoreU(v0, d, buf + 0x0 * cols); + StoreU(v1, d, buf + 0x1 * cols); + StoreU(v2, d, buf + 0x2 * cols); + StoreU(v3, d, buf + 0x3 * cols); + StoreU(v4, d, buf + 0x4 * cols); + StoreU(v5, d, buf + 0x5 * cols); + StoreU(v6, d, buf + 0x6 * cols); + StoreU(v7, d, buf + 0x7 * cols); + StoreU(v8, d, buf + 0x8 * cols); + StoreU(v9, d, buf + 0x9 * cols); + StoreU(va, d, buf + 0xa * cols); + StoreU(vb, d, buf + 0xb * cols); + StoreU(vc, d, buf + 0xc * cols); + StoreU(vd, d, buf + 0xd * cols); + 
StoreU(ve, d, buf + 0xe * cols); + StoreU(vf, d, buf + 0xf * cols); +} + +#else +template <class Base> +struct SharedTraits : public Base {}; +#endif // VQSORT_ENABLED + +} // namespace detail +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE diff --git a/third_party/highway/hwy/contrib/sort/traits-inl.h b/third_party/highway/hwy/contrib/sort/traits-inl.h new file mode 100644 index 0000000000..732f87ee23 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/traits-inl.h @@ -0,0 +1,561 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Per-target +#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE +#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE +#else +#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE +#endif + +#include "hwy/contrib/sort/shared-inl.h" // SortConstants +#include "hwy/contrib/sort/vqsort.h" // SortDescending +#include "hwy/highway.h" +#include "hwy/print.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace detail { + +// Base class of both KeyLane (with or without VQSORT_ENABLED) +template <typename T> +struct KeyLaneBase { + static constexpr bool Is128() { return false; } + constexpr size_t LanesPerKey() const { return 1; } + + // What type bench_sort should allocate for generating inputs. + using LaneType = T; + // What type to pass to VQSort. + using KeyType = T; + + const char* KeyString() const { + return IsSame<T, float>() ? "f32" + : IsSame<T, double>() ? "f64" + : IsSame<T, int16_t>() ? "i16" + : IsSame<T, int32_t>() ? "i32" + : IsSame<T, int64_t>() ? "i64" + : IsSame<T, uint16_t>() ? "u32" + : IsSame<T, uint32_t>() ? "u32" + : IsSame<T, uint64_t>() ? "u64" + : "?"; + } +}; + +#if VQSORT_ENABLED || HWY_IDE + +// Highway does not provide a lane type for 128-bit keys, so we use uint64_t +// along with an abstraction layer for single-lane vs. lane-pair, which is +// independent of the order. +template <typename T> +struct KeyLane : public KeyLaneBase<T> { + // False indicates the entire key (i.e. lane) should be compared. KV stands + // for key-value. 
static constexpr bool IsKV() { return false; }

  // For HeapSort: exchanges the two keys that `a` and `b` point to.
  HWY_INLINE void Swap(T* a, T* b) const {
    const T temp = *a;
    *a = *b;
    *b = temp;
  }

  // Forwards to CompressNot(keys, mask); one key is one lane here.
  template <class V, class M>
  HWY_INLINE V CompressKeys(V keys, M mask) const {
    return CompressNot(keys, mask);
  }

  // Broadcasts one key into a vector
  template <class D>
  HWY_INLINE Vec<D> SetKey(D d, const T* key) const {
    return Set(d, *key);
  }

  // Per-lane equality mask of `a` and `b`.
  template <class D>
  HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
    return Eq(a, b);
  }

  // Per-lane inequality mask of `a` and `b`.
  template <class D>
  HWY_INLINE Mask<D> NotEqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
    return Ne(a, b);
  }

  // For keys=lanes, any difference counts.
  // Returns true iff every lane of `diff` has an all-zero bit pattern.
  template <class D>
  HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
    // Must avoid floating-point comparisons (for -0)
    const RebindToUnsigned<D> du;
    return AllTrue(du, Eq(BitCast(du, diff), Zero(du)));
  }

  // Scalar equality of the two keys pointed to.
  HWY_INLINE bool Equal1(const T* a, const T* b) const { return *a == *b; }

  // ReverseKeys* forward to the Highway Reverse* op of the same group size;
  // because a key is a single lane, reversing keys equals reversing lanes.
  template <class D>
  HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
    return Reverse(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys2(D d, Vec<D> v) const {
    return Reverse2(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys4(D d, Vec<D> v) const {
    return Reverse4(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys8(D d, Vec<D> v) const {
    return Reverse8(d, v);
  }

  // Implemented as a reversal of the whole vector; the static_assert guards
  // the assumption that a vector never holds more than 16 columns.
  template <class D>
  HWY_INLINE Vec<D> ReverseKeys16(D d, Vec<D> v) const {
    static_assert(SortConstants::kMaxCols <= 16, "Assumes u32x16 = 512 bit");
    return ReverseKeys(d, v);
  }

  // Forwards to OddEven(odd, even).
  template <class V>
  HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
    return OddEven(odd, even);
  }

  // SwapAdjacentPairs is overloaded on the lane size in bytes (2, 4 or 8).
  // 2-byte lanes: reinterpret as u32 and apply Shuffle2301.
  template <class D, HWY_IF_T_SIZE_D(D, 2)>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D d, const Vec<D> v) const {
    const Repartition<uint32_t, D> du32;
    return BitCast(d, Shuffle2301(BitCast(du32, v)));
  }
  // 4-byte lanes: Shuffle1032.
  template <class D, HWY_IF_T_SIZE_D(D, 4)>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
    return Shuffle1032(v);
  }
  // 8-byte lanes: SwapAdjacentBlocks.
  template <class D, HWY_IF_T_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
    return SwapAdjacentBlocks(v);
  }

  // Lanes narrower than 8 bytes: reinterpret as lanes of twice the width and
  // reuse SwapAdjacentPairs on those.
  template <class D, HWY_IF_NOT_T_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
#if HWY_HAVE_FLOAT64  // in case D is float32
    const RepartitionToWide<D> dw;
#else
    const RepartitionToWide<RebindToUnsigned<D> > dw;
#endif
    return BitCast(d, SwapAdjacentPairs(dw, BitCast(dw, v)));
  }
  // 8-byte lanes: ConcatLowerUpper of the vector with itself.
  template <class D, HWY_IF_T_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
    // Assumes max vector size = 512
    return ConcatLowerUpper(d, v, v);
  }

  // Lanes narrower than 8 bytes: OddEven applied to lanes of twice the width.
  template <class D, HWY_IF_NOT_T_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
                                 const Vec<D> even) const {
#if HWY_HAVE_FLOAT64  // in case D is float32
    const RepartitionToWide<D> dw;
#else
    const RepartitionToWide<RebindToUnsigned<D> > dw;
#endif
    return BitCast(d, OddEven(BitCast(dw, odd), BitCast(dw, even)));
  }
  // 8-byte lanes: OddEvenBlocks.
  template <class D, HWY_IF_T_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenPairs(D /* tag */, Vec<D> odd, Vec<D> even) const {
    return OddEvenBlocks(odd, even);
  }

  // Lanes narrower than 8 bytes: OddEvenPairs applied to lanes of twice the
  // width.
  template <class D, HWY_IF_NOT_T_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
#if HWY_HAVE_FLOAT64  // in case D is float32
    const RepartitionToWide<D> dw;
#else
    const RepartitionToWide<RebindToUnsigned<D> > dw;
#endif
    return BitCast(d, OddEvenPairs(dw, BitCast(dw, odd), BitCast(dw, even)));
  }
  // 8-byte lanes: ConcatUpperLower(d, odd, even).
  template <class D, HWY_IF_T_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
    return ConcatUpperLower(d, odd, even);
  }
};

// Anything order-related depends on the key traits *and* the order (see
// FirstOfLanes). We cannot implement just one Compare function because Lt128
// only compiles if the lane type is u64.
Thus we need either overloaded +// functions with a tag type, class specializations, or separate classes. +// We avoid overloaded functions because we want all functions to be callable +// from a SortTraits without per-function wrappers. Specializing would work, but +// we are anyway going to specialize at a higher level. +template <typename T> +struct OrderAscending : public KeyLane<T> { + using Order = SortAscending; + + HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; } + + template <class D> + HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const { + return Lt(a, b); + } + + // Two halves of Sort2, used in ScanMinMax. + template <class D> + HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const { + return Min(a, b); + } + + template <class D> + HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const { + return Max(a, b); + } + + template <class D> + HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v, + T* HWY_RESTRICT /* buf */) const { + return MinOfLanes(d, v); + } + + template <class D> + HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v, + T* HWY_RESTRICT /* buf */) const { + return MaxOfLanes(d, v); + } + + template <class D> + HWY_INLINE Vec<D> FirstValue(D d) const { + return Set(d, hwy::LowestValue<T>()); + } + + template <class D> + HWY_INLINE Vec<D> LastValue(D d) const { + return Set(d, hwy::HighestValue<T>()); + } + + template <class D> + HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const { + return Sub(v, Set(d, hwy::Epsilon<T>())); + } +}; + +template <typename T> +struct OrderDescending : public KeyLane<T> { + using Order = SortDescending; + + HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; } + + template <class D> + HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const { + return Lt(b, a); + } + + template <class D> + HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const { + return Max(a, b); + } + + template <class D> + HWY_INLINE Vec<D> 
Last(D /* tag */, const Vec<D> a, const Vec<D> b) const { + return Min(a, b); + } + + template <class D> + HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v, + T* HWY_RESTRICT /* buf */) const { + return MaxOfLanes(d, v); + } + + template <class D> + HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v, + T* HWY_RESTRICT /* buf */) const { + return MinOfLanes(d, v); + } + + template <class D> + HWY_INLINE Vec<D> FirstValue(D d) const { + return Set(d, hwy::HighestValue<T>()); + } + + template <class D> + HWY_INLINE Vec<D> LastValue(D d) const { + return Set(d, hwy::LowestValue<T>()); + } + + template <class D> + HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const { + return Add(v, Set(d, hwy::Epsilon<T>())); + } +}; + +struct KeyValue64 : public KeyLane<uint64_t> { + // True indicates only part of the key (i.e. lane) should be compared. KV + // stands for key-value. + static constexpr bool IsKV() { return true; } + + template <class D> + HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const { + return Eq(ShiftRight<32>(a), ShiftRight<32>(b)); + } + + template <class D> + HWY_INLINE Mask<D> NotEqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const { + return Ne(ShiftRight<32>(a), ShiftRight<32>(b)); + } + + HWY_INLINE bool Equal1(const uint64_t* a, const uint64_t* b) const { + return (*a >> 32) == (*b >> 32); + } + + // Only count differences in the actual key, not the value. 
+ template <class D> + HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const { + // Must avoid floating-point comparisons (for -0) + const RebindToUnsigned<D> du; + const Vec<decltype(du)> zero = Zero(du); + const Vec<decltype(du)> keys = ShiftRight<32>(diff); // clear values + return AllTrue(du, Eq(BitCast(du, keys), zero)); + } +}; + +struct OrderAscendingKV64 : public KeyValue64 { + using Order = SortAscending; + + HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) { + return (*a >> 32) < (*b >> 32); + } + + template <class D> + HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const { + return Lt(ShiftRight<32>(a), ShiftRight<32>(b)); + } + + // Not required to be stable (preserving the order of equivalent keys), so + // we can include the value in the comparison. + template <class D> + HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const { + return Min(a, b); + } + + template <class D> + HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const { + return Max(a, b); + } + + template <class D> + HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v, + uint64_t* HWY_RESTRICT /* buf */) const { + return MinOfLanes(d, v); + } + + template <class D> + HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v, + uint64_t* HWY_RESTRICT /* buf */) const { + return MaxOfLanes(d, v); + } + + // Same as for regular lanes. 
+ template <class D> + HWY_INLINE Vec<D> FirstValue(D d) const { + return Set(d, hwy::LowestValue<TFromD<D> >()); + } + + template <class D> + HWY_INLINE Vec<D> LastValue(D d) const { + return Set(d, hwy::HighestValue<TFromD<D> >()); + } + + template <class D> + HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const { + return Sub(v, Set(d, uint64_t{1})); + } +}; + +struct OrderDescendingKV64 : public KeyValue64 { + using Order = SortDescending; + + HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) { + return (*b >> 32) < (*a >> 32); + } + + template <class D> + HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const { + return Lt(ShiftRight<32>(b), ShiftRight<32>(a)); + } + + // Not required to be stable (preserving the order of equivalent keys), so + // we can include the value in the comparison. + template <class D> + HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const { + return Max(a, b); + } + + template <class D> + HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const { + return Min(a, b); + } + + template <class D> + HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v, + uint64_t* HWY_RESTRICT /* buf */) const { + return MaxOfLanes(d, v); + } + + template <class D> + HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v, + uint64_t* HWY_RESTRICT /* buf */) const { + return MinOfLanes(d, v); + } + + template <class D> + HWY_INLINE Vec<D> FirstValue(D d) const { + return Set(d, hwy::HighestValue<TFromD<D> >()); + } + + template <class D> + HWY_INLINE Vec<D> LastValue(D d) const { + return Set(d, hwy::LowestValue<TFromD<D> >()); + } + + template <class D> + HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const { + return Add(v, Set(d, uint64_t{1})); + } +}; + +// Shared code that depends on Order. +template <class Base> +struct TraitsLane : public Base { + // For each lane i: replaces a[i] with the first and b[i] with the second + // according to Base. 
+ // Corresponds to a conditional swap, which is one "node" of a sorting + // network. Min/Max are cheaper than compare + blend at least for integers. + template <class D> + HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const { + const Base* base = static_cast<const Base*>(this); + + const Vec<D> a_copy = a; + // Prior to AVX3, there is no native 64-bit Min/Max, so they compile to 4 + // instructions. We can reduce it to a compare + 2 IfThenElse. +#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3 + if (sizeof(TFromD<D>) == 8) { + const Mask<D> cmp = base->Compare(d, a, b); + a = IfThenElse(cmp, a, b); + b = IfThenElse(cmp, b, a_copy); + return; + } +#endif + a = base->First(d, a, b); + b = base->Last(d, a_copy, b); + } + + // Conditionally swaps even-numbered lanes with their odd-numbered neighbor. + template <class D, HWY_IF_T_SIZE_D(D, 8)> + HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const { + const Base* base = static_cast<const Base*>(this); + Vec<D> swapped = base->ReverseKeys2(d, v); + // Further to the above optimization, Sort2+OddEvenKeys compile to four + // instructions; we can save one by combining two blends. +#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3 + const Vec<D> cmp = VecFromMask(d, base->Compare(d, v, swapped)); + return IfVecThenElse(DupOdd(cmp), swapped, v); +#else + Sort2(d, v, swapped); + return base->OddEvenKeys(swapped, v); +#endif + } + + // (See above - we use Sort2 for non-64-bit types.) + template <class D, HWY_IF_NOT_T_SIZE_D(D, 8)> + HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const { + const Base* base = static_cast<const Base*>(this); + Vec<D> swapped = base->ReverseKeys2(d, v); + Sort2(d, v, swapped); + return base->OddEvenKeys(swapped, v); + } + + // Swaps with the vector formed by reversing contiguous groups of 4 keys. 
+ template <class D> + HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const { + const Base* base = static_cast<const Base*>(this); + Vec<D> swapped = base->ReverseKeys4(d, v); + Sort2(d, v, swapped); + return base->OddEvenPairs(d, swapped, v); + } + + // Conditionally swaps lane 0 with 4, 1 with 5 etc. + template <class D> + HWY_INLINE Vec<D> SortPairsDistance4(D d, Vec<D> v) const { + const Base* base = static_cast<const Base*>(this); + Vec<D> swapped = base->SwapAdjacentQuads(d, v); + // Only used in Merge16, so this will not be used on AVX2 (which only has 4 + // u64 lanes), so skip the above optimization for 64-bit AVX2. + Sort2(d, v, swapped); + return base->OddEvenQuads(d, swapped, v); + } +}; + +#else + +template <typename T> +struct OrderAscending : public KeyLaneBase<T> { + using Order = SortAscending; + + HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; } + + template <class D> + HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) { + return Lt(a, b); + } +}; + +template <typename T> +struct OrderDescending : public KeyLaneBase<T> { + using Order = SortDescending; + + HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; } + + template <class D> + HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) { + return Lt(b, a); + } +}; + +template <class Order> +struct TraitsLane : public Order { + // For HeapSort + template <typename T> // MSVC doesn't find typename Order::LaneType. 
+ HWY_INLINE void Swap(T* a, T* b) const { + const T temp = *a; + *a = *b; + *b = temp; + } + + template <class D> + HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const { + return Set(d, *key); + } +}; + +#endif // VQSORT_ENABLED + +} // namespace detail +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE diff --git a/third_party/highway/hwy/contrib/sort/traits128-inl.h b/third_party/highway/hwy/contrib/sort/traits128-inl.h new file mode 100644 index 0000000000..ba9207c533 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/traits128-inl.h @@ -0,0 +1,529 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#endif

#include <string>

#include "hwy/contrib/sort/shared-inl.h"
#include "hwy/contrib/sort/vqsort.h"  // SortDescending
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

#if VQSORT_ENABLED || HWY_IDE

// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
//
// KeyAny128 holds the order-independent operations for keys made of two
// consecutive uint64_t lanes. Key-level permutations forward to Highway's
// block-level ops (ReverseBlocks, SwapAdjacentBlocks, OddEvenBlocks).
struct KeyAny128 {
  static constexpr bool Is128() { return true; }
  // Number of lanes that make up one key.
  constexpr size_t LanesPerKey() const { return 2; }

  // What type bench_sort should allocate for generating inputs.
  using LaneType = uint64_t;
  // KeyType and KeyString are defined by derived classes.

  // Exchanges the two-lane keys starting at `a` and `b` using unaligned
  // two-lane loads and stores.
  HWY_INLINE void Swap(LaneType* a, LaneType* b) const {
    const FixedTag<LaneType, 2> d;
    const auto temp = LoadU(d, a);
    StoreU(LoadU(d, b), d, a);
    StoreU(temp, d, b);
  }

  // Forwards to CompressBlocksNot(keys, mask).
  template <class V, class M>
  HWY_INLINE V CompressKeys(V keys, M mask) const {
    return CompressBlocksNot(keys, mask);
  }

  // Fills the vector from the key at `key` via LoadDup128.
  template <class D>
  HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
    return LoadDup128(d, key);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
    return ReverseBlocks(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys2(D /* tag */, const Vec<D> v) const {
    return SwapAdjacentBlocks(v);
  }

  // Only called for 4 keys because we do not support >512-bit vectors.
  // The debug assertion checks that the vector is at most 64 bytes.
  template <class D>
  HWY_INLINE Vec<D> ReverseKeys4(D d, const Vec<D> v) const {
    HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
    return ReverseKeys(d, v);
  }

  // Only called for 4 keys because we do not support >512-bit vectors.
  template <class D>
  HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
                                 const Vec<D> even) const {
    HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
    return ConcatUpperLower(d, odd, even);
  }

  template <class V>
  HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
    return OddEvenBlocks(odd, even);
  }

  // The remaining members exist only so that generic sorting-network code
  // compiles; each one fails HWY_ASSERT(0) if it is ever reached.
  template <class D>
  HWY_INLINE Vec<D> ReverseKeys8(D, Vec<D>) const {
    HWY_ASSERT(0);  // not supported: would require 1024-bit vectors
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys16(D, Vec<D>) const {
    HWY_ASSERT(0);  // not supported: would require 2048-bit vectors
  }

  // This is only called for 8/16 col networks (not supported).
  template <class D>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D, Vec<D>) const {
    HWY_ASSERT(0);
  }

  // This is only called for 16 col networks (not supported).
  template <class D>
  HWY_INLINE Vec<D> SwapAdjacentQuads(D, Vec<D>) const {
    HWY_ASSERT(0);
  }

  // This is only called for 8 col networks (not supported).
  template <class D>
  HWY_INLINE Vec<D> OddEvenQuads(D, Vec<D>, Vec<D>) const {
    HWY_ASSERT(0);
  }
};

// Base class shared between OrderAscending128, OrderDescending128.
struct Key128 : public KeyAny128 {
  // False indicates the entire key should be compared. KV means key-value.
  static constexpr bool IsKV() { return false; }

  // What type to pass to VQSort.
  using KeyType = hwy::uint128_t;

  // Short name of the key type.
  const char* KeyString() const { return "U128"; }

  // Equality / inequality over all 128 bits of each key.
  template <class D>
  HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
    return Eq128(d, a, b);
  }

  template <class D>
  HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
    return Ne128(d, a, b);
  }

  // For keys=entire 128 bits, any difference counts.
+ template <class D> + HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const { + // Must avoid floating-point comparisons (for -0) + const RebindToUnsigned<D> du; + return AllTrue(du, Eq(BitCast(du, diff), Zero(du))); + } + + HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const { + return a[0] == b[0] && a[1] == b[1]; + } + + // Returns vector with only the top half of each block valid. This allows + // fusing the "replicate upper to lower half" step with a subsequent permute. + template <class Order, class D> + HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const { + const Mask<D> eqHL = Eq(a, b); + const Vec<D> ltHL = VecFromMask(d, Order().CompareLanes(a, b)); +#if HWY_TARGET <= HWY_AVX2 // slightly faster + const Vec<D> ltLX = ShiftLeftLanes<1>(ltHL); + return OrAnd(ltHL, VecFromMask(d, eqHL), ltLX); +#else + return IfThenElse(eqHL, DupEven(ltHL), ltHL); +#endif + } +}; + +// Anything order-related depends on the key traits *and* the order (see +// FirstOfLanes). We cannot implement just one Compare function because Lt128 +// only compiles if the lane type is u64. Thus we need either overloaded +// functions with a tag type, class specializations, or separate classes. +// We avoid overloaded functions because we want all functions to be callable +// from a SortTraits without per-function wrappers. Specializing would work, but +// we are anyway going to specialize at a higher level. +struct OrderAscending128 : public Key128 { + using Order = SortAscending; + + HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) { + return (a[1] == b[1]) ? 
a[0] < b[0] : a[1] < b[1]; + } + + template <class D> + HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const { + return Lt128(d, a, b); + } + + // Used by CompareTop + template <class V> + HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const { + return Lt(a, b); + } + + template <class D> + HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const { + return Min128(d, a, b); + } + + template <class D> + HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const { + return Max128(d, a, b); + } + + // Same as for regular lanes because 128-bit keys are u64. + template <class D> + HWY_INLINE Vec<D> FirstValue(D d) const { + return Set(d, hwy::LowestValue<TFromD<D> >()); + } + + template <class D> + HWY_INLINE Vec<D> LastValue(D d) const { + return Set(d, hwy::HighestValue<TFromD<D> >()); + } + + template <class D> + HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const { + const Vec<D> k0 = Zero(d); + const Vec<D> k1 = OddEven(k0, Set(d, uint64_t{1})); + const Mask<D> borrow = Eq(v, k0); // don't-care, lo == 0 + // lo == 0? 1 : 0, 0 + const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(borrow, k1)); + return Sub(Sub(v, k1), adjust); + } +}; + +struct OrderDescending128 : public Key128 { + using Order = SortDescending; + + HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) { + return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1]; + } + + template <class D> + HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const { + return Lt128(d, b, a); + } + + // Used by CompareTop + template <class V> + HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const { + return Lt(b, a); + } + + template <class D> + HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const { + return Max128(d, a, b); + } + + template <class D> + HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const { + return Min128(d, a, b); + } + + // Same as for regular lanes because 128-bit keys are u64. 
+ template <class D> + HWY_INLINE Vec<D> FirstValue(D d) const { + return Set(d, hwy::HighestValue<TFromD<D> >()); + } + + template <class D> + HWY_INLINE Vec<D> LastValue(D d) const { + return Set(d, hwy::LowestValue<TFromD<D> >()); + } + + template <class D> + HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const { + const Vec<D> k1 = OddEven(Zero(d), Set(d, uint64_t{1})); + const Vec<D> added = Add(v, k1); + const Mask<D> overflowed = Lt(added, v); // false, overflowed + // overflowed? 1 : 0, 0 + const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(overflowed, k1)); + return Add(added, adjust); + } +}; + +// Base class shared between OrderAscendingKV128, OrderDescendingKV128. +struct KeyValue128 : public KeyAny128 { + // True indicates only part of the key (the more significant lane) should be + // compared. KV stands for key-value. + static constexpr bool IsKV() { return true; } + + // What type to pass to VQSort. + using KeyType = K64V64; + + const char* KeyString() const { return "KV128"; } + + template <class D> + HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const { + return Eq128Upper(d, a, b); + } + + template <class D> + HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const { + return Ne128Upper(d, a, b); + } + + // Only count differences in the actual key, not the value. + template <class D> + HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const { + // Must avoid floating-point comparisons (for -0) + const RebindToUnsigned<D> du; + const Vec<decltype(du)> zero = Zero(du); + const Vec<decltype(du)> keys = OddEven(diff, zero); // clear values + return AllTrue(du, Eq(BitCast(du, keys), zero)); + } + + HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const { + return a[1] == b[1]; + } + + // Returns vector with only the top half of each block valid. This allows + // fusing the "replicate upper to lower half" step with a subsequent permute. 
+ template <class Order, class D> + HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const { + // Only the upper lane of each block is a key, and only that lane is + // required to be valid, so comparing all lanes is sufficient. + return VecFromMask(d, Order().CompareLanes(a, b)); + } +}; + +struct OrderAscendingKV128 : public KeyValue128 { + using Order = SortAscending; + + HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) { + return a[1] < b[1]; + } + + template <class D> + HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const { + return Lt128Upper(d, a, b); + } + + // Used by CompareTop + template <class V> + HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const { + return Lt(a, b); + } + + template <class D> + HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const { + return Min128Upper(d, a, b); + } + + template <class D> + HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const { + return Max128Upper(d, a, b); + } + + // Same as for regular lanes because 128-bit keys are u64. 
+ template <class D> + HWY_INLINE Vec<D> FirstValue(D d) const { + return Set(d, hwy::LowestValue<TFromD<D> >()); + } + + template <class D> + HWY_INLINE Vec<D> LastValue(D d) const { + return Set(d, hwy::HighestValue<TFromD<D> >()); + } + + template <class D> + HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const { + const Vec<D> k1 = OddEven(Set(d, uint64_t{1}), Zero(d)); + return Sub(v, k1); + } +}; + +struct OrderDescendingKV128 : public KeyValue128 { + using Order = SortDescending; + + HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) { + return b[1] < a[1]; + } + + template <class D> + HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const { + return Lt128Upper(d, b, a); + } + + // Used by CompareTop + template <class V> + HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const { + return Lt(b, a); + } + + template <class D> + HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const { + return Max128Upper(d, a, b); + } + + template <class D> + HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const { + return Min128Upper(d, a, b); + } + + // Same as for regular lanes because 128-bit keys are u64. + template <class D> + HWY_INLINE Vec<D> FirstValue(D d) const { + return Set(d, hwy::HighestValue<TFromD<D> >()); + } + + template <class D> + HWY_INLINE Vec<D> LastValue(D d) const { + return Set(d, hwy::LowestValue<TFromD<D> >()); + } + + template <class D> + HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const { + const Vec<D> k1 = OddEven(Set(d, uint64_t{1}), Zero(d)); + return Add(v, k1); + } +}; + +// We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask in +// the most-significant of those lanes (the result of CompareTop), so +// replicate it 4x. Only called for >= 256-bit vectors. 
+ +#if HWY_TARGET <= HWY_AVX3 +template <class V, HWY_IF_V_SIZE_V(V, 64)> +HWY_INLINE V ReplicateTop4x(V v) { + return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))}; +} +#endif // HWY_TARGET <= HWY_AVX3 + +#if HWY_TARGET <= HWY_AVX2 + +template <class V, HWY_IF_V_SIZE_V(V, 32)> +HWY_INLINE V ReplicateTop4x(V v) { + return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))}; +} + +#else // HWY_TARGET > HWY_AVX2 + +template <class V> +HWY_INLINE V ReplicateTop4x(V v) { +#if HWY_TARGET == HWY_SVE_256 + return svdup_lane_u64(v, 3); +#else + alignas(64) static constexpr uint64_t kIndices[8] = {3, 3, 3, 3, + 7, 7, 7, 7}; + const ScalableTag<uint64_t> d; + return TableLookupLanes(v, SetTableIndices(d, kIndices)); +#endif +} + +#endif // HWY_TARGET <= HWY_AVX2 + +// Shared code that depends on Order. +template <class Base> +struct Traits128 : public Base { + template <class D> + HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v, + TFromD<D>* HWY_RESTRICT buf) const { + const Base* base = static_cast<const Base*>(this); + const size_t N = Lanes(d); + Store(v, d, buf); + v = base->SetKey(d, buf + 0); // result must be broadcasted + for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) { + v = base->First(d, v, base->SetKey(d, buf + i)); + } + return v; + } + + template <class D> + HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v, + TFromD<D>* HWY_RESTRICT buf) const { + const Base* base = static_cast<const Base*>(this); + const size_t N = Lanes(d); + Store(v, d, buf); + v = base->SetKey(d, buf + 0); // result must be broadcasted + for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) { + v = base->Last(d, v, base->SetKey(d, buf + i)); + } + return v; + } + + template <class D> + HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const { + const Base* base = static_cast<const Base*>(this); + + const Vec<D> a_copy = a; + const auto lt = base->Compare(d, a, b); + a = IfThenElse(lt, a, b); + b = IfThenElse(lt, b, a_copy); + } + + // 
Conditionally swaps even-numbered keys with their odd-numbered neighbor. + template <class D> + HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const { + const Base* base = static_cast<const Base*>(this); + Vec<D> swapped = base->ReverseKeys2(d, v); + const Vec<D> cmpHx = base->template CompareTop<Base>(d, v, swapped); + return IfVecThenElse(ReplicateTop4x(cmpHx), swapped, v); + } + + // Swaps with the vector formed by reversing contiguous groups of four 128-bit + // keys, which implies 512-bit vectors (we do not support more than that). + template <class D> + HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const { + const Base* base = static_cast<const Base*>(this); + Vec<D> swapped = base->ReverseKeys4(d, v); + + const Vec<D> cmpHx = base->template CompareTop<Base>(d, v, swapped); + // Similar to ReplicateTop4x, we want to gang together 2 comparison results + // (4 lanes). They are not contiguous, so use permute to replicate 4x. + alignas(64) uint64_t kIndices[8] = {7, 7, 5, 5, 5, 5, 7, 7}; + const Vec<D> select = TableLookupLanes(cmpHx, SetTableIndices(d, kIndices)); + return IfVecThenElse(select, swapped, v); + } + + // Conditionally swaps lane 0 with 4, 1 with 5 etc. + template <class D> + HWY_INLINE Vec<D> SortPairsDistance4(D, Vec<D>) const { + // Only used by Merge16, which would require 2048 bit vectors (unsupported). 
+ HWY_ASSERT(0); + } +}; + +#endif // VQSORT_ENABLED + +} // namespace detail +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE diff --git a/third_party/highway/hwy/contrib/sort/vqsort-inl.h b/third_party/highway/hwy/contrib/sort/vqsort-inl.h new file mode 100644 index 0000000000..cf827baee5 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort-inl.h @@ -0,0 +1,1724 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Normal include guard for target-independent parts +#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_ +#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_ + +#include <stdio.h> // unconditional #include so we can use if(VQSORT_PRINT). 
+#include <string.h>  // memcpy
+
+#include "hwy/base.h"
+#include "hwy/cache_control.h"        // Prefetch
+#include "hwy/contrib/sort/vqsort.h"  // Fill24Bytes
+
+#ifndef VQSORT_PRINT
+#define VQSORT_PRINT 0
+#endif
+
+#endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
+#endif
+
+#if VQSORT_PRINT
+#include "hwy/print-inl.h"
+#endif
+
+#include "hwy/contrib/sort/shared-inl.h"
+#include "hwy/contrib/sort/sorting_networks-inl.h"
+// Placeholder for internal instrumentation. Do not remove.
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+namespace detail {
+
+using Constants = hwy::SortConstants;
+
+// Wrapper avoids #if in user code (interferes with code folding)
+// Prints `v` via Print() when VQSORT_PRINT >= 2; otherwise does nothing.
+template <class D>
+HWY_INLINE void MaybePrintVector(D d, const char* label, Vec<D> v,
+                                 size_t start = 0, size_t max_lanes = 16) {
+#if VQSORT_PRINT >= 2  // print-inl.h is only included #if VQSORT_PRINT
+  Print(d, label, v, start, max_lanes);
+#else
+  (void)d;
+  (void)label;
+  (void)v;
+  (void)start;
+  (void)max_lanes;
+#endif
+}
+
+// ------------------------------ HeapSort
+
+// Sifts the key at lane index `start` down the heap stored in
+// lanes[0, num_lanes). All indices are in units of lanes; each key occupies
+// N1 = st.LanesPerKey() lanes, so the children of the key at `start` are at
+// 2 * start + N1 and 2 * start + 2 * N1.
+template <class Traits, typename T>
+void SiftDown(Traits st, T* HWY_RESTRICT lanes, const size_t num_lanes,
+              size_t start) {
+  constexpr size_t N1 = st.LanesPerKey();
+  const FixedTag<T, N1> d;
+
+  while (start < num_lanes) {
+    const size_t left = 2 * start + N1;
+    const size_t right = 2 * start + 2 * N1;
+    if (left >= num_lanes) break;  // no children
+    // Index of whichever of {parent, left, right} comes last per st.Compare.
+    size_t idx_larger = start;
+    const auto key_j = st.SetKey(d, lanes + start);
+    if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, lanes + left)))) {
+      idx_larger = left;
+    }
+    if (right < num_lanes &&
+        AllTrue(d, st.Compare(d, st.SetKey(d, lanes + idx_larger),
+                              st.SetKey(d, lanes + right)))) {
+      idx_larger = right;
+    }
+    if (idx_larger == start) break;  // heap property already holds
+    st.Swap(lanes + start, lanes + idx_larger);
+    start = idx_larger;
+  }
+}
+
+// Heapsort: O(1) space, O(N*logN) worst-case comparisons.
+// Based on LLVM sanitizer_common.h, licensed under Apache-2.0.
+template <class Traits, typename T>
+void HeapSort(Traits st, T* HWY_RESTRICT lanes, const size_t num_lanes) {
+  constexpr size_t N1 = st.LanesPerKey();
+
+  // Fewer than two keys: nothing to do.
+  if (num_lanes < 2 * N1) return;
+
+  // Build heap.
+  // `i` is unsigned and counts down in steps of N1. `~N1 + 1` is the value
+  // that 0 - N1 wraps to, so the loop still runs for i == 0 and stops after.
+  for (size_t i = ((num_lanes - N1) / N1 / 2) * N1; i != (~N1 + 1); i -= N1) {
+    SiftDown(st, lanes, num_lanes, i);
+  }
+
+  for (size_t i = num_lanes - N1; i != 0; i -= N1) {
+    // Swap root with last
+    st.Swap(lanes + 0, lanes + i);
+
+    // Sift down the new root.
+    SiftDown(st, lanes, i, 0);
+  }
+}
+
+#if VQSORT_ENABLED || HWY_IDE
+
+// ------------------------------ BaseCase
+
+// Special cases where `num_lanes` is in the specified range (inclusive).
+// Sorts exactly two keys in place; `buf` is unused.
+template <class Traits, typename T>
+HWY_INLINE void Sort2To2(Traits st, T* HWY_RESTRICT keys, size_t num_lanes,
+                         T* HWY_RESTRICT /* buf */) {
+  constexpr size_t kLPK = st.LanesPerKey();
+  const size_t num_keys = num_lanes / kLPK;
+  HWY_DASSERT(num_keys == 2);
+  HWY_ASSUME(num_keys == 2);
+
+  // One key per vector, required to avoid reading past the end of `keys`.
+  const CappedTag<T, kLPK> d;
+  using V = Vec<decltype(d)>;
+
+  V v0 = LoadU(d, keys + 0x0 * kLPK);
+  V v1 = LoadU(d, keys + 0x1 * kLPK);
+
+  Sort2(d, st, v0, v1);
+
+  StoreU(v0, d, keys + 0x0 * kLPK);
+  StoreU(v1, d, keys + 0x1 * kLPK);
+}
+
+// Sorts three or four keys in place. With three keys, the fourth network
+// input is padding held in `buf` (one key is written there).
+template <class Traits, typename T>
+HWY_INLINE void Sort3To4(Traits st, T* HWY_RESTRICT keys, size_t num_lanes,
+                         T* HWY_RESTRICT buf) {
+  constexpr size_t kLPK = st.LanesPerKey();
+  const size_t num_keys = num_lanes / kLPK;
+  HWY_DASSERT(3 <= num_keys && num_keys <= 4);
+  HWY_ASSUME(num_keys >= 3);
+  HWY_ASSUME(num_keys <= 4);  // reduces branches
+
+  // One key per vector, required to avoid reading past the end of `keys`.
+  const CappedTag<T, kLPK> d;
+  using V = Vec<decltype(d)>;
+
+  // If num_keys == 3, initialize padding for the last sorting network element
+  // so that it does not influence the other elements.
+  Store(st.LastValue(d), d, buf);
+
+  // Points to a valid key, or padding. This avoids special-casing
+  // HWY_MEM_OPS_MIGHT_FAULT because there is only a single key per vector.
+  T* in_out3 = num_keys == 3 ? buf : keys + 0x3 * kLPK;
+
+  V v0 = LoadU(d, keys + 0x0 * kLPK);
+  V v1 = LoadU(d, keys + 0x1 * kLPK);
+  V v2 = LoadU(d, keys + 0x2 * kLPK);
+  V v3 = LoadU(d, in_out3);
+
+  Sort4(d, st, v0, v1, v2, v3);
+
+  StoreU(v0, d, keys + 0x0 * kLPK);
+  StoreU(v1, d, keys + 0x1 * kLPK);
+  StoreU(v2, d, keys + 0x2 * kLPK);
+  StoreU(v3, d, in_out3);
+}
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+
+// Prepares `buf` so that the second half of the rows can be loaded from it
+// with full vectors: lanes from the end of `keys` are copied to the same
+// offsets in `buf`, and lanes past `num_lanes` hold st.LastValue padding.
+template <size_t kRows, size_t kLanesPerRow, class D, class Traits,
+          typename T = TFromD<D>>
+HWY_INLINE void CopyHalfToPaddedBuf(D d, Traits st, T* HWY_RESTRICT keys,
+                                    size_t num_lanes, T* HWY_RESTRICT buf) {
+  constexpr size_t kMinLanes = kRows / 2 * kLanesPerRow;
+  // Must cap for correctness: we will load up to the last valid lane, so
+  // Lanes(dmax) must not exceed `num_lanes` (known to be at least kMinLanes).
+  const CappedTag<T, kMinLanes> dmax;
+  const size_t Nmax = Lanes(dmax);
+  HWY_DASSERT(Nmax < num_lanes);
+  HWY_ASSUME(Nmax <= kMinLanes);
+
+  // Fill with padding - last in sort order, not copied to keys.
+  const Vec<decltype(dmax)> kPadding = st.LastValue(dmax);
+
+  // Rounding down allows aligned stores, which are typically faster.
+  size_t i = num_lanes & ~(Nmax - 1);
+  HWY_ASSUME(i != 0);  // because Nmax <= num_lanes; avoids branch
+  do {
+    Store(kPadding, dmax, buf + i);
+    i += Nmax;
+    // Initialize enough for the last vector even if Nmax > kLanesPerRow.
+  } while (i < (kRows - 1) * kLanesPerRow + Lanes(d));
+
+  // Ensure buf contains all we will read, and perhaps more before.
+  // Copies backwards from the end of `keys`, one dmax vector at a time, until
+  // the start of the second half of the rows has been covered.
+  ptrdiff_t end = static_cast<ptrdiff_t>(num_lanes);
+  do {
+    end -= static_cast<ptrdiff_t>(Nmax);
+    StoreU(LoadU(dmax, keys + end), dmax, buf + end);
+  } while (end > static_cast<ptrdiff_t>(kRows / 2 * kLanesPerRow));
+}
+
+#endif  // HWY_MEM_OPS_MIGHT_FAULT
+
+// Sorts `num_lanes` lanes of `keys` in place using an 8-row sorting network
+// with kKeysPerRow keys per row. Requires more than half of the 8 rows to be
+// occupied (see the HWY_DASSERT); missing lanes are replaced by padding.
+template <size_t kKeysPerRow, class Traits, typename T>
+HWY_NOINLINE void Sort8Rows(Traits st, T* HWY_RESTRICT keys, size_t num_lanes,
+                            T* HWY_RESTRICT buf) {
+  // kKeysPerRow <= 4 because 8 64-bit keys implies 512-bit vectors, which
+  // are likely slower than 16x4, so 8x4 is the largest we handle here.
+  static_assert(kKeysPerRow <= 4, "");
+
+  constexpr size_t kLPK = st.LanesPerKey();
+
+  // We reshape the 1D keys into kRows x kKeysPerRow.
+  constexpr size_t kRows = 8;
+  constexpr size_t kLanesPerRow = kKeysPerRow * kLPK;
+  constexpr size_t kMinLanes = kRows / 2 * kLanesPerRow;
+  HWY_DASSERT(kMinLanes < num_lanes && num_lanes <= kRows * kLanesPerRow);
+
+  const CappedTag<T, kLanesPerRow> d;
+  using V = Vec<decltype(d)>;
+  V v4, v5, v6, v7;
+
+  // At least half the kRows are valid, otherwise a different function would
+  // have been called to handle this num_lanes.
+  V v0 = LoadU(d, keys + 0x0 * kLanesPerRow);
+  V v1 = LoadU(d, keys + 0x1 * kLanesPerRow);
+  V v2 = LoadU(d, keys + 0x2 * kLanesPerRow);
+  V v3 = LoadU(d, keys + 0x3 * kLanesPerRow);
+#if HWY_MEM_OPS_MIGHT_FAULT
+  // Masked loads might fault, so read the second half via the padded `buf`.
+  CopyHalfToPaddedBuf<kRows, kLanesPerRow>(d, st, keys, num_lanes, buf);
+  v4 = LoadU(d, buf + 0x4 * kLanesPerRow);
+  v5 = LoadU(d, buf + 0x5 * kLanesPerRow);
+  v6 = LoadU(d, buf + 0x6 * kLanesPerRow);
+  v7 = LoadU(d, buf + 0x7 * kLanesPerRow);
+#endif  // HWY_MEM_OPS_MIGHT_FAULT
+#if !HWY_MEM_OPS_MIGHT_FAULT || HWY_IDE
+  (void)buf;
+  const V vnum_lanes = Set(d, static_cast<T>(num_lanes));
+  // First offset where not all vectors are guaranteed valid.
+  const V kIota = Iota(d, static_cast<T>(kMinLanes));
+  const V k1 = Set(d, static_cast<T>(kLanesPerRow));
+  const V k2 = Add(k1, k1);
+
+  // mN is true for the lanes of row N whose offset is below num_lanes.
+  using M = Mask<decltype(d)>;
+  const M m4 = Gt(vnum_lanes, kIota);
+  const M m5 = Gt(vnum_lanes, Add(kIota, k1));
+  const M m6 = Gt(vnum_lanes, Add(kIota, k2));
+  const M m7 = Gt(vnum_lanes, Add(kIota, Add(k2, k1)));
+
+  const V kPadding = st.LastValue(d);  // Not copied to keys.
+  v4 = MaskedLoadOr(kPadding, m4, d, keys + 0x4 * kLanesPerRow);
+  v5 = MaskedLoadOr(kPadding, m5, d, keys + 0x5 * kLanesPerRow);
+  v6 = MaskedLoadOr(kPadding, m6, d, keys + 0x6 * kLanesPerRow);
+  v7 = MaskedLoadOr(kPadding, m7, d, keys + 0x7 * kLanesPerRow);
+#endif  // !HWY_MEM_OPS_MIGHT_FAULT
+
+  Sort8(d, st, v0, v1, v2, v3, v4, v5, v6, v7);
+
+  // Merge8x2 is a no-op if kKeysPerRow < 2 etc.
+  Merge8x2<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7);
+  Merge8x4<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7);
+
+  StoreU(v0, d, keys + 0x0 * kLanesPerRow);
+  StoreU(v1, d, keys + 0x1 * kLanesPerRow);
+  StoreU(v2, d, keys + 0x2 * kLanesPerRow);
+  StoreU(v3, d, keys + 0x3 * kLanesPerRow);
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+  // Store remaining vectors into buf and safely copy them into keys.
+  StoreU(v4, d, buf + 0x4 * kLanesPerRow);
+  StoreU(v5, d, buf + 0x5 * kLanesPerRow);
+  StoreU(v6, d, buf + 0x6 * kLanesPerRow);
+  StoreU(v7, d, buf + 0x7 * kLanesPerRow);
+
+  const ScalableTag<T> dmax;
+  const size_t Nmax = Lanes(dmax);
+
+  // The first half of vectors have already been stored unconditionally into
+  // `keys`, so we do not copy them.
+  size_t i = kMinLanes;
+  HWY_UNROLL(1)
+  for (; i + Nmax <= num_lanes; i += Nmax) {
+    StoreU(LoadU(dmax, buf + i), dmax, keys + i);
+  }
+
+  // Last iteration: copy partial vector
+  const size_t remaining = num_lanes - i;
+  HWY_ASSUME(remaining < 256);  // helps FirstN
+  SafeCopyN(remaining, dmax, buf + i, keys + i);
+#endif  // HWY_MEM_OPS_MIGHT_FAULT
+#if !HWY_MEM_OPS_MIGHT_FAULT || HWY_IDE
+  // Only lanes selected by the masks are written, so padding never reaches
+  // `keys`.
+  BlendedStore(v4, m4, d, keys + 0x4 * kLanesPerRow);
+  BlendedStore(v5, m5, d, keys + 0x5 * kLanesPerRow);
+  BlendedStore(v6, m6, d, keys + 0x6 * kLanesPerRow);
+  BlendedStore(v7, m7, d, keys + 0x7 * kLanesPerRow);
+#endif  // !HWY_MEM_OPS_MIGHT_FAULT
+}
+
+// Sorts `num_lanes` lanes of `keys` in place using a 16-row sorting network
+// with kKeysPerRow keys per row. Same structure as Sort8Rows: the first eight
+// rows are loaded directly, the remaining eight via padding or masks.
+template <size_t kKeysPerRow, class Traits, typename T>
+HWY_NOINLINE void Sort16Rows(Traits st, T* HWY_RESTRICT keys, size_t num_lanes,
+                             T* HWY_RESTRICT buf) {
+  static_assert(kKeysPerRow <= SortConstants::kMaxCols, "");
+
+  constexpr size_t kLPK = st.LanesPerKey();
+
+  // We reshape the 1D keys into kRows x kKeysPerRow.
+  constexpr size_t kRows = 16;
+  constexpr size_t kLanesPerRow = kKeysPerRow * kLPK;
+  constexpr size_t kMinLanes = kRows / 2 * kLanesPerRow;
+  HWY_DASSERT(kMinLanes < num_lanes && num_lanes <= kRows * kLanesPerRow);
+
+  const CappedTag<T, kLanesPerRow> d;
+  using V = Vec<decltype(d)>;
+  V v8, v9, va, vb, vc, vd, ve, vf;
+
+  // At least half the kRows are valid, otherwise a different function would
+  // have been called to handle this num_lanes.
+  V v0 = LoadU(d, keys + 0x0 * kLanesPerRow);
+  V v1 = LoadU(d, keys + 0x1 * kLanesPerRow);
+  V v2 = LoadU(d, keys + 0x2 * kLanesPerRow);
+  V v3 = LoadU(d, keys + 0x3 * kLanesPerRow);
+  V v4 = LoadU(d, keys + 0x4 * kLanesPerRow);
+  V v5 = LoadU(d, keys + 0x5 * kLanesPerRow);
+  V v6 = LoadU(d, keys + 0x6 * kLanesPerRow);
+  V v7 = LoadU(d, keys + 0x7 * kLanesPerRow);
+#if HWY_MEM_OPS_MIGHT_FAULT
+  // Masked loads might fault, so read the second half via the padded `buf`.
+  CopyHalfToPaddedBuf<kRows, kLanesPerRow>(d, st, keys, num_lanes, buf);
+  v8 = LoadU(d, buf + 0x8 * kLanesPerRow);
+  v9 = LoadU(d, buf + 0x9 * kLanesPerRow);
+  va = LoadU(d, buf + 0xa * kLanesPerRow);
+  vb = LoadU(d, buf + 0xb * kLanesPerRow);
+  vc = LoadU(d, buf + 0xc * kLanesPerRow);
+  vd = LoadU(d, buf + 0xd * kLanesPerRow);
+  ve = LoadU(d, buf + 0xe * kLanesPerRow);
+  vf = LoadU(d, buf + 0xf * kLanesPerRow);
+#endif  // HWY_MEM_OPS_MIGHT_FAULT
+#if !HWY_MEM_OPS_MIGHT_FAULT || HWY_IDE
+  (void)buf;
+  const V vnum_lanes = Set(d, static_cast<T>(num_lanes));
+  // First offset where not all vectors are guaranteed valid.
+  const V kIota = Iota(d, static_cast<T>(kMinLanes));
+  const V k1 = Set(d, static_cast<T>(kLanesPerRow));
+  const V k2 = Add(k1, k1);
+  const V k4 = Add(k2, k2);
+  const V k8 = Add(k4, k4);
+
+  // mN is true for the lanes of row N whose offset is below num_lanes. The
+  // row offsets 3 and 7 are formed as k4 - k1 and k8 - k1.
+  using M = Mask<decltype(d)>;
+  const M m8 = Gt(vnum_lanes, kIota);
+  const M m9 = Gt(vnum_lanes, Add(kIota, k1));
+  const M ma = Gt(vnum_lanes, Add(kIota, k2));
+  const M mb = Gt(vnum_lanes, Add(kIota, Sub(k4, k1)));
+  const M mc = Gt(vnum_lanes, Add(kIota, k4));
+  const M md = Gt(vnum_lanes, Add(kIota, Add(k4, k1)));
+  const M me = Gt(vnum_lanes, Add(kIota, Add(k4, k2)));
+  const M mf = Gt(vnum_lanes, Add(kIota, Sub(k8, k1)));
+
+  const V kPadding = st.LastValue(d);  // Not copied to keys.
+  v8 = MaskedLoadOr(kPadding, m8, d, keys + 0x8 * kLanesPerRow);
+  v9 = MaskedLoadOr(kPadding, m9, d, keys + 0x9 * kLanesPerRow);
+  va = MaskedLoadOr(kPadding, ma, d, keys + 0xa * kLanesPerRow);
+  vb = MaskedLoadOr(kPadding, mb, d, keys + 0xb * kLanesPerRow);
+  vc = MaskedLoadOr(kPadding, mc, d, keys + 0xc * kLanesPerRow);
+  vd = MaskedLoadOr(kPadding, md, d, keys + 0xd * kLanesPerRow);
+  ve = MaskedLoadOr(kPadding, me, d, keys + 0xe * kLanesPerRow);
+  vf = MaskedLoadOr(kPadding, mf, d, keys + 0xf * kLanesPerRow);
+#endif  // !HWY_MEM_OPS_MIGHT_FAULT
+
+  Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, vf);
+
+  // Merge16x4 is a no-op if kKeysPerRow < 4 etc.
+  Merge16x2<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb,
+                         vc, vd, ve, vf);
+  Merge16x4<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb,
+                         vc, vd, ve, vf);
+  Merge16x8<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb,
+                         vc, vd, ve, vf);
+#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
+  Merge16x16<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb,
+                          vc, vd, ve, vf);
+#endif
+
+  StoreU(v0, d, keys + 0x0 * kLanesPerRow);
+  StoreU(v1, d, keys + 0x1 * kLanesPerRow);
+  StoreU(v2, d, keys + 0x2 * kLanesPerRow);
+  StoreU(v3, d, keys + 0x3 * kLanesPerRow);
+  StoreU(v4, d, keys + 0x4 * kLanesPerRow);
+  StoreU(v5, d, keys + 0x5 * kLanesPerRow);
+  StoreU(v6, d, keys + 0x6 * kLanesPerRow);
+  StoreU(v7, d, keys + 0x7 * kLanesPerRow);
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+  // Store remaining vectors into buf and safely copy them into keys.
+  StoreU(v8, d, buf + 0x8 * kLanesPerRow);
+  StoreU(v9, d, buf + 0x9 * kLanesPerRow);
+  StoreU(va, d, buf + 0xa * kLanesPerRow);
+  StoreU(vb, d, buf + 0xb * kLanesPerRow);
+  StoreU(vc, d, buf + 0xc * kLanesPerRow);
+  StoreU(vd, d, buf + 0xd * kLanesPerRow);
+  StoreU(ve, d, buf + 0xe * kLanesPerRow);
+  StoreU(vf, d, buf + 0xf * kLanesPerRow);
+
+  const ScalableTag<T> dmax;
+  const size_t Nmax = Lanes(dmax);
+
+  // The first half of vectors have already been stored unconditionally into
+  // `keys`, so we do not copy them.
+  size_t i = kMinLanes;
+  HWY_UNROLL(1)
+  for (; i + Nmax <= num_lanes; i += Nmax) {
+    StoreU(LoadU(dmax, buf + i), dmax, keys + i);
+  }
+
+  // Last iteration: copy partial vector
+  const size_t remaining = num_lanes - i;
+  HWY_ASSUME(remaining < 256);  // helps FirstN
+  SafeCopyN(remaining, dmax, buf + i, keys + i);
+#endif  // HWY_MEM_OPS_MIGHT_FAULT
+#if !HWY_MEM_OPS_MIGHT_FAULT || HWY_IDE
+  // Only lanes selected by the masks are written, so padding never reaches
+  // `keys`.
+  BlendedStore(v8, m8, d, keys + 0x8 * kLanesPerRow);
+  BlendedStore(v9, m9, d, keys + 0x9 * kLanesPerRow);
+  BlendedStore(va, ma, d, keys + 0xa * kLanesPerRow);
+  BlendedStore(vb, mb, d, keys + 0xb * kLanesPerRow);
+  BlendedStore(vc, mc, d, keys + 0xc * kLanesPerRow);
+  BlendedStore(vd, md, d, keys + 0xd * kLanesPerRow);
+  BlendedStore(ve, me, d, keys + 0xe * kLanesPerRow);
+  BlendedStore(vf, mf, d, keys + 0xf * kLanesPerRow);
+#endif  // !HWY_MEM_OPS_MIGHT_FAULT
+}
+
+// Sorts `keys` within the range [0, num_lanes) via sorting network.
+// Reshapes into a matrix, sorts columns independently, and then merges
+// into a sorted 1D array without transposing.
+//
+// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
+// differences in sort order and single-lane vs 128-bit keys.
+//
+// See M.
Blacher's thesis: https://github.com/mark-blacher/masterthesis +template <class D, class Traits, typename T> +HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, + size_t num_lanes, T* buf) { + constexpr size_t kLPK = st.LanesPerKey(); + HWY_DASSERT(num_lanes <= Constants::BaseCaseNumLanes<kLPK>(Lanes(d))); + const size_t num_keys = num_lanes / kLPK; + + // Can be zero when called through HandleSpecialCases, but also 1 (in which + // case the array is already sorted). Also ensures num_lanes - 1 != 0. + if (HWY_UNLIKELY(num_keys <= 1)) return; + + const size_t ceil_log2 = + 32 - Num0BitsAboveMS1Bit_Nonzero32(static_cast<uint32_t>(num_keys - 1)); + + // Checking kMaxKeysPerVector avoids generating unreachable codepaths. + constexpr size_t kMaxKeysPerVector = MaxLanes(d) / kLPK; + + using FuncPtr = decltype(&Sort2To2<Traits, T>); + const FuncPtr funcs[9] = { + /* <= 1 */ nullptr, // We ensured num_keys > 1. + /* <= 2 */ &Sort2To2<Traits, T>, + /* <= 4 */ &Sort3To4<Traits, T>, + /* <= 8 */ &Sort8Rows<1, Traits, T>, // 1 key per row + /* <= 16 */ kMaxKeysPerVector >= 2 ? &Sort8Rows<2, Traits, T> : nullptr, + /* <= 32 */ kMaxKeysPerVector >= 4 ? &Sort8Rows<4, Traits, T> : nullptr, + /* <= 64 */ kMaxKeysPerVector >= 4 ? &Sort16Rows<4, Traits, T> : nullptr, + /* <= 128 */ kMaxKeysPerVector >= 8 ? &Sort16Rows<8, Traits, T> : nullptr, +#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD + /* <= 256 */ kMaxKeysPerVector >= 16 ? &Sort16Rows<16, Traits, T> : nullptr, +#endif + }; + funcs[ceil_log2](st, keys, num_lanes, buf); +} + +// ------------------------------ Partition + +// Consumes from `keys` until a multiple of kUnroll*N remains. +// Temporarily stores the right side into `buf`, then moves behind `num`. +// Returns the number of keys consumed from the left side. 
+// `num` is in/out: on return it has been reduced by the number of keys that
+// were moved to the right side (i.e. behind the new `num`). `buf` must have
+// room for every key consumed here.
+template <class D, class Traits, class T>
+HWY_INLINE size_t PartitionToMultipleOfUnroll(D d, Traits st,
+                                              T* HWY_RESTRICT keys, size_t& num,
+                                              const Vec<D> pivot,
+                                              T* HWY_RESTRICT buf) {
+  constexpr size_t kUnroll = Constants::kPartitionUnroll;
+  const size_t N = Lanes(d);
+  size_t readL = 0;
+  T* HWY_RESTRICT posL = keys;
+  size_t bufR = 0;
+  // Partition requires both a multiple of kUnroll*N and at least
+  // 2*kUnroll*N for the initial loads. If less, consume all here.
+  const size_t num_rem =
+      (num < 2 * kUnroll * N) ? num : (num & (kUnroll * N - 1));
+  size_t i = 0;
+  for (; i + N <= num_rem; i += N) {
+    const Vec<D> vL = LoadU(d, keys + readL);
+    readL += N;
+
+    // Lanes for which Compare(pivot, lane) is true go right (into buf), the
+    // others are compacted in place at posL.
+    const auto comp = st.Compare(d, pivot, vL);
+    posL += CompressBlendedStore(vL, Not(comp), d, posL);
+    bufR += CompressStore(vL, comp, d, buf + bufR);
+  }
+  // Last iteration: only use valid lanes.
+  if (HWY_LIKELY(i != num_rem)) {
+    const auto mask = FirstN(d, num_rem - i);
+    const Vec<D> vL = LoadU(d, keys + readL);
+
+    const auto comp = st.Compare(d, pivot, vL);
+    posL += CompressBlendedStore(vL, AndNot(comp, mask), d, posL);
+    bufR += CompressStore(vL, And(comp, mask), d, buf + bufR);
+  }
+
+  // MSAN seems not to understand CompressStore. buf[0, bufR) are valid.
+  detail::MaybeUnpoison(buf, bufR);
+
+  // Everything we loaded was put into buf, or behind the current `posL`, after
+  // which there is space for bufR items. First move items from `keys + num` to
+  // `posL` to free up space, then copy `buf` into the vacated `keys + num`.
+  // A loop with masked loads from `buf` is insufficient - we would also need to
+  // mask from `keys + num`. Combining a loop with memcpy for the remainders is
+  // slower than just memcpy, so we use that for simplicity.
+  num -= bufR;
+  // If all keys were consumed above (num < 2*kUnroll*N), `keys + num` equals
+  // `posL`, i.e. source and destination coincide. memcpy is undefined for
+  // overlapping ranges, hence memmove for this first copy.
+  memmove(posL, keys + num, bufR * sizeof(T));
+  memcpy(keys + num, buf, bufR * sizeof(T));
+  return static_cast<size_t>(posL - keys);  // caller will shrink num by this.
+}
+
+// Returns o | (x1 ^ x2).
+template <class V>
+V OrXor(const V o, const V x1, const V x2) {
+  return Or(o, Xor(x1, x2));  // ternlog on AVX3
+}
+
+// Partitions one vector `v` around `pivot` and writes the left part at
+// `keys + writeL` and the right part so that it ends at the previous
+// `keys + remaining + writeL`. Updates `writeL` and `remaining`.
+//
+// Note: we could track the OrXor of v and pivot to see if the entire left
+// partition is equal, but that happens rarely and thus is a net loss.
+template <class D, class Traits, typename T>
+HWY_INLINE void StoreLeftRight(D d, Traits st, const Vec<D> v,
+                               const Vec<D> pivot, T* HWY_RESTRICT keys,
+                               size_t& writeL, size_t& remaining) {
+  const size_t N = Lanes(d);
+
+  const auto comp = st.Compare(d, pivot, v);
+
+  remaining -= N;
+  if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value ||
+      (HWY_MAX_BYTES == 16 && st.Is128())) {
+    // Non-native Compress (e.g. AVX2): we are able to partition a vector using
+    // a single Compress+two StoreU instead of two Compress[Blended]Store. The
+    // latter are more expensive. Because we store entire vectors, the contents
+    // between the updated writeL and writeR are ignored and will be overwritten
+    // by subsequent calls. This works because writeL and writeR are at least
+    // two vectors apart.
+    const auto lr = st.CompressKeys(v, comp);
+    const size_t num_left = N - CountTrue(d, comp);
+    StoreU(lr, d, keys + writeL);
+    // Now write the right-side elements (if any), such that the previous writeR
+    // is one past the end of the newly written right elements, then advance.
+    StoreU(lr, d, keys + remaining + writeL);
+    writeL += num_left;
+  } else {
+    // Native Compress[Store] (e.g. AVX3), which only keep the left or right
+    // side, not both, hence we require two calls.
+    const size_t num_left = CompressStore(v, Not(comp), d, keys + writeL);
+    writeL += num_left;
+
+    (void)CompressBlendedStore(v, comp, d, keys + remaining + writeL);
+  }
+}
+
+// Calls StoreLeftRight for v0, v1, v2 and v3, in that order.
+template <class D, class Traits, typename T>
+HWY_INLINE void StoreLeftRight4(D d, Traits st, const Vec<D> v0,
+                                const Vec<D> v1, const Vec<D> v2,
+                                const Vec<D> v3, const Vec<D> pivot,
+                                T* HWY_RESTRICT keys, size_t& writeL,
+                                size_t& remaining) {
+  StoreLeftRight(d, st, v0, pivot, keys, writeL, remaining);
+  StoreLeftRight(d, st, v1, pivot, keys, writeL, remaining);
+  StoreLeftRight(d, st, v2, pivot, keys, writeL, remaining);
+  StoreLeftRight(d, st, v3, pivot, keys, writeL, remaining);
+}
+
+// Moves "<= pivot" keys to the front, and others to the back. pivot is
+// broadcasted. Time-critical!
+//
+// Aligned loads do not seem to be worthwhile (not bottlenecked by load ports).
+template <class D, class Traits, typename T>
+HWY_INLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
+                            const Vec<D> pivot, T* HWY_RESTRICT buf) {
+  using V = decltype(Zero(d));
+  const size_t N = Lanes(d);
+
+  // StoreLeftRight will CompressBlendedStore ending at `writeR`. Unless all
+  // lanes happen to be in the right-side partition, this will overrun `keys`,
+  // which triggers asan errors. Avoid by special-casing the last vector.
+  HWY_DASSERT(num > 2 * N);  // ensured by HandleSpecialCases
+  num -= N;
+  size_t last = num;
+  const V vlast = LoadU(d, keys + last);
+
+  const size_t consumedL =
+      PartitionToMultipleOfUnroll(d, st, keys, num, pivot, buf);
+  keys += consumedL;
+  last -= consumedL;
+  num -= consumedL;
+  constexpr size_t kUnroll = Constants::kPartitionUnroll;
+
+  // Partition splits the vector into 3 sections, left to right: Elements
+  // smaller or equal to the pivot, unpartitioned elements and elements larger
+  // than the pivot.
To write elements unconditionally on the loop body without + // overwriting existing data, we maintain two regions of the loop where all + // elements have been copied elsewhere (e.g. vector registers.). I call these + // bufferL and bufferR, for left and right respectively. + // + // These regions are tracked by the indices (writeL, writeR, left, right) as + // presented in the diagram below. + // + // writeL writeR + // \/ \/ + // | <= pivot | bufferL | unpartitioned | bufferR | > pivot | + // \/ \/ + // left right + // + // In the main loop body below we choose a side, load some elements out of the + // vector and move either `left` or `right`. Next we call into StoreLeftRight + // to partition the data, and the partitioned elements will be written either + // to writeR or writeL and the corresponding index will be moved accordingly. + // + // Note that writeR is not explicitly tracked as an optimization for platforms + // with conditional operations. Instead we track writeL and the number of + // elements left to process (`remaining`). From the diagram above we can see + // that: + // writeR - writeL = remaining => writeR = remaining + writeL + // + // Tracking `remaining` is advantageous because each iteration reduces the + // number of unpartitioned elements by a fixed amount, so we can compute + // `remaining` without data dependencies. + // + size_t writeL = 0; + size_t remaining = num; + + const T* HWY_RESTRICT readL = keys; + const T* HWY_RESTRICT readR = keys + num; + // Cannot load if there were fewer than 2 * kUnroll * N. + if (HWY_LIKELY(num != 0)) { + HWY_DASSERT(num >= 2 * kUnroll * N); + HWY_DASSERT((num & (kUnroll * N - 1)) == 0); + + // Make space for writing in-place by reading from readL/readR. 
+ const V vL0 = LoadU(d, readL + 0 * N); + const V vL1 = LoadU(d, readL + 1 * N); + const V vL2 = LoadU(d, readL + 2 * N); + const V vL3 = LoadU(d, readL + 3 * N); + readL += kUnroll * N; + readR -= kUnroll * N; + const V vR0 = LoadU(d, readR + 0 * N); + const V vR1 = LoadU(d, readR + 1 * N); + const V vR2 = LoadU(d, readR + 2 * N); + const V vR3 = LoadU(d, readR + 3 * N); + + // readL/readR changed above, so check again before the loop. + while (readL != readR) { + V v0, v1, v2, v3; + + // Data-dependent but branching is faster than forcing branch-free. + const size_t capacityL = + static_cast<size_t>((readL - keys) - static_cast<ptrdiff_t>(writeL)); + HWY_DASSERT(capacityL <= num); // >= 0 + // Load data from the end of the vector with less data (front or back). + // The next paragraphs explain how this works. + // + // let block_size = (kUnroll * N) + // On the loop prelude we load block_size elements from the front of the + // vector and an additional block_size elements from the back. On each + // iteration k elements are written to the front of the vector and + // (block_size - k) to the back. + // + // This creates a loop invariant where the capacity on the front + // (capacityL) and on the back (capacityR) always add to 2 * block_size. + // In other words: + // capacityL + capacityR = 2 * block_size + // capacityR = 2 * block_size - capacityL + // + // This means that: + // capacityL < capacityR <=> + // capacityL < 2 * block_size - capacityL <=> + // 2 * capacityL < 2 * block_size <=> + // capacityL < block_size + // + // Thus the check on the next line is equivalent to capacityL > capacityR. 
+ // + if (kUnroll * N < capacityL) { + readR -= kUnroll * N; + v0 = LoadU(d, readR + 0 * N); + v1 = LoadU(d, readR + 1 * N); + v2 = LoadU(d, readR + 2 * N); + v3 = LoadU(d, readR + 3 * N); + hwy::Prefetch(readR - 3 * kUnroll * N); + } else { + v0 = LoadU(d, readL + 0 * N); + v1 = LoadU(d, readL + 1 * N); + v2 = LoadU(d, readL + 2 * N); + v3 = LoadU(d, readL + 3 * N); + readL += kUnroll * N; + hwy::Prefetch(readL + 3 * kUnroll * N); + } + + StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, remaining); + } + + // Now finish writing the saved vectors to the middle. + StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, remaining); + StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, remaining); + } + + // We have partitioned [left, right) such that writeL is the boundary. + HWY_DASSERT(remaining == 0); + // Make space for inserting vlast: move up to N of the first right-side keys + // into the unused space starting at last. If we have fewer, ensure they are + // the last items in that vector by subtracting from the *load* address, + // which is safe because we have at least two vectors (checked above). + const size_t totalR = last - writeL; + const size_t startR = totalR < N ? writeL + totalR - N : writeL; + StoreU(LoadU(d, keys + startR), d, keys + last); + + // Partition vlast: write L, then R, into the single-vector gap at writeL. + const auto comp = st.Compare(d, pivot, vlast); + writeL += CompressBlendedStore(vlast, Not(comp), d, keys + writeL); + (void)CompressBlendedStore(vlast, comp, d, keys + writeL); + + return consumedL + writeL; +} + +// Returns true and partitions if [keys, keys + num) contains only {valueL, +// valueR}. Otherwise, sets third to the first differing value; keys may have +// been reordered and a regular Partition is still necessary. +// Called from two locations, hence NOINLINE. 
+template <class D, class Traits, typename T>
+HWY_NOINLINE bool MaybePartitionTwoValue(D d, Traits st, T* HWY_RESTRICT keys,
+                                         size_t num, const Vec<D> valueL,
+                                         const Vec<D> valueR, Vec<D>& third,
+                                         T* HWY_RESTRICT buf) {
+  // Scans left to right. `writeL` is the number of valueL keys seen so far,
+  // and also the position up to which `keys` already holds its final contents.
+  // `buf` receives a copy of the final partial vector so that it can be loaded
+  // as a whole vector without reading past `keys + num`.
+  const size_t N = Lanes(d);
+
+  size_t i = 0;
+  size_t writeL = 0;
+
+  // As long as all lanes are equal to L or R, we can overwrite with valueL.
+  // This is faster than first counting, then backtracking to fill L and R.
+  for (; i + N <= num; i += N) {
+    const Vec<D> v = LoadU(d, keys + i);
+    // It is not clear how to apply OrXor here - that can check if *both*
+    // comparisons are true, but here we want *either*. Comparing the unsigned
+    // min of differences to zero works, but is expensive for u64 prior to AVX3.
+    const Mask<D> eqL = st.EqualKeys(d, v, valueL);
+    const Mask<D> eqR = st.EqualKeys(d, v, valueR);
+    // At least one other value present; will require a regular partition.
+    // On AVX-512, Or + AllTrue are folded into a single kortest if we are
+    // careful with the FindKnownFirstTrue argument, see below.
+    if (HWY_UNLIKELY(!AllTrue(d, Or(eqL, eqR)))) {
+      // If we repeat Or(eqL, eqR) here, the compiler will hoist it into the
+      // loop, which is a pessimization because this if-true branch is cold.
+      // We can defeat this via Not(Xor), which is equivalent because eqL and
+      // eqR cannot be true at the same time. Can we elide the additional Not?
+      // FindFirstFalse instructions are generally unavailable, but we can
+      // fuse Not and Xor/Or into one ExclusiveNeither.
+      const size_t lane = FindKnownFirstTrue(d, ExclusiveNeither(eqL, eqR));
+      third = st.SetKey(d, keys + i + lane);
+      if (VQSORT_PRINT >= 2) {
+        fprintf(stderr, "found 3rd value at vec %zu; writeL %zu\n", i, writeL);
+      }
+      // 'Undo' what we did by filling the remainder of what we read with R.
+      // [0, writeL) already holds valueL; everything else in [0, i) was valueR.
+      for (; writeL + N <= i; writeL += N) {
+        StoreU(valueR, d, keys + writeL);
+      }
+      BlendedStore(valueR, FirstN(d, i - writeL), d, keys + writeL);
+      return false;
+    }
+    // Whole-vector store: lanes past the new writeL are overwritten later,
+    // either by subsequent iterations or by the valueR fill/undo.
+    StoreU(valueL, d, keys + writeL);
+    writeL += CountTrue(d, eqL);
+  }
+
+  // Final vector, masked comparison (no effect if i == num)
+  const size_t remaining = num - i;
+  SafeCopyN(remaining, d, keys + i, buf);
+  const Vec<D> v = Load(d, buf);
+  const Mask<D> valid = FirstN(d, remaining);
+  // Only valid lanes may count towards writeL, hence eqL is masked.
+  const Mask<D> eqL = And(st.EqualKeys(d, v, valueL), valid);
+  const Mask<D> eqR = st.EqualKeys(d, v, valueR);
+  // Invalid lanes are considered equal.
+  const Mask<D> eq = Or(Or(eqL, eqR), Not(valid));
+  // At least one other value present; will require a regular partition.
+  if (HWY_UNLIKELY(!AllTrue(d, eq))) {
+    const size_t lane = FindKnownFirstTrue(d, Not(eq));
+    third = st.SetKey(d, keys + i + lane);
+    if (VQSORT_PRINT >= 2) {
+      fprintf(stderr, "found 3rd value at partial vec %zu; writeL %zu\n", i,
+              writeL);
+    }
+    // 'Undo' what we did by filling the remainder of what we read with R.
+    for (; writeL + N <= i; writeL += N) {
+      StoreU(valueR, d, keys + writeL);
+    }
+    BlendedStore(valueR, FirstN(d, i - writeL), d, keys + writeL);
+    return false;
+  }
+  BlendedStore(valueL, valid, d, keys + writeL);
+  writeL += CountTrue(d, eqL);
+
+  // Fill right side
+  i = writeL;
+  for (; i + N <= num; i += N) {
+    StoreU(valueR, d, keys + i);
+  }
+  BlendedStore(valueR, FirstN(d, num - i), d, keys + i);
+
+  if (VQSORT_PRINT >= 2) {
+    fprintf(stderr, "Successful MaybePartitionTwoValue\n");
+  }
+  return true;
+}
+
+// Same as above, except that the pivot equals valueR, so scan right to left.
+template <class D, class Traits, typename T>
+HWY_INLINE bool MaybePartitionTwoValueR(D d, Traits st, T* HWY_RESTRICT keys,
+                                        size_t num, const Vec<D> valueL,
+                                        const Vec<D> valueR, Vec<D>& third,
+                                        T* HWY_RESTRICT buf) {
+  // Requires num >= N (asserted below). `buf` is only used as the source of
+  // the final partial store of valueL.
+  const size_t N = Lanes(d);
+
+  HWY_DASSERT(num >= N);
+  size_t pos = num - N;  // current read/write position
+  size_t countR = 0;     // number of valueR found
+
+  // For whole vectors, in descending address order: as long as all lanes are
+  // equal to L or R, overwrite with valueR. This is faster than counting, then
+  // filling both L and R. Loop terminates after unsigned wraparound.
+  for (; pos < num; pos -= N) {
+    const Vec<D> v = LoadU(d, keys + pos);
+    // It is not clear how to apply OrXor here - that can check if *both*
+    // comparisons are true, but here we want *either*. Comparing the unsigned
+    // min of differences to zero works, but is expensive for u64 prior to AVX3.
+    const Mask<D> eqL = st.EqualKeys(d, v, valueL);
+    const Mask<D> eqR = st.EqualKeys(d, v, valueR);
+    // If there is a third value, stop and undo what we've done. On AVX-512,
+    // Or + AllTrue are folded into a single kortest, but only if we are
+    // careful with the FindKnownFirstTrue argument - see prior comment on that.
+    if (HWY_UNLIKELY(!AllTrue(d, Or(eqL, eqR)))) {
+      const size_t lane = FindKnownFirstTrue(d, ExclusiveNeither(eqL, eqR));
+      third = st.SetKey(d, keys + pos + lane);
+      if (VQSORT_PRINT >= 2) {
+        fprintf(stderr, "found 3rd value at vec %zu; countR %zu\n", pos,
+                countR);
+        MaybePrintVector(d, "third", third, 0, st.LanesPerKey());
+      }
+      pos += N;  // rewind: we haven't yet committed changes in this iteration.
+      // We have filled [pos, num) with R, but only countR of them should have
+      // been written. Rewrite [pos, num - countR) to L.
+      HWY_DASSERT(countR <= num - pos);
+      const size_t endL = num - countR;
+      for (; pos + N <= endL; pos += N) {
+        StoreU(valueL, d, keys + pos);
+      }
+      BlendedStore(valueL, FirstN(d, endL - pos), d, keys + pos);
+      return false;
+    }
+    StoreU(valueR, d, keys + pos);
+    countR += CountTrue(d, eqR);
+  }
+
+  // Final partial (or empty) vector, masked comparison.
+  // `pos` wrapped around in the loop above; adding N undoes that and yields
+  // the number of keys at the front that were not yet checked.
+  const size_t remaining = pos + N;
+  HWY_DASSERT(remaining <= N);
+  const Vec<D> v = LoadU(d, keys);  // Safe because num >= N.
+  const Mask<D> valid = FirstN(d, remaining);
+  const Mask<D> eqL = st.EqualKeys(d, v, valueL);
+  // Only valid lanes may count towards countR, hence eqR is masked.
+  const Mask<D> eqR = And(st.EqualKeys(d, v, valueR), valid);
+  // Invalid lanes are considered equal.
+  const Mask<D> eq = Or(Or(eqL, eqR), Not(valid));
+  // At least one other value present; will require a regular partition.
+  if (HWY_UNLIKELY(!AllTrue(d, eq))) {
+    const size_t lane = FindKnownFirstTrue(d, Not(eq));
+    third = st.SetKey(d, keys + lane);
+    if (VQSORT_PRINT >= 2) {
+      fprintf(stderr, "found 3rd value at partial vec %zu; writeR %zu\n", pos,
+              countR);
+      MaybePrintVector(d, "third", third, 0, st.LanesPerKey());
+    }
+    pos += N;  // rewind: we haven't yet committed changes in this iteration.
+    // We have filled [pos, num) with R, but only countR of them should have
+    // been written. Rewrite [pos, num - countR) to L.
+    HWY_DASSERT(countR <= num - pos);
+    const size_t endL = num - countR;
+    for (; pos + N <= endL; pos += N) {
+      StoreU(valueL, d, keys + pos);
+    }
+    BlendedStore(valueL, FirstN(d, endL - pos), d, keys + pos);
+    return false;
+  }
+  const size_t lastR = CountTrue(d, eqR);
+  countR += lastR;
+
+  // First finish writing valueR - [0, N) lanes were not yet written.
+  StoreU(valueR, d, keys);  // Safe because num >= N.
+
+  // Fill left side (ascending order for clarity)
+  const size_t endL = num - countR;
+  size_t i = 0;
+  for (; i + N <= endL; i += N) {
+    StoreU(valueL, d, keys + i);
+  }
+  Store(valueL, d, buf);
+  SafeCopyN(endL - i, d, buf, keys + i);  // avoids asan overrun
+
+  if (VQSORT_PRINT >= 2) {
+    fprintf(stderr,
+            "MaybePartitionTwoValueR countR %zu pos %zu i %zu endL %zu\n",
+            countR, pos, i, endL);
+  }
+
+  return true;
+}
+
+// `idx_second` is `first_mismatch` from `AllEqual` and thus the index of the
+// second key. This is the first path into `MaybePartitionTwoValue`, called
+// when all samples are equal. Returns false if there is at least a third
+// value, and sets `third`. Otherwise, partitions the array and returns true.
+template <class D, class Traits, typename T>
+HWY_INLINE bool PartitionIfTwoKeys(D d, Traits st, const Vec<D> pivot,
+                                   T* HWY_RESTRICT keys, size_t num,
+                                   const size_t idx_second, const Vec<D> second,
+                                   Vec<D>& third, T* HWY_RESTRICT buf) {
+  // True if second comes before pivot.
+  const bool is_pivotR = AllFalse(d, st.Compare(d, pivot, second));
+  if (VQSORT_PRINT >= 1) {
+    fprintf(stderr, "Samples all equal, diff at %zu, isPivotR %d\n", idx_second,
+            is_pivotR);
+  }
+  // `second` must differ from `pivot`, otherwise is_pivotR is meaningless.
+  HWY_DASSERT(AllFalse(d, st.EqualKeys(d, second, pivot)));
+
+  // If pivot is R, we scan backwards over the entire array. Otherwise,
+  // we already scanned up to idx_second and can leave those in place.
+  return is_pivotR ? MaybePartitionTwoValueR(d, st, keys, num, second, pivot,
+                                             third, buf)
+                   : MaybePartitionTwoValue(d, st, keys + idx_second,
+                                            num - idx_second, pivot, second,
+                                            third, buf);
+}
+
+// Second path into `MaybePartitionTwoValue`, called when not all samples are
+// equal. `samples` is sorted.
+template <class D, class Traits, typename T>
+HWY_INLINE bool PartitionIfTwoSamples(D d, Traits st, T* HWY_RESTRICT keys,
+                                      size_t num, T* HWY_RESTRICT samples) {
+  constexpr size_t kSampleLanes = Constants::SampleLanes<T>();
+  constexpr size_t N1 = st.LanesPerKey();
+  // Because `samples` is sorted, these are the first and last sample keys.
+  const Vec<D> valueL = st.SetKey(d, samples);
+  const Vec<D> valueR = st.SetKey(d, samples + kSampleLanes - N1);
+  HWY_DASSERT(AllTrue(d, st.Compare(d, valueL, valueR)));
+  HWY_DASSERT(AllFalse(d, st.EqualKeys(d, valueL, valueR)));
+  const Vec<D> prev = st.PrevValue(d, valueR);
+  // If the sample has more than two values, then the keys have at least that
+  // many, and thus this special case is inapplicable.
+  if (HWY_UNLIKELY(!AllTrue(d, st.EqualKeys(d, valueL, prev)))) {
+    return false;
+  }
+
+  // Must not overwrite samples because if this returns false, caller wants to
+  // read the original samples again.
+  T* HWY_RESTRICT buf = samples + kSampleLanes;
+  Vec<D> third;  // unused
+  return MaybePartitionTwoValue(d, st, keys, num, valueL, valueR, third, buf);
+}
+
+// ------------------------------ Pivot sampling
+
+// Returns, per key, the median of the three inputs v0, v1, v2.
+template <class Traits, class V>
+HWY_INLINE V MedianOf3(Traits st, V v0, V v1, V v2) {
+  const DFromV<V> d;
+  // Slightly faster for 128-bit, apparently because not serially dependent.
+  if (st.Is128()) {
+    // Median = XOR-sum 'minus' the first and last. Calling First twice is
+    // slightly faster than Compare + 2 IfThenElse or even IfThenElse + XOR.
+    const auto sum = Xor(Xor(v0, v1), v2);
+    const auto first = st.First(d, st.First(d, v0, v1), v2);
+    const auto last = st.Last(d, st.Last(d, v0, v1), v2);
+    return Xor(Xor(sum, first), last);
+  }
+  st.Sort2(d, v0, v2);
+  v1 = st.Last(d, v0, v1);
+  v1 = st.First(d, v1, v2);
+  return v1;
+}
+
+// Returns 64 pseudo-random bits and advances `state`, which consists of three
+// uint64_t (state[2] is a counter that is incremented on every call).
+// Based on https://github.com/numpy/numpy/issues/16313#issuecomment-641897028
+HWY_INLINE uint64_t RandomBits(uint64_t* HWY_RESTRICT state) {
+  const uint64_t a = state[0];
+  const uint64_t b = state[1];
+  const uint64_t w = state[2] + 1;
+  const uint64_t next = a ^ w;
+  state[0] = (b + (b << 3)) ^ (b >> 11);
+  const uint64_t rot = (b << 24) | (b >> 40);
+  state[1] = rot + next;
+  state[2] = w;
+  return next;
+}
+
+// Returns slightly biased random index of a chunk in [0, num_chunks).
+// See https://www.pcg-random.org/posts/bounded-rands.html.
+HWY_INLINE size_t RandomChunkIndex(const uint32_t num_chunks, uint32_t bits) {
+  // Upper 32 bits of the 64-bit product: scales `bits` into [0, num_chunks)
+  // without a division.
+  const uint64_t chunk_index = (static_cast<uint64_t>(bits) * num_chunks) >> 32;
+  HWY_DASSERT(chunk_index < num_chunks);
+  return static_cast<size_t>(chunk_index);
+}
+
+// Writes samples from `keys[0, num)` into `buf`: two chunks of kLanesPerChunk
+// lanes, each the lane-wise median of three randomly chosen chunks of `keys`.
+template <class D, class Traits, typename T>
+HWY_INLINE void DrawSamples(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
+                            T* HWY_RESTRICT buf, uint64_t* HWY_RESTRICT state) {
+  using V = decltype(Zero(d));
+  const size_t N = Lanes(d);
+
+  // Power of two
+  constexpr size_t kLanesPerChunk = Constants::LanesPerChunk(sizeof(T));
+
+  // Align start of keys to chunks. We have at least 2 chunks (x 64 bytes)
+  // because the base case handles anything up to 8 vectors (x 16 bytes).
+  HWY_DASSERT(num >= Constants::SampleLanes<T>());
+  const size_t misalign =
+      (reinterpret_cast<uintptr_t>(keys) / sizeof(T)) & (kLanesPerChunk - 1);
+  if (misalign != 0) {
+    // Skip the keys before the first chunk boundary; they are never sampled.
+    const size_t consume = kLanesPerChunk - misalign;
+    keys += consume;
+    num -= consume;
+  }
+
+  // Generate enough random bits for 6 uint32
+  uint32_t bits[6];
+  for (size_t i = 0; i < 6; i += 2) {
+    const uint64_t bits64 = RandomBits(state);
+    CopyBytes<8>(&bits64, bits + i);
+  }
+
+  const size_t num_chunks64 = num / kLanesPerChunk;
+  // Clamp to uint32 for RandomChunkIndex
+  const uint32_t num_chunks =
+      static_cast<uint32_t>(HWY_MIN(num_chunks64, 0xFFFFFFFFull));
+
+  const size_t offset0 = RandomChunkIndex(num_chunks, bits[0]) * kLanesPerChunk;
+  const size_t offset1 = RandomChunkIndex(num_chunks, bits[1]) * kLanesPerChunk;
+  const size_t offset2 = RandomChunkIndex(num_chunks, bits[2]) * kLanesPerChunk;
+  const size_t offset3 = RandomChunkIndex(num_chunks, bits[3]) * kLanesPerChunk;
+  const size_t offset4 = RandomChunkIndex(num_chunks, bits[4]) * kLanesPerChunk;
+  const size_t offset5 = RandomChunkIndex(num_chunks, bits[5]) * kLanesPerChunk;
+  // Chunks 0..2 produce the first sample chunk, chunks 3..5 the second.
+  for (size_t i = 0; i < kLanesPerChunk; i += N) {
+    const V v0 = Load(d, keys + offset0 + i);
+    const V v1 = Load(d, keys + offset1 + i);
+    const V v2 = Load(d, keys + offset2 + i);
+    const V medians0 = MedianOf3(st, v0, v1, v2);
+    Store(medians0, d, buf + i);
+
+    const V v3 = Load(d, keys + offset3 + i);
+    const V v4 = Load(d, keys + offset4 + i);
+    const V v5 = Load(d, keys + offset5 + i);
+    const V medians1 = MedianOf3(st, v3, v4, v5);
+    Store(medians1, d, buf + i + kLanesPerChunk);
+  }
+}
+
+// For detecting inputs where (almost) all keys are equal.
+// Returns st.NoKeyDifference of the accumulated XOR-differences between the
+// first sample key and all kSampleLanes lanes of `samples`.
+template <class D, class Traits>
+HWY_INLINE bool UnsortedSampleEqual(D d, Traits st,
+                                    const TFromD<D>* HWY_RESTRICT samples) {
+  constexpr size_t kSampleLanes = Constants::SampleLanes<TFromD<D>>();
+  const size_t N = Lanes(d);
+  // Both are powers of two, so there will be no remainders.
+  HWY_DASSERT(N < kSampleLanes);
+  using V = Vec<D>;
+
+  const V first = st.SetKey(d, samples);
+  // OR of XOR-difference may be faster than comparison.
+  V diff = Zero(d);
+  for (size_t i = 0; i < kSampleLanes; i += N) {
+    const V v = Load(d, samples + i);
+    diff = OrXor(diff, first, v);
+  }
+
+  return st.NoKeyDifference(d, diff);
+}
+
+// Sorts the kSampleLanes lanes at `buf` in-place via BaseCase, which is given
+// the memory after the samples (buf + kSampleLanes) as its buffer argument.
+template <class D, class Traits, typename T>
+HWY_INLINE void SortSamples(D d, Traits st, T* HWY_RESTRICT buf) {
+  const size_t N = Lanes(d);
+  constexpr size_t kSampleLanes = Constants::SampleLanes<T>();
+  // Network must be large enough to sort two chunks.
+  HWY_DASSERT(Constants::BaseCaseNumLanes<st.LanesPerKey()>(N) >= kSampleLanes);
+
+  BaseCase(d, st, buf, kSampleLanes, buf + kSampleLanes);
+
+  if (VQSORT_PRINT >= 2) {
+    fprintf(stderr, "Samples:\n");
+    for (size_t i = 0; i < kSampleLanes; i += N) {
+      MaybePrintVector(d, "", Load(d, buf + i), 0, N);
+    }
+  }
+}
+
+// ------------------------------ Pivot selection
+
+enum class PivotResult {
+  kDone,     // stop without partitioning (all equal, or two-value partition)
+  kNormal,   // partition and recurse left and right
+  kIsFirst,  // partition but skip left recursion
+  kWasLast,  // partition but skip right recursion
+};
+
+// Returns a short name of `result` for debug output.
+HWY_INLINE const char* PivotResultString(PivotResult result) {
+  switch (result) {
+    case PivotResult::kDone:
+      return "done";
+    case PivotResult::kNormal:
+      return "normal";
+    case PivotResult::kIsFirst:
+      return "first";
+    case PivotResult::kWasLast:
+      return "last";
+  }
+  // Only reached if `result` is not one of the enumerators above.
+  return "unknown";
+}
+
+// Returns the lane index within the sorted `samples` of the key to use as
+// pivot: either the median, or the closest preceding sample that differs from
+// it.
+// (Could vectorize, but only 0.2% of total time)
+template <class Traits, typename T>
+HWY_INLINE size_t PivotRank(Traits st, const T* HWY_RESTRICT samples) {
+  constexpr size_t kSampleLanes = Constants::SampleLanes<T>();
+  constexpr size_t N1 = st.LanesPerKey();
+
+  constexpr size_t kRankMid = kSampleLanes / 2;
+  static_assert(kRankMid % N1 == 0, "Mid is not an aligned key");
+
+  // Find the previous value not equal to the median.
+  size_t rank_prev = kRankMid - N1;
+  for (; st.Equal1(samples + rank_prev, samples + kRankMid); rank_prev -= N1) {
+    // All previous samples are equal to the median.
+    if (rank_prev == 0) return 0;
+  }
+
+  // Find the next value not equal to the median.
+  size_t rank_next = rank_prev + N1;
+  for (; st.Equal1(samples + rank_next, samples + kRankMid); rank_next += N1) {
+    // The median is also the largest sample. If it is also the largest key,
+    // we'd end up with an empty right partition, so choose the previous key.
+    if (rank_next == kSampleLanes - N1) return rank_prev;
+  }
+
+  // If we choose the median as pivot, the ratio of keys ending in the left
+  // partition will likely be rank_next/kSampleLanes (if the sample is
+  // representative). This is because equal-to-pivot values also land in the
+  // left - it's infeasible to do an in-place vectorized 3-way partition.
+  // Check whether prev would lead to a more balanced partition.
+  const size_t excess_if_median = rank_next - kRankMid;
+  const size_t excess_if_prev = kRankMid - rank_prev;
+  return excess_if_median < excess_if_prev ? kRankMid : rank_prev;
+}
+
+// Returns pivot chosen from `samples`. It will never be the largest key
+// (thus the right partition will never be empty).
+template <class D, class Traits, typename T>
+HWY_INLINE Vec<D> ChoosePivotByRank(D d, Traits st,
+                                    const T* HWY_RESTRICT samples) {
+  const size_t pivot_rank = PivotRank(st, samples);
+  const Vec<D> pivot = st.SetKey(d, samples + pivot_rank);
+  if (VQSORT_PRINT >= 2) {
+    fprintf(stderr, "  Pivot rank %zu = %f\n", pivot_rank,
+            static_cast<double>(GetLane(pivot)));
+  }
+  // Verify pivot is not equal to the last sample.
+  constexpr size_t kSampleLanes = Constants::SampleLanes<T>();
+  constexpr size_t N1 = st.LanesPerKey();
+  const Vec<D> last = st.SetKey(d, samples + kSampleLanes - N1);
+  const bool all_neq = AllTrue(d, st.NotEqualKeys(d, pivot, last));
+  (void)all_neq;  // only used by HWY_DASSERT
+  HWY_DASSERT(all_neq);
+  return pivot;
+}
+
+// Returns true if all keys equal `pivot`, otherwise returns false and sets
+// `*first_mismatch` to the index of the first differing key.
+template <class D, class Traits, typename T>
+HWY_INLINE bool AllEqual(D d, Traits st, const Vec<D> pivot,
+                         const T* HWY_RESTRICT keys, size_t num,
+                         size_t* HWY_RESTRICT first_mismatch) {
+  const size_t N = Lanes(d);
+  // Ensures we can use overlapping loads for the tail; see HandleSpecialCases.
+  HWY_DASSERT(num >= N);
+  const Vec<D> zero = Zero(d);
+
+  // Vector-align keys + i.
+  const size_t misalign =
+      (reinterpret_cast<uintptr_t>(keys) / sizeof(T)) & (N - 1);
+  HWY_DASSERT(misalign % st.LanesPerKey() == 0);
+  const size_t consume = N - misalign;
+  {
+    const Vec<D> v = LoadU(d, keys);
+    // Only check masked lanes; consider others to be equal.
+    const Mask<D> diff = And(FirstN(d, consume), st.NotEqualKeys(d, v, pivot));
+    if (HWY_UNLIKELY(!AllFalse(d, diff))) {
+      const size_t lane = FindKnownFirstTrue(d, diff);
+      *first_mismatch = lane;
+      return false;
+    }
+  }
+  size_t i = consume;
+  HWY_DASSERT(((reinterpret_cast<uintptr_t>(keys + i) / sizeof(T)) & (N - 1)) ==
+              0);
+
+  // Sticky bits registering any difference between `keys` and the first key.
+  // We use vector XOR because it may be cheaper than comparisons, especially
+  // for 128-bit. 2x unrolled for more ILP.
+  Vec<D> diff0 = zero;
+  Vec<D> diff1 = zero;
+
+  // We want to stop once a difference has been found, but without slowing
+  // down the loop by comparing during each iteration. The compromise is to
+  // compare after a 'group', which consists of kLoops times two vectors.
+  constexpr size_t kLoops = 8;
+  const size_t lanes_per_group = kLoops * 2 * N;
+
+  for (; i + lanes_per_group <= num; i += lanes_per_group) {
+    HWY_DEFAULT_UNROLL
+    for (size_t loop = 0; loop < kLoops; ++loop) {
+      const Vec<D> v0 = Load(d, keys + i + loop * 2 * N);
+      const Vec<D> v1 = Load(d, keys + i + loop * 2 * N + N);
+      diff0 = OrXor(diff0, v0, pivot);
+      diff1 = OrXor(diff1, v1, pivot);
+    }
+
+    // If there was a difference in the entire group:
+    if (HWY_UNLIKELY(!st.NoKeyDifference(d, Or(diff0, diff1)))) {
+      // .. then loop until the first one, with termination guarantee.
+      // (All earlier groups were equal, so the difference is in this group.)
+      for (;; i += N) {
+        const Vec<D> v = Load(d, keys + i);
+        const Mask<D> diff = st.NotEqualKeys(d, v, pivot);
+        if (HWY_UNLIKELY(!AllFalse(d, diff))) {
+          const size_t lane = FindKnownFirstTrue(d, diff);
+          *first_mismatch = i + lane;
+          return false;
+        }
+      }
+    }
+  }
+
+  // Whole vectors, no unrolling, compare directly
+  for (; i + N <= num; i += N) {
+    const Vec<D> v = Load(d, keys + i);
+    const Mask<D> diff = st.NotEqualKeys(d, v, pivot);
+    if (HWY_UNLIKELY(!AllFalse(d, diff))) {
+      const size_t lane = FindKnownFirstTrue(d, diff);
+      *first_mismatch = i + lane;
+      return false;
+    }
+  }
+  // Always re-check the last (unaligned) vector to reduce branching.
+  i = num - N;
+  const Vec<D> v = LoadU(d, keys + i);
+  const Mask<D> diff = st.NotEqualKeys(d, v, pivot);
+  if (HWY_UNLIKELY(!AllFalse(d, diff))) {
+    const size_t lane = FindKnownFirstTrue(d, diff);
+    *first_mismatch = i + lane;
+    return false;
+  }
+
+  if (VQSORT_PRINT >= 1) {
+    fprintf(stderr, "All keys equal\n");
+  }
+  return true;  // all equal
+}
+
+// Called from 'two locations', but only one is active (IsKV is constexpr).
+template <class D, class Traits, typename T>
+HWY_INLINE bool ExistsAnyBefore(D d, Traits st, const T* HWY_RESTRICT keys,
+                                size_t num, const Vec<D> pivot) {
+  // Returns whether any key in keys[0, num) comes before `pivot` in sort
+  // order, i.e. st.Compare(d, key, pivot) is true for at least one key.
+  const size_t N = Lanes(d);
+  HWY_DASSERT(num >= N);  // See HandleSpecialCases
+
+  if (VQSORT_PRINT >= 2) {
+    fprintf(stderr, "Scanning for before\n");
+  }
+
+  // To keep the hot loop cheap, only test once per group of kLoops vectors.
+  constexpr size_t kLoops = 16;
+  const size_t lanes_per_group = kLoops * N;
+
+  size_t pos = 0;
+
+  // Running per-lane "first in sort order" of the pivot and all keys so far.
+  Vec<D> running_first = pivot;
+
+  // Phase 1: whole groups, unrolled.
+  while (pos + lanes_per_group <= num) {
+    HWY_DEFAULT_UNROLL
+    for (size_t loop = 0; loop < kLoops; ++loop) {
+      running_first = st.First(d, running_first, LoadU(d, keys + pos + loop * N));
+    }
+
+    const bool found_in_group =
+        !AllFalse(d, st.Compare(d, running_first, pivot));
+    if (HWY_UNLIKELY(found_in_group)) {
+      if (VQSORT_PRINT >= 2) {
+        fprintf(stderr, "Stopped scanning at end of group %zu\n",
+                pos + lanes_per_group);
+      }
+      return true;
+    }
+    pos += lanes_per_group;
+  }
+
+  // Phase 2: remaining whole vectors, tested one at a time.
+  while (pos + N <= num) {
+    const Vec<D> vec = LoadU(d, keys + pos);
+    const bool found = !AllFalse(d, st.Compare(d, vec, pivot));
+    if (HWY_UNLIKELY(found)) {
+      if (VQSORT_PRINT >= 2) {
+        fprintf(stderr, "Stopped scanning at %zu\n", pos);
+      }
+      return true;
+    }
+    pos += N;
+  }
+
+  // Phase 3: if keys remain, cover them with the (overlapping) last whole
+  // vector, which is in bounds because num >= N.
+  if (HWY_LIKELY(pos != num)) {
+    const size_t tail = num - N;
+    const Vec<D> vec = LoadU(d, keys + tail);
+    const bool found = !AllFalse(d, st.Compare(d, vec, pivot));
+    if (HWY_UNLIKELY(found)) {
+      if (VQSORT_PRINT >= 2) {
+        fprintf(stderr, "Stopped scanning at last %zu\n", tail);
+      }
+      return true;
+    }
+  }
+
+  return false;  // pivot is the first
+}
+
+// Called from 'two locations', but only one is active (IsKV is constexpr).
+template <class D, class Traits, typename T>
+HWY_INLINE bool ExistsAnyAfter(D d, Traits st, const T* HWY_RESTRICT keys,
+                               size_t num, const Vec<D> pivot) {
+  // Returns whether any key in keys[0, num) comes after `pivot` in sort
+  // order, i.e. st.Compare(d, pivot, key) is true for at least one key.
+  const size_t N = Lanes(d);
+  HWY_DASSERT(num >= N);  // See HandleSpecialCases
+
+  if (VQSORT_PRINT >= 2) {
+    fprintf(stderr, "Scanning for after\n");
+  }
+
+  // To keep the hot loop cheap, only test once per group of kLoops vectors.
+  constexpr size_t kLoops = 16;
+  const size_t lanes_per_group = kLoops * N;
+
+  size_t pos = 0;
+
+  // Running per-lane "last in sort order" of the pivot and all keys so far.
+  Vec<D> running_last = pivot;
+
+  // Phase 1: whole groups, unrolled.
+  while (pos + lanes_per_group <= num) {
+    HWY_DEFAULT_UNROLL
+    for (size_t loop = 0; loop < kLoops; ++loop) {
+      running_last = st.Last(d, running_last, LoadU(d, keys + pos + loop * N));
+    }
+
+    const bool found_in_group =
+        !AllFalse(d, st.Compare(d, pivot, running_last));
+    if (HWY_UNLIKELY(found_in_group)) {
+      if (VQSORT_PRINT >= 2) {
+        fprintf(stderr, "Stopped scanning at end of group %zu\n",
+                pos + lanes_per_group);
+      }
+      return true;
+    }
+    pos += lanes_per_group;
+  }
+
+  // Phase 2: remaining whole vectors, tested one at a time.
+  while (pos + N <= num) {
+    const Vec<D> vec = LoadU(d, keys + pos);
+    const bool found = !AllFalse(d, st.Compare(d, pivot, vec));
+    if (HWY_UNLIKELY(found)) {
+      if (VQSORT_PRINT >= 2) {
+        fprintf(stderr, "Stopped scanning at %zu\n", pos);
+      }
+      return true;
+    }
+    pos += N;
+  }
+
+  // Phase 3: if keys remain, cover them with the (overlapping) last whole
+  // vector, which is in bounds because num >= N.
+  if (HWY_LIKELY(pos != num)) {
+    const size_t tail = num - N;
+    const Vec<D> vec = LoadU(d, keys + tail);
+    const bool found = !AllFalse(d, st.Compare(d, pivot, vec));
+    if (HWY_UNLIKELY(found)) {
+      if (VQSORT_PRINT >= 2) {
+        fprintf(stderr, "Stopped scanning at last %zu\n", tail);
+      }
+      return true;
+    }
+  }
+
+  return false;  // pivot is the last
+}
+
+// Returns pivot chosen from `keys[0, num)`. It will never be the largest key
+// (thus the right partition will never be empty).
+template <class D, class Traits, typename T> +HWY_INLINE Vec<D> ChoosePivotForEqualSamples(D d, Traits st, + T* HWY_RESTRICT keys, size_t num, + T* HWY_RESTRICT samples, + Vec<D> second, Vec<D> third, + PivotResult& result) { + const Vec<D> pivot = st.SetKey(d, samples); // the single unique sample + + // Early out for mostly-0 arrays, where pivot is often FirstValue. + if (HWY_UNLIKELY(AllTrue(d, st.EqualKeys(d, pivot, st.FirstValue(d))))) { + result = PivotResult::kIsFirst; + return pivot; + } + if (HWY_UNLIKELY(AllTrue(d, st.EqualKeys(d, pivot, st.LastValue(d))))) { + result = PivotResult::kWasLast; + return st.PrevValue(d, pivot); + } + + // If key-value, we didn't run PartitionIfTwo* and thus `third` is unknown and + // cannot be used. + if (st.IsKV()) { + // If true, pivot is either middle or last. + const bool before = !AllFalse(d, st.Compare(d, second, pivot)); + if (HWY_UNLIKELY(before)) { + // Not last, so middle. + if (HWY_UNLIKELY(ExistsAnyAfter(d, st, keys, num, pivot))) { + result = PivotResult::kNormal; + return pivot; + } + + // We didn't find anything after pivot, so it is the last. Because keys + // equal to the pivot go to the left partition, the right partition would + // be empty and Partition will not have changed anything. Instead use the + // previous value in sort order, which is not necessarily an actual key. + result = PivotResult::kWasLast; + return st.PrevValue(d, pivot); + } + + // Otherwise, pivot is first or middle. Rule out it being first: + if (HWY_UNLIKELY(ExistsAnyBefore(d, st, keys, num, pivot))) { + result = PivotResult::kNormal; + return pivot; + } + // It is first: fall through to shared code below. + } else { + // Check if pivot is between two known values. If so, it is not the first + // nor the last and we can avoid scanning. 
+ st.Sort2(d, second, third); + HWY_DASSERT(AllTrue(d, st.Compare(d, second, third))); + const bool before = !AllFalse(d, st.Compare(d, second, pivot)); + const bool after = !AllFalse(d, st.Compare(d, pivot, third)); + // Only reached if there are three keys, which means pivot is either first, + // last, or in between. Thus there is another key that comes before or + // after. + HWY_DASSERT(before || after); + if (HWY_UNLIKELY(before)) { + // Neither first nor last. + if (HWY_UNLIKELY(after || ExistsAnyAfter(d, st, keys, num, pivot))) { + result = PivotResult::kNormal; + return pivot; + } + + // We didn't find anything after pivot, so it is the last. Because keys + // equal to the pivot go to the left partition, the right partition would + // be empty and Partition will not have changed anything. Instead use the + // previous value in sort order, which is not necessarily an actual key. + result = PivotResult::kWasLast; + return st.PrevValue(d, pivot); + } + + // Has after, and we found one before: in the middle. + if (HWY_UNLIKELY(ExistsAnyBefore(d, st, keys, num, pivot))) { + result = PivotResult::kNormal; + return pivot; + } + } + + // Pivot is first. We could consider a special partition mode that only + // reads from and writes to the right side, and later fills in the left + // side, which we know is equal to the pivot. However, that leads to more + // cache misses if the array is large, and doesn't save much, hence is a + // net loss. 
+ result = PivotResult::kIsFirst; + return pivot; +} + +// ------------------------------ Quicksort recursion + +template <class D, class Traits, typename T> +HWY_NOINLINE void PrintMinMax(D d, Traits st, const T* HWY_RESTRICT keys, + size_t num, T* HWY_RESTRICT buf) { + if (VQSORT_PRINT >= 2) { + const size_t N = Lanes(d); + if (num < N) return; + + Vec<D> first = st.LastValue(d); + Vec<D> last = st.FirstValue(d); + + size_t i = 0; + for (; i + N <= num; i += N) { + const Vec<D> v = LoadU(d, keys + i); + first = st.First(d, v, first); + last = st.Last(d, v, last); + } + if (HWY_LIKELY(i != num)) { + HWY_DASSERT(num >= N); // See HandleSpecialCases + const Vec<D> v = LoadU(d, keys + num - N); + first = st.First(d, v, first); + last = st.Last(d, v, last); + } + + first = st.FirstOfLanes(d, first, buf); + last = st.LastOfLanes(d, last, buf); + MaybePrintVector(d, "first", first, 0, st.LanesPerKey()); + MaybePrintVector(d, "last", last, 0, st.LanesPerKey()); + } +} + +template <class D, class Traits, typename T> +HWY_NOINLINE void Recurse(D d, Traits st, T* HWY_RESTRICT keys, + const size_t num, T* HWY_RESTRICT buf, + uint64_t* HWY_RESTRICT state, + const size_t remaining_levels) { + HWY_DASSERT(num != 0); + + const size_t N = Lanes(d); + constexpr size_t kLPK = st.LanesPerKey(); + if (HWY_UNLIKELY(num <= Constants::BaseCaseNumLanes<kLPK>(N))) { + BaseCase(d, st, keys, num, buf); + return; + } + + // Move after BaseCase so we skip printing for small subarrays. 
+ if (VQSORT_PRINT >= 1) { + fprintf(stderr, "\n\n=== Recurse depth=%zu len=%zu\n", remaining_levels, + num); + PrintMinMax(d, st, keys, num, buf); + } + + DrawSamples(d, st, keys, num, buf, state); + + Vec<D> pivot; + PivotResult result = PivotResult::kNormal; + if (HWY_UNLIKELY(UnsortedSampleEqual(d, st, buf))) { + pivot = st.SetKey(d, buf); + size_t idx_second = 0; + if (HWY_UNLIKELY(AllEqual(d, st, pivot, keys, num, &idx_second))) { + return; + } + HWY_DASSERT(idx_second % st.LanesPerKey() == 0); + // Must capture the value before PartitionIfTwoKeys may overwrite it. + const Vec<D> second = st.SetKey(d, keys + idx_second); + MaybePrintVector(d, "pivot", pivot, 0, st.LanesPerKey()); + MaybePrintVector(d, "second", second, 0, st.LanesPerKey()); + + Vec<D> third; + // Not supported for key-value types because two 'keys' may be equivalent + // but not interchangeable (their values may differ). + if (HWY_UNLIKELY(!st.IsKV() && + PartitionIfTwoKeys(d, st, pivot, keys, num, idx_second, + second, third, buf))) { + return; // Done, skip recursion because each side has all-equal keys. + } + + // We can no longer start scanning from idx_second because + // PartitionIfTwoKeys may have reordered keys. + pivot = ChoosePivotForEqualSamples(d, st, keys, num, buf, second, third, + result); + // If kNormal, `pivot` is very common but not the first/last. It is + // tempting to do a 3-way partition (to avoid moving the =pivot keys a + // second time), but that is a net loss due to the extra comparisons. + } else { + SortSamples(d, st, buf); + + // Not supported for key-value types because two 'keys' may be equivalent + // but not interchangeable (their values may differ). + if (HWY_UNLIKELY(!st.IsKV() && + PartitionIfTwoSamples(d, st, keys, num, buf))) { + return; + } + + pivot = ChoosePivotByRank(d, st, buf); + } + + // Too many recursions. This is unlikely to happen because we select pivots + // from large (though still O(1)) samples. 
+ if (HWY_UNLIKELY(remaining_levels == 0)) { + if (VQSORT_PRINT >= 1) { + fprintf(stderr, "HeapSort reached, size=%zu\n", num); + } + HeapSort(st, keys, num); // Slow but N*logN. + return; + } + + const size_t bound = Partition(d, st, keys, num, pivot, buf); + if (VQSORT_PRINT >= 2) { + fprintf(stderr, "bound %zu num %zu result %s\n", bound, num, + PivotResultString(result)); + } + // The left partition is not empty because the pivot is one of the keys + // (unless kWasLast, in which case the pivot is PrevValue, but we still + // have at least one value <= pivot because AllEqual ruled out the case of + // only one unique value, and there is exactly one value after pivot). + HWY_DASSERT(bound != 0); + // ChoosePivot* ensure pivot != last, so the right partition is never empty + // except in the rare case of the pivot matching the last-in-sort-order value, + // which implies we anyway skip the right partition due to kWasLast. + HWY_DASSERT(bound != num || result == PivotResult::kWasLast); + + if (HWY_LIKELY(result != PivotResult::kIsFirst)) { + Recurse(d, st, keys, bound, buf, state, remaining_levels - 1); + } + if (HWY_LIKELY(result != PivotResult::kWasLast)) { + Recurse(d, st, keys + bound, num - bound, buf, state, remaining_levels - 1); + } +} + +// Returns true if sorting is finished. +template <class D, class Traits, typename T> +HWY_INLINE bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, + size_t num, T* HWY_RESTRICT buf) { + const size_t N = Lanes(d); + constexpr size_t kLPK = st.LanesPerKey(); + const size_t base_case_num = Constants::BaseCaseNumLanes<kLPK>(N); + + // Recurse will also check this, but doing so here first avoids setting up + // the random generator state. + if (HWY_UNLIKELY(num <= base_case_num)) { + BaseCase(d, st, keys, num, buf); + return true; + } + + // 128-bit keys require vectors with at least two u64 lanes, which is always + // the case unless `d` requests partial vectors (e.g. 
fraction = 1/2) AND the + // hardware vector width is less than 128bit / fraction. + const bool partial_128 = !IsFull(d) && N < 2 && st.Is128(); + // Partition assumes its input is at least two vectors. If vectors are huge, + // base_case_num may actually be smaller. If so, which is only possible on + // RVV, pass a capped or partial d (LMUL < 1). Use HWY_MAX_BYTES instead of + // HWY_LANES to account for the largest possible LMUL. + constexpr bool kPotentiallyHuge = + HWY_MAX_BYTES / sizeof(T) > Constants::kMaxRows * Constants::kMaxCols; + const bool huge_vec = kPotentiallyHuge && (2 * N > base_case_num); + if (partial_128 || huge_vec) { + if (VQSORT_PRINT >= 1) { + fprintf(stderr, "WARNING: using slow HeapSort: partial %d huge %d\n", + partial_128, huge_vec); + } + HeapSort(st, keys, num); + return true; + } + + // We could also check for already sorted/reverse/equal, but that's probably + // counterproductive if vqsort is used as a base case. + + return false; // not finished sorting +} + +#endif // VQSORT_ENABLED +} // namespace detail + +// Old interface with user-specified buffer, retained for compatibility. +// `buf` must be vector-aligned and hold at least +// SortConstants::BufBytes(HWY_MAX_BYTES, st.LanesPerKey()). +template <class D, class Traits, typename T> +void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num, + T* HWY_RESTRICT buf) { + if (VQSORT_PRINT >= 1) { + fprintf(stderr, "=============== Sort num %zu\n", num); + } + +#if VQSORT_ENABLED || HWY_IDE + if (detail::HandleSpecialCases(d, st, keys, num, buf)) return; + +#if HWY_MAX_BYTES > 64 + // sorting_networks-inl and traits assume no more than 512 bit vectors. + if (HWY_UNLIKELY(Lanes(d) > 64 / sizeof(T))) { + return Sort(CappedTag<T, 64 / sizeof(T)>(), st, keys, num, buf); + } +#endif // HWY_MAX_BYTES > 64 + + uint64_t* HWY_RESTRICT state = GetGeneratorState(); + // Introspection: switch to worst-case N*logN heapsort after this many. 
+ // Should never be reached, so computing log2 exactly does not help. + const size_t max_levels = 50; + detail::Recurse(d, st, keys, num, buf, state, max_levels); +#else // !VQSORT_ENABLED + (void)d; + (void)buf; + if (VQSORT_PRINT >= 1) { + fprintf(stderr, "WARNING: using slow HeapSort because vqsort disabled\n"); + } + return detail::HeapSort(st, keys, num); +#endif // VQSORT_ENABLED +} + +// Sorts `keys[0..num-1]` according to the order defined by `st.Compare`. +// In-place i.e. O(1) additional storage. Worst-case N*logN comparisons. +// Non-stable (order of equal keys may change), except for the common case where +// the upper bits of T are the key, and the lower bits are a sequential or at +// least unique ID. +// There is no upper limit on `num`, but note that pivots may be chosen by +// sampling only from the first 256 GiB. +// +// `d` is typically SortTag<T> (chooses between full and partial vectors). +// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges +// differences in sort order and single-lane vs 128-bit keys. +template <class D, class Traits, typename T> +HWY_API void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num) { + constexpr size_t kLPK = st.LanesPerKey(); + HWY_ALIGN T buf[SortConstants::BufBytes<T, kLPK>(HWY_MAX_BYTES) / sizeof(T)]; + return Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE diff --git a/third_party/highway/hwy/contrib/sort/vqsort.cc b/third_party/highway/hwy/contrib/sort/vqsort.cc new file mode 100644 index 0000000000..e4ec91d9ce --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort.cc @@ -0,0 +1,124 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#include <time.h> + +#include <cstdint> + +#include "hwy/base.h" +#include "hwy/contrib/sort/shared-inl.h" + +// Check if we have sys/random.h. First skip some systems on which the check +// itself (features.h) might be problematic. +#if defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV +#define VQSORT_GETRANDOM 0 +#endif + +#if !defined(VQSORT_GETRANDOM) && HWY_OS_LINUX +#include <features.h> + +// ---- which libc +#if defined(__UCLIBC__) +#define VQSORT_GETRANDOM 1 // added Mar 2015, before uclibc-ng 1.0 + +#elif defined(__GLIBC__) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 25) +#define VQSORT_GETRANDOM 1 +#else +#define VQSORT_GETRANDOM 0 +#endif + +#else +// Assume MUSL, which has getrandom since 2018. There is no macro to test, see +// https://www.openwall.com/lists/musl/2013/03/29/13. +#define VQSORT_GETRANDOM 1 + +#endif // ---- which libc +#endif // linux + +#if !defined(VQSORT_GETRANDOM) +#define VQSORT_GETRANDOM 0 +#endif + +// Choose a seed source for SFC generator: 1=getrandom, 2=CryptGenRandom. +// Allow user override - not all Android support the getrandom wrapper. +#ifndef VQSORT_SECURE_SEED + +#if VQSORT_GETRANDOM +#define VQSORT_SECURE_SEED 1 +#elif defined(_WIN32) || defined(_WIN64) +#define VQSORT_SECURE_SEED 2 +#else +#define VQSORT_SECURE_SEED 0 +#endif + +#endif // VQSORT_SECURE_SEED + +// Pull in dependencies of the chosen seed source. 
+#if VQSORT_SECURE_SEED == 1 +#include <sys/random.h> +#elif VQSORT_SECURE_SEED == 2 +#include <windows.h> +#pragma comment(lib, "advapi32.lib") +// Must come after windows.h. +#include <wincrypt.h> +#endif // VQSORT_SECURE_SEED + +namespace hwy { +namespace { + +void Fill16Bytes(void* bytes) { +#if VQSORT_SECURE_SEED == 1 + // May block if urandom is not yet initialized. + const ssize_t ret = getrandom(bytes, 16, /*flags=*/0); + if (ret == 16) return; +#elif VQSORT_SECURE_SEED == 2 + HCRYPTPROV hProvider{}; + if (CryptAcquireContextA(&hProvider, nullptr, nullptr, PROV_RSA_FULL, + CRYPT_VERIFYCONTEXT)) { + const BOOL ok = + CryptGenRandom(hProvider, 16, reinterpret_cast<BYTE*>(bytes)); + CryptReleaseContext(hProvider, 0); + if (ok) return; + } +#endif + + // VQSORT_SECURE_SEED == 0, or one of the above failed. Get some entropy from + // the address and the clock() timer. + uint64_t* words = reinterpret_cast<uint64_t*>(bytes); + uint64_t** seed_stack = &words; + void (*seed_code)(void*) = &Fill16Bytes; + const uintptr_t bits_stack = reinterpret_cast<uintptr_t>(seed_stack); + const uintptr_t bits_code = reinterpret_cast<uintptr_t>(seed_code); + const uint64_t bits_time = static_cast<uint64_t>(clock()); + words[0] = bits_stack ^ bits_time ^ 0xFEDCBA98; // "Nothing up my sleeve" + words[1] = bits_code ^ bits_time ^ 0x01234567; // constants. +} + +} // namespace + +uint64_t* GetGeneratorState() { + thread_local uint64_t state[3] = {0}; + // This is a counter; zero indicates not yet initialized. 
+ if (HWY_UNLIKELY(state[2] == 0)) { + Fill16Bytes(state); + state[2] = 1; + } + return state; +} + +} // namespace hwy diff --git a/third_party/highway/hwy/contrib/sort/vqsort.h b/third_party/highway/hwy/contrib/sort/vqsort.h new file mode 100644 index 0000000000..95c140d140 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort.h @@ -0,0 +1,221 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Interface to vectorized quicksort with dynamic dispatch. +// Blog post: https://tinyurl.com/vqsort-blog +// Paper with measurements: https://arxiv.org/abs/2205.05982 +// +// To ensure the overhead of using wide vectors (e.g. AVX2 or AVX-512) is +// worthwhile, we recommend using this code for sorting arrays whose size is at +// least 512 KiB. + +#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_ +#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_ + +#include "hwy/base.h" + +namespace hwy { + +// Tag arguments that determine the sort order. +struct SortAscending { + constexpr bool IsAscending() const { return true; } +}; +struct SortDescending { + constexpr bool IsAscending() const { return false; } +}; + +// Vectorized Quicksort: sorts keys[0, n). Dispatches to the best available +// instruction set and does not allocate memory. +// Uses about 1.2 KiB stack plus an internal 3-word TLS cache for random state. 
+HWY_CONTRIB_DLLEXPORT void VQSort(uint16_t* HWY_RESTRICT keys, size_t n, + SortAscending); +HWY_CONTRIB_DLLEXPORT void VQSort(uint16_t* HWY_RESTRICT keys, size_t n, + SortDescending); +HWY_CONTRIB_DLLEXPORT void VQSort(uint32_t* HWY_RESTRICT keys, size_t n, + SortAscending); +HWY_CONTRIB_DLLEXPORT void VQSort(uint32_t* HWY_RESTRICT keys, size_t n, + SortDescending); +HWY_CONTRIB_DLLEXPORT void VQSort(uint64_t* HWY_RESTRICT keys, size_t n, + SortAscending); +HWY_CONTRIB_DLLEXPORT void VQSort(uint64_t* HWY_RESTRICT keys, size_t n, + SortDescending); +HWY_CONTRIB_DLLEXPORT void VQSort(int16_t* HWY_RESTRICT keys, size_t n, + SortAscending); +HWY_CONTRIB_DLLEXPORT void VQSort(int16_t* HWY_RESTRICT keys, size_t n, + SortDescending); +HWY_CONTRIB_DLLEXPORT void VQSort(int32_t* HWY_RESTRICT keys, size_t n, + SortAscending); +HWY_CONTRIB_DLLEXPORT void VQSort(int32_t* HWY_RESTRICT keys, size_t n, + SortDescending); +HWY_CONTRIB_DLLEXPORT void VQSort(int64_t* HWY_RESTRICT keys, size_t n, + SortAscending); +HWY_CONTRIB_DLLEXPORT void VQSort(int64_t* HWY_RESTRICT keys, size_t n, + SortDescending); +HWY_CONTRIB_DLLEXPORT void VQSort(float* HWY_RESTRICT keys, size_t n, + SortAscending); +HWY_CONTRIB_DLLEXPORT void VQSort(float* HWY_RESTRICT keys, size_t n, + SortDescending); +HWY_CONTRIB_DLLEXPORT void VQSort(double* HWY_RESTRICT keys, size_t n, + SortAscending); +HWY_CONTRIB_DLLEXPORT void VQSort(double* HWY_RESTRICT keys, size_t n, + SortDescending); +HWY_CONTRIB_DLLEXPORT void VQSort(uint128_t* HWY_RESTRICT keys, size_t n, + SortAscending); +HWY_CONTRIB_DLLEXPORT void VQSort(uint128_t* HWY_RESTRICT keys, size_t n, + SortDescending); +HWY_CONTRIB_DLLEXPORT void VQSort(K64V64* HWY_RESTRICT keys, size_t n, + SortAscending); +HWY_CONTRIB_DLLEXPORT void VQSort(K64V64* HWY_RESTRICT keys, size_t n, + SortDescending); +HWY_CONTRIB_DLLEXPORT void VQSort(K32V32* HWY_RESTRICT keys, size_t n, + SortAscending); +HWY_CONTRIB_DLLEXPORT void VQSort(K32V32* HWY_RESTRICT keys, size_t n, + 
SortDescending); + +// User-level caching is no longer required, so this class is no longer +// beneficial. We recommend using the simpler VQSort() interface instead, and +// retain this class only for compatibility. It now just calls VQSort. +class HWY_CONTRIB_DLLEXPORT Sorter { + public: + Sorter() {} + ~Sorter() {} + + // Move-only + Sorter(const Sorter&) = delete; + Sorter& operator=(const Sorter&) = delete; + Sorter(Sorter&& /*other*/) {} + Sorter& operator=(Sorter&& /*other*/) { return *this; } + + void operator()(uint16_t* HWY_RESTRICT keys, size_t n, + SortAscending tag) const { + VQSort(keys, n, tag); + } + void operator()(uint16_t* HWY_RESTRICT keys, size_t n, + SortDescending tag) const { + VQSort(keys, n, tag); + } + void operator()(uint32_t* HWY_RESTRICT keys, size_t n, + SortAscending tag) const { + VQSort(keys, n, tag); + } + void operator()(uint32_t* HWY_RESTRICT keys, size_t n, + SortDescending tag) const { + VQSort(keys, n, tag); + } + void operator()(uint64_t* HWY_RESTRICT keys, size_t n, + SortAscending tag) const { + VQSort(keys, n, tag); + } + void operator()(uint64_t* HWY_RESTRICT keys, size_t n, + SortDescending tag) const { + VQSort(keys, n, tag); + } + + void operator()(int16_t* HWY_RESTRICT keys, size_t n, + SortAscending tag) const { + VQSort(keys, n, tag); + } + void operator()(int16_t* HWY_RESTRICT keys, size_t n, + SortDescending tag) const { + VQSort(keys, n, tag); + } + void operator()(int32_t* HWY_RESTRICT keys, size_t n, + SortAscending tag) const { + VQSort(keys, n, tag); + } + void operator()(int32_t* HWY_RESTRICT keys, size_t n, + SortDescending tag) const { + VQSort(keys, n, tag); + } + void operator()(int64_t* HWY_RESTRICT keys, size_t n, + SortAscending tag) const { + VQSort(keys, n, tag); + } + void operator()(int64_t* HWY_RESTRICT keys, size_t n, + SortDescending tag) const { + VQSort(keys, n, tag); + } + + void operator()(float* HWY_RESTRICT keys, size_t n, SortAscending tag) const { + VQSort(keys, n, tag); + } + void 
operator()(float* HWY_RESTRICT keys, size_t n, + SortDescending tag) const { + VQSort(keys, n, tag); + } + void operator()(double* HWY_RESTRICT keys, size_t n, + SortAscending tag) const { + VQSort(keys, n, tag); + } + void operator()(double* HWY_RESTRICT keys, size_t n, + SortDescending tag) const { + VQSort(keys, n, tag); + } + + void operator()(uint128_t* HWY_RESTRICT keys, size_t n, + SortAscending tag) const { + VQSort(keys, n, tag); + } + void operator()(uint128_t* HWY_RESTRICT keys, size_t n, + SortDescending tag) const { + VQSort(keys, n, tag); + } + + void operator()(K64V64* HWY_RESTRICT keys, size_t n, + SortAscending tag) const { + VQSort(keys, n, tag); + } + void operator()(K64V64* HWY_RESTRICT keys, size_t n, + SortDescending tag) const { + VQSort(keys, n, tag); + } + + void operator()(K32V32* HWY_RESTRICT keys, size_t n, + SortAscending tag) const { + VQSort(keys, n, tag); + } + void operator()(K32V32* HWY_RESTRICT keys, size_t n, + SortDescending tag) const { + VQSort(keys, n, tag); + } + + // Unused + static void Fill24Bytes(const void*, size_t, void*) {} + static bool HaveFloat64() { return false; } + + private: + void Delete() {} + + template <typename T> + T* Get() const { + return nullptr; + } + +#if HWY_COMPILER_CLANG + HWY_DIAGNOSTICS(push) + HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wunused-private-field") +#endif + void* unused_ = nullptr; +#if HWY_COMPILER_CLANG + HWY_DIAGNOSTICS(pop) +#endif +}; + +// Internal use only +HWY_CONTRIB_DLLEXPORT uint64_t* GetGeneratorState(); + +} // namespace hwy + +#endif // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_ diff --git a/third_party/highway/hwy/contrib/sort/vqsort_128a.cc b/third_party/highway/hwy/contrib/sort/vqsort_128a.cc new file mode 100644 index 0000000000..9acd33375d --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_128a.cc @@ -0,0 +1,59 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); 
+// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128a.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits128-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void Sort128Asc(uint64_t* HWY_RESTRICT keys, size_t num) { +#if VQSORT_ENABLED + SortTag<uint64_t> d; + detail::SharedTraits<detail::Traits128<detail::OrderAscending128>> st; + Sort(d, st, keys, num); +#else + (void)keys; + (void)num; + HWY_ASSERT(0); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(Sort128Asc); +} // namespace + +void VQSort(uint128_t* HWY_RESTRICT keys, size_t n, SortAscending) { + HWY_DYNAMIC_DISPATCH(Sort128Asc) + (reinterpret_cast<uint64_t*>(keys), n * 2); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_128d.cc b/third_party/highway/hwy/contrib/sort/vqsort_128d.cc new file mode 100644 index 0000000000..633a1ef452 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_128d.cc @@ -0,0 +1,59 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance 
with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128d.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits128-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void Sort128Desc(uint64_t* HWY_RESTRICT keys, size_t num) { +#if VQSORT_ENABLED + SortTag<uint64_t> d; + detail::SharedTraits<detail::Traits128<detail::OrderDescending128>> st; + Sort(d, st, keys, num); +#else + (void)keys; + (void)num; + HWY_ASSERT(0); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(Sort128Desc); +} // namespace + +void VQSort(uint128_t* HWY_RESTRICT keys, size_t n, SortDescending) { + HWY_DYNAMIC_DISPATCH(Sort128Desc) + (reinterpret_cast<uint64_t*>(keys), n * 2); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_f32a.cc b/third_party/highway/hwy/contrib/sort/vqsort_f32a.cc new file mode 100644 index 0000000000..0018bcc580 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_f32a.cc @@ -0,0 +1,52 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortF32Asc(float* HWY_RESTRICT keys, size_t num) { + SortTag<float> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<float>>> st; + Sort(d, st, keys, num); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortF32Asc); +} // namespace + +void VQSort(float* HWY_RESTRICT keys, size_t n, SortAscending) { + HWY_DYNAMIC_DISPATCH(SortF32Asc)(keys, n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_f32d.cc b/third_party/highway/hwy/contrib/sort/vqsort_f32d.cc new file mode 100644 index 0000000000..fb974c592d --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_f32d.cc @@ -0,0 +1,52 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32d.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortF32Desc(float* HWY_RESTRICT keys, size_t num) { + SortTag<float> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<float>>> st; + Sort(d, st, keys, num); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortF32Desc); +} // namespace + +void VQSort(float* HWY_RESTRICT keys, size_t n, SortDescending) { + HWY_DYNAMIC_DISPATCH(SortF32Desc)(keys, n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_f64a.cc b/third_party/highway/hwy/contrib/sort/vqsort_f64a.cc new file mode 100644 index 0000000000..79c9712902 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_f64a.cc @@ -0,0 +1,58 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64a.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortF64Asc(double* HWY_RESTRICT keys, size_t num) { +#if HWY_HAVE_FLOAT64 + SortTag<double> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<double>>> st; + Sort(d, st, keys, num); +#else + (void)keys; + (void)num; + HWY_ASSERT(0); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortF64Asc); +} // namespace + +void VQSort(double* HWY_RESTRICT keys, size_t n, SortAscending) { + HWY_DYNAMIC_DISPATCH(SortF64Asc)(keys, n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_f64d.cc b/third_party/highway/hwy/contrib/sort/vqsort_f64d.cc new file mode 100644 index 0000000000..922878c407 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_f64d.cc @@ -0,0 +1,58 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64d.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortF64Desc(double* HWY_RESTRICT keys, size_t num) { +#if HWY_HAVE_FLOAT64 + SortTag<double> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<double>>> st; + Sort(d, st, keys, num); +#else + (void)keys; + (void)num; + HWY_ASSERT(0); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortF64Desc); +} // namespace + +void VQSort(double* HWY_RESTRICT keys, size_t n, SortDescending) { + HWY_DYNAMIC_DISPATCH(SortF64Desc)(keys, n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i16a.cc b/third_party/highway/hwy/contrib/sort/vqsort_i16a.cc new file mode 100644 index 0000000000..809827fba9 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_i16a.cc @@ -0,0 +1,52 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16a.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortI16Asc(int16_t* HWY_RESTRICT keys, size_t num) { + SortTag<int16_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int16_t>>> st; + Sort(d, st, keys, num); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortI16Asc); +} // namespace + +void VQSort(int16_t* HWY_RESTRICT keys, size_t n, SortAscending) { + HWY_DYNAMIC_DISPATCH(SortI16Asc)(keys, n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i16d.cc b/third_party/highway/hwy/contrib/sort/vqsort_i16d.cc new file mode 100644 index 0000000000..e168e3349c --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_i16d.cc @@ -0,0 +1,52 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16d.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortI16Desc(int16_t* HWY_RESTRICT keys, size_t num) { + SortTag<int16_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int16_t>>> st; + Sort(d, st, keys, num); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortI16Desc); +} // namespace + +void VQSort(int16_t* HWY_RESTRICT keys, size_t n, SortDescending) { + HWY_DYNAMIC_DISPATCH(SortI16Desc)(keys, n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i32a.cc b/third_party/highway/hwy/contrib/sort/vqsort_i32a.cc new file mode 100644 index 0000000000..df8d7e622d --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_i32a.cc @@ -0,0 +1,52 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32a.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortI32Asc(int32_t* HWY_RESTRICT keys, size_t num) { + SortTag<int32_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int32_t>>> st; + Sort(d, st, keys, num); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortI32Asc); +} // namespace + +void VQSort(int32_t* HWY_RESTRICT keys, size_t n, SortAscending) { + HWY_DYNAMIC_DISPATCH(SortI32Asc)(keys, n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i32d.cc b/third_party/highway/hwy/contrib/sort/vqsort_i32d.cc new file mode 100644 index 0000000000..5bf93e99c6 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_i32d.cc @@ -0,0 +1,52 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32d.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortI32Desc(int32_t* HWY_RESTRICT keys, size_t num) { + SortTag<int32_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int32_t>>> st; + Sort(d, st, keys, num); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortI32Desc); +} // namespace + +void VQSort(int32_t* HWY_RESTRICT keys, size_t n, SortDescending) { + HWY_DYNAMIC_DISPATCH(SortI32Desc)(keys, n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i64a.cc b/third_party/highway/hwy/contrib/sort/vqsort_i64a.cc new file mode 100644 index 0000000000..fb8ae90a01 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_i64a.cc @@ -0,0 +1,52 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64a.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortI64Asc(int64_t* HWY_RESTRICT keys, size_t num) { + SortTag<int64_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st; + Sort(d, st, keys, num); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortI64Asc); +} // namespace + +void VQSort(int64_t* HWY_RESTRICT keys, size_t n, SortAscending) { + HWY_DYNAMIC_DISPATCH(SortI64Asc)(keys, n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i64d.cc b/third_party/highway/hwy/contrib/sort/vqsort_i64d.cc new file mode 100644 index 0000000000..8605f0e483 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_i64d.cc @@ -0,0 +1,52 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64d.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortI64Desc(int64_t* HWY_RESTRICT keys, size_t num) { + SortTag<int64_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int64_t>>> st; + Sort(d, st, keys, num); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortI64Desc); +} // namespace + +void VQSort(int64_t* HWY_RESTRICT keys, size_t n, SortDescending) { + HWY_DYNAMIC_DISPATCH(SortI64Desc)(keys, n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_kv128a.cc b/third_party/highway/hwy/contrib/sort/vqsort_kv128a.cc new file mode 100644 index 0000000000..4c7f3f15e9 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_kv128a.cc @@ -0,0 +1,62 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +// clang-format off +// (avoid line break, which would prevent Copybara rules from matching) +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128a.cc" //NOLINT +// clang-format on +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits128-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortKV128Asc(uint64_t* HWY_RESTRICT keys, size_t num) { +#if VQSORT_ENABLED + SortTag<uint64_t> d; + detail::SharedTraits<detail::Traits128<detail::OrderAscendingKV128>> st; + Sort(d, st, keys, num); +#else + (void)keys; + (void)num; + HWY_ASSERT(0); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortKV128Asc); +} // namespace + +void VQSort(K64V64* HWY_RESTRICT keys, size_t n, SortAscending) { + HWY_DYNAMIC_DISPATCH(SortKV128Asc) + (reinterpret_cast<uint64_t*>(keys), n * 2); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_kv128d.cc b/third_party/highway/hwy/contrib/sort/vqsort_kv128d.cc new file mode 100644 index 0000000000..7b91dd94d3 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_kv128d.cc @@ -0,0 +1,62 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 
(the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +// clang-format off +// (avoid line break, which would prevent Copybara rules from matching) +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128d.cc" //NOLINT +// clang-format on +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits128-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortKV128Desc(uint64_t* HWY_RESTRICT keys, size_t num) { +#if VQSORT_ENABLED + SortTag<uint64_t> d; + detail::SharedTraits<detail::Traits128<detail::OrderDescendingKV128>> st; + Sort(d, st, keys, num); +#else + (void)keys; + (void)num; + HWY_ASSERT(0); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortKV128Desc); +} // namespace + +void VQSort(K64V64* HWY_RESTRICT keys, size_t n, SortDescending) { + HWY_DYNAMIC_DISPATCH(SortKV128Desc) + (reinterpret_cast<uint64_t*>(keys), n * 2); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_kv64a.cc b/third_party/highway/hwy/contrib/sort/vqsort_kv64a.cc new file mode 100644 index 0000000000..dd6886aefa --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_kv64a.cc @@ -0,0 +1,62 @@ +// Copyright 2022 Google LLC +// 
SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +// clang-format off +// (avoid line break, which would prevent Copybara rules from matching) +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64a.cc" //NOLINT +// clang-format on +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortKV64Asc(uint64_t* HWY_RESTRICT keys, size_t num) { +#if VQSORT_ENABLED + SortTag<uint64_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderAscendingKV64>> st; + Sort(d, st, keys, num); +#else + (void)keys; + (void)num; + HWY_ASSERT(0); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortKV64Asc); +} // namespace + +void VQSort(K32V32* HWY_RESTRICT keys, size_t n, SortAscending) { + HWY_DYNAMIC_DISPATCH(SortKV64Asc) + (reinterpret_cast<uint64_t*>(keys), n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_kv64d.cc b/third_party/highway/hwy/contrib/sort/vqsort_kv64d.cc new file mode 100644 index 0000000000..091492f065 --- /dev/null +++ 
b/third_party/highway/hwy/contrib/sort/vqsort_kv64d.cc @@ -0,0 +1,62 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +// clang-format off +// (avoid line break, which would prevent Copybara rules from matching) +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64d.cc" //NOLINT +// clang-format on +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortKV64Desc(uint64_t* HWY_RESTRICT keys, size_t num) { +#if VQSORT_ENABLED + SortTag<uint64_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderDescendingKV64>> st; + Sort(d, st, keys, num); +#else + (void)keys; + (void)num; + HWY_ASSERT(0); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortKV64Desc); +} // namespace + +void VQSort(K32V32* HWY_RESTRICT keys, size_t n, SortDescending) { + HWY_DYNAMIC_DISPATCH(SortKV64Desc) + (reinterpret_cast<uint64_t*>(keys), n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u16a.cc 
b/third_party/highway/hwy/contrib/sort/vqsort_u16a.cc new file mode 100644 index 0000000000..492cfd49b9 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_u16a.cc @@ -0,0 +1,52 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16a.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortU16Asc(uint16_t* HWY_RESTRICT keys, size_t num) { + SortTag<uint16_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint16_t>>> st; + Sort(d, st, keys, num); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortU16Asc); +} // namespace + +void VQSort(uint16_t* HWY_RESTRICT keys, size_t n, SortAscending) { + HWY_DYNAMIC_DISPATCH(SortU16Asc)(keys, n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u16d.cc b/third_party/highway/hwy/contrib/sort/vqsort_u16d.cc new file mode 100644 index 0000000000..1e33220d3c --- /dev/null +++ 
b/third_party/highway/hwy/contrib/sort/vqsort_u16d.cc @@ -0,0 +1,53 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16d.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortU16Desc(uint16_t* HWY_RESTRICT keys, size_t num) { + SortTag<uint16_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint16_t>>> + st; + Sort(d, st, keys, num); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortU16Desc); +} // namespace + +void VQSort(uint16_t* HWY_RESTRICT keys, size_t n, SortDescending) { + HWY_DYNAMIC_DISPATCH(SortU16Desc)(keys, n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u32a.cc b/third_party/highway/hwy/contrib/sort/vqsort_u32a.cc new file mode 100644 index 0000000000..f2be8753c6 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_u32a.cc @@ -0,0 +1,52 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// 
Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32a.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortU32Asc(uint32_t* HWY_RESTRICT keys, size_t num) { + SortTag<uint32_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint32_t>>> st; + Sort(d, st, keys, num); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortU32Asc); +} // namespace + +void VQSort(uint32_t* HWY_RESTRICT keys, size_t n, SortAscending) { + HWY_DYNAMIC_DISPATCH(SortU32Asc)(keys, n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u32d.cc b/third_party/highway/hwy/contrib/sort/vqsort_u32d.cc new file mode 100644 index 0000000000..0caf695689 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_u32d.cc @@ -0,0 +1,53 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32d.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortU32Desc(uint32_t* HWY_RESTRICT keys, size_t num) { + SortTag<uint32_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint32_t>>> + st; + Sort(d, st, keys, num); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortU32Desc); +} // namespace + +void VQSort(uint32_t* HWY_RESTRICT keys, size_t n, SortDescending) { + HWY_DYNAMIC_DISPATCH(SortU32Desc)(keys, n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u64a.cc b/third_party/highway/hwy/contrib/sort/vqsort_u64a.cc new file mode 100644 index 0000000000..758f1f4c80 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_u64a.cc @@ -0,0 +1,52 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64a.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortU64Asc(uint64_t* HWY_RESTRICT keys, size_t num) { + SortTag<uint64_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint64_t>>> st; + Sort(d, st, keys, num); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortU64Asc); +} // namespace + +void VQSort(uint64_t* HWY_RESTRICT keys, size_t n, SortAscending) { + HWY_DYNAMIC_DISPATCH(SortU64Asc)(keys, n); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u64d.cc b/third_party/highway/hwy/contrib/sort/vqsort_u64d.cc new file mode 100644 index 0000000000..6c34fbed9a --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/vqsort_u64d.cc @@ -0,0 +1,53 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64d.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +void SortU64Desc(uint64_t* HWY_RESTRICT keys, size_t num) { + SortTag<uint64_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint64_t>>> + st; + Sort(d, st, keys, num); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortU64Desc); +} // namespace + +void VQSort(uint64_t* HWY_RESTRICT keys, size_t n, SortDescending) { + HWY_DYNAMIC_DISPATCH(SortU64Desc)(keys, n); +} + +} // namespace hwy +#endif // HWY_ONCE |