author    Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
commit    26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree      f435a8308119effd964b339f76abb83a57c29483 /third_party/highway/hwy/contrib/sort
parent    Initial commit. (diff)
Adding upstream version 124.0.1. (tag: upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/highway/hwy/contrib/sort')
-rw-r--r-- third_party/highway/hwy/contrib/sort/BUILD                    |  210
-rw-r--r-- third_party/highway/hwy/contrib/sort/README.md                |  109
-rw-r--r-- third_party/highway/hwy/contrib/sort/algo-inl.h               |  553
-rw-r--r-- third_party/highway/hwy/contrib/sort/bench_parallel.cc        |  238
-rw-r--r-- third_party/highway/hwy/contrib/sort/bench_sort.cc            |  367
-rw-r--r-- third_party/highway/hwy/contrib/sort/print_network.cc         |   90
-rw-r--r-- third_party/highway/hwy/contrib/sort/result-inl.h             |  140
-rw-r--r-- third_party/highway/hwy/contrib/sort/shared-inl.h             |  154
-rw-r--r-- third_party/highway/hwy/contrib/sort/sort_test.cc             |  650
-rw-r--r-- third_party/highway/hwy/contrib/sort/sorting_networks-inl.h   |  898
-rw-r--r-- third_party/highway/hwy/contrib/sort/traits-inl.h             |  561
-rw-r--r-- third_party/highway/hwy/contrib/sort/traits128-inl.h          |  529
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort-inl.h             | 1724
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort.cc                |  124
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort.h                 |  221
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_128a.cc           |   59
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_128d.cc           |   59
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_f32a.cc           |   52
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_f32d.cc           |   52
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_f64a.cc           |   58
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_f64d.cc           |   58
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_i16a.cc           |   52
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_i16d.cc           |   52
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_i32a.cc           |   52
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_i32d.cc           |   52
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_i64a.cc           |   52
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_i64d.cc           |   52
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_kv128a.cc         |   62
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_kv128d.cc         |   62
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_kv64a.cc          |   62
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_kv64d.cc          |   62
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_u16a.cc           |   52
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_u16d.cc           |   53
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_u32a.cc           |   52
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_u32d.cc           |   53
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_u64a.cc           |   52
-rw-r--r-- third_party/highway/hwy/contrib/sort/vqsort_u64d.cc           |   53
37 files changed, 7781 insertions(+), 0 deletions(-)
diff --git a/third_party/highway/hwy/contrib/sort/BUILD b/third_party/highway/hwy/contrib/sort/BUILD
new file mode 100644
index 0000000000..dc15341908
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/BUILD
@@ -0,0 +1,210 @@
+package(
+ default_applicable_licenses = ["//:license"],
+ default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])
+
+# Unused on Bazel builds, where this is not defined/known; Copybara replaces
+# usages with an empty list.
+COMPAT = [
+ "//buildenv/target:non_prod", # includes mobile/vendor.
+]
+
+cc_library(
+ name = "intel",
+ # hdrs = select({
+ # "//third_party/bazel_platforms/cpu:x86_64": [
+ # "avx512-16bit-common.h",
+ # "avx512-16bit-qsort.hpp",
+ # "avx512-32bit-qsort.hpp",
+ # "avx512-64bit-common.h",
+ # "avx512-64bit-qsort.hpp",
+ # "avx512-common-qsort.h",
+ # ],
+ # "//conditions:default": [],
+ # }),
+ compatible_with = [],
+)
+
+cc_library(
+ name = "vxsort",
+ srcs = [
+ # "vxsort/isa_detection.cpp",
+ # "vxsort/isa_detection_msvc.cpp",
+ # "vxsort/isa_detection_sane.cpp",
+ # "vxsort/machine_traits.avx2.cpp",
+ # "vxsort/smallsort/avx2_load_mask_tables.cpp",
+ # "vxsort/smallsort/bitonic_sort.AVX2.double.generated.cpp",
+ # "vxsort/smallsort/bitonic_sort.AVX2.float.generated.cpp",
+ # "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp",
+ # "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp",
+ # "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.cpp",
+ # "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.cpp",
+ # "vxsort/smallsort/bitonic_sort.AVX512.double.generated.cpp",
+ # "vxsort/smallsort/bitonic_sort.AVX512.float.generated.cpp",
+ # "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp",
+ # "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp",
+ # "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.cpp",
+ # "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.cpp",
+ # "vxsort/vxsort_stats.cpp",
+ ],
+ hdrs = [
+ # "vxsort/alignment.h",
+ # "vxsort/defs.h",
+ # "vxsort/isa_detection.h",
+ # "vxsort/machine_traits.avx2.h",
+ # "vxsort/machine_traits.avx512.h",
+ # "vxsort/machine_traits.h",
+ # "vxsort/packer.h",
+ # "vxsort/smallsort/bitonic_sort.AVX2.double.generated.h",
+ # "vxsort/smallsort/bitonic_sort.AVX2.float.generated.h",
+ # "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h",
+ # "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h",
+ # "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.h",
+ # "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.h",
+ # "vxsort/smallsort/bitonic_sort.AVX512.double.generated.h",
+ # "vxsort/smallsort/bitonic_sort.AVX512.float.generated.h",
+ # "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h",
+ # "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h",
+ # "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.h",
+ # "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.h",
+ # "vxsort/smallsort/bitonic_sort.h",
+ # "vxsort/vxsort.h",
+ # "vxsort/vxsort_stats.h",
+ ],
+ compatible_with = [],
+ textual_hdrs = [
+ # "vxsort/vxsort_targets_disable.h",
+ # "vxsort/vxsort_targets_enable_avx2.h",
+ # "vxsort/vxsort_targets_enable_avx512.h",
+ ],
+)
+
+cc_library(
+ name = "vqsort",
+ srcs = [
+ # Split into separate files to reduce MSVC build time.
+ "vqsort.cc",
+ "vqsort_128a.cc",
+ "vqsort_128d.cc",
+ "vqsort_f32a.cc",
+ "vqsort_f32d.cc",
+ "vqsort_f64a.cc",
+ "vqsort_f64d.cc",
+ "vqsort_i16a.cc",
+ "vqsort_i16d.cc",
+ "vqsort_i32a.cc",
+ "vqsort_i32d.cc",
+ "vqsort_i64a.cc",
+ "vqsort_i64d.cc",
+ "vqsort_kv64a.cc",
+ "vqsort_kv64d.cc",
+ "vqsort_kv128a.cc",
+ "vqsort_kv128d.cc",
+ "vqsort_u16a.cc",
+ "vqsort_u16d.cc",
+ "vqsort_u32a.cc",
+ "vqsort_u32d.cc",
+ "vqsort_u64a.cc",
+ "vqsort_u64d.cc",
+ ],
+ hdrs = [
+ "vqsort.h", # public interface
+ ],
+ compatible_with = [],
+ local_defines = ["hwy_contrib_EXPORTS"],
+ textual_hdrs = [
+ "shared-inl.h",
+ "sorting_networks-inl.h",
+ "traits-inl.h",
+ "traits128-inl.h",
+ "vqsort-inl.h",
+ # Placeholder for internal instrumentation. Do not remove.
+ ],
+ deps = [
+ ":intel", # required if HAVE_INTEL
+ ":vxsort", # required if HAVE_VXSORT
+ "//:hwy",
+ ],
+)
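+
+# Example (assumed downstream usage): binaries depend on the sorter via
+#   deps = ["//hwy/contrib/sort:vqsort"],
+# and include "hwy/contrib/sort/vqsort.h".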
+
+# -----------------------------------------------------------------------------
+# Internal-only targets
+
+cc_library(
+ name = "helpers",
+ testonly = 1,
+ textual_hdrs = [
+ "algo-inl.h",
+ "result-inl.h",
+ ],
+ deps = [
+ ":vqsort",
+ "//:nanobenchmark",
+ # Required for HAVE_PDQSORT, but that is unused and this is
+ # unavailable to Bazel builds, hence commented out.
+ # "//third_party/boost/allowed",
+ # Avoid ips4o and thus TBB to work around hwloc build failure.
+ ],
+)
+
+cc_binary(
+ name = "print_network",
+ testonly = 1,
+ srcs = ["print_network.cc"],
+ deps = [
+ ":helpers",
+ ":vqsort",
+ "//:hwy",
+ ],
+)
+
+cc_test(
+ name = "sort_test",
+ size = "medium",
+ srcs = ["sort_test.cc"],
+ # Do not enable fully_static_link (pthread crash on bazel)
+ local_defines = ["HWY_IS_TEST"],
+ # for test_suite.
+ tags = ["hwy_ops_test"],
+ deps = [
+ ":helpers",
+ ":vqsort",
+ "@com_google_googletest//:gtest_main",
+ "//:hwy",
+ "//:hwy_test_util",
+ ],
+)
+
+cc_test(
+ name = "bench_sort",
+ size = "medium",
+ srcs = ["bench_sort.cc"],
+ # Do not enable fully_static_link (pthread crash on bazel)
+ local_defines = ["HWY_IS_TEST"],
+ # for test_suite.
+ tags = ["hwy_ops_test"],
+ deps = [
+ ":helpers",
+ ":vqsort",
+ "@com_google_googletest//:gtest_main",
+ "//:hwy",
+ "//:hwy_test_util",
+ ],
+)
+
+cc_binary(
+ name = "bench_parallel",
+ testonly = 1,
+ srcs = ["bench_parallel.cc"],
+ # Do not enable fully_static_link (pthread crash on bazel)
+ local_defines = ["HWY_IS_TEST"],
+ deps = [
+ ":helpers",
+ ":vqsort",
+ "@com_google_googletest//:gtest_main",
+ "//:hwy",
+ "//:hwy_test_util",
+ ],
+)
diff --git a/third_party/highway/hwy/contrib/sort/README.md b/third_party/highway/hwy/contrib/sort/README.md
new file mode 100644
index 0000000000..46047e6359
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/README.md
@@ -0,0 +1,109 @@
+# Vectorized and performance-portable Quicksort
+
+## Introduction
+
+As of 2022-06-07 this sorts large arrays of built-in types about ten times as
+fast as `std::sort`. See also our
+[blog post](https://opensource.googleblog.com/2022/06/Vectorized%20and%20performance%20portable%20Quicksort.html)
+and [paper](https://arxiv.org/abs/2205.05982).
+
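+The public entry point is `VQSort`, declared in `vqsort.h`. A minimal usage
+sketch (assuming the `vqsort` library target is linked in):
+
+```
+#include <stddef.h>
+#include <stdint.h>
+
+#include "hwy/contrib/sort/vqsort.h"
+
+// Sorts `keys` in place. Overloads exist for the other built-in key types,
+// and hwy::SortDescending() selects descending order.
+void SortKeys(int64_t* keys, size_t num) {
+  hwy::VQSort(keys, num, hwy::SortAscending());
+}
+```
+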
+## Instructions
+
+Here are instructions for reproducing our results via cross-platform CMake, on
+Linux with Bazel, or on AWS Graviton3 (NEON, SVE).
+
+### CMake, any platform
+
+Please first ensure that Clang (tested with 13.0.1 and 15.0.6) is installed, and
+if it is not the default compiler, point the CC and CXX environment variables to
+it, e.g.
+
+```
+export CC=clang-15
+export CXX=clang++-15
+```
+
+Then run the usual CMake workflow, also documented in the Highway README, e.g.:
+
+```
+mkdir -p build && cd build && cmake .. && make -j
+taskset -c 2 tests/bench_sort
+```
+
+The optional `taskset -c 2` part reduces the variability of measurements by
+preventing the OS from migrating the benchmark between cores.
+
+### Linux
+
+Please first ensure that Go (golang) and Clang (tested with 13.0.1) are
+installed via your system's package manager.
+
+```
+go install github.com/bazelbuild/bazelisk@latest
+git clone https://github.com/google/highway
+cd highway
+CC=clang CXX=clang++ ~/go/bin/bazelisk build -c opt hwy/contrib/sort:all
+bazel-bin/hwy/contrib/sort/sort_test
+bazel-bin/hwy/contrib/sort/bench_sort
+```
+
+### AWS Graviton3
+
+Instance config: Amazon Linux 5.10 arm64, c7g.8xlarge (the largest allowed
+config is 32 vCPUs). The initial launch will fail; wait a few minutes for an
+email saying the config is verified, then re-launch. Find the IPv4 hostname in
+the list of instances.
+
+`ssh -i /path/key.pem ec2-user@hostname`
+
+Note that the AWS CMake package is too old to build LLVM, so we first build a
+newer CMake:
+```
+wget https://cmake.org/files/v3.23/cmake-3.23.2.tar.gz
+tar -xvzf cmake-3.23.2.tar.gz && cd cmake-3.23.2/
+./bootstrap -- -DCMAKE_USE_OPENSSL=OFF
+make -j8 && sudo make install
+cd ..
+```
+
+AWS Clang is at version 11.1, which generates unnecessary `AND` instructions
+that slow down the sort by 1.15x. We instead tested with Clang trunk as of June
+13 (which reports Git hash 8f6512fea000c3a0d394864bb94e524bee375069). To build:
+
+```
+git clone --depth 1 https://github.com/llvm/llvm-project.git
+cd llvm-project
+mkdir -p build && cd build
+/usr/local/bin/cmake ../llvm -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" -DCMAKE_BUILD_TYPE=Release
+make -j32 && sudo make install
+```
+
+```
+sudo yum install go
+go install github.com/bazelbuild/bazelisk@latest
+git clone https://github.com/google/highway
+cd highway
+CC=/usr/local/bin/clang CXX=/usr/local/bin/clang++ ~/go/bin/bazelisk build -c opt --copt=-march=armv8.2-a+sve hwy/contrib/sort:all
+bazel-bin/hwy/contrib/sort/sort_test
+bazel-bin/hwy/contrib/sort/bench_sort
+```
+
+The above command line enables SVE, which is currently only available on
+Graviton 3. You can also test NEON on the same processor, or on other Arm CPUs,
+by changing the option to `--copt=-march=armv8.2-a+crypto`. Note that such
+flags will become unnecessary once Clang supports `#pragma target` for NEON and
+SVE intrinsics, as it does for x86.
+
+## Results
+
+`bench_sort` outputs the instruction set (AVX3 refers to AVX-512), the sort
+algorithm (std for `std::sort`, vq for our vqsort), the type of keys being
+sorted (f32 is float), the distribution of keys (uniform32 for uniform random
+with range 0-2^32), the number of keys, then the throughput of sorted keys (i.e.
+number of key bytes output per second).
+
+Example excerpt from Xeon 6154 (Skylake-X) CPU clocked at 3 GHz:
+
+```
+[ RUN ] BenchSortGroup/BenchSort.BenchAllSort/AVX3
+ AVX3: std: f32: uniform32: 1.00E+06 54 MB/s ( 1 threads)
+ AVX3: vq: f32: uniform32: 1.00E+06 1143 MB/s ( 1 threads)
+```
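+
+For example, 1143 MB/s with 4-byte f32 keys corresponds to roughly 286 million
+keys per second, about 21 times the `std::sort` throughput shown above.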
diff --git a/third_party/highway/hwy/contrib/sort/algo-inl.h b/third_party/highway/hwy/contrib/sort/algo-inl.h
new file mode 100644
index 0000000000..546843e101
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/algo-inl.h
@@ -0,0 +1,553 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Normal include guard for target-independent parts
+#ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
+#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
+
+#include <stdint.h>
+#include <string.h> // memcpy
+
+#include <algorithm> // std::sort, std::min, std::max
+#include <functional> // std::less, std::greater
+#include <vector>
+
+#include "hwy/base.h"
+#include "hwy/contrib/sort/vqsort.h"
+#include "hwy/print.h"
+
+// Third-party algorithms
+#define HAVE_AVX2SORT 0
+#define HAVE_IPS4O 0
+// When enabling, consider changing max_threads (required for Table 1a)
+#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
+#define HAVE_PDQSORT 0
+#define HAVE_SORT512 0
+#define HAVE_VXSORT 0
+#if HWY_ARCH_X86
+#define HAVE_INTEL 0
+#else
+#define HAVE_INTEL 0
+#endif
+
+#if HAVE_PARALLEL_IPS4O
+#include <thread> // NOLINT
+#endif
+
+#if HAVE_AVX2SORT
+HWY_PUSH_ATTRIBUTES("avx2,avx")
+#include "avx2sort.h" //NOLINT
+HWY_POP_ATTRIBUTES
+#endif
+#if HAVE_IPS4O || HAVE_PARALLEL_IPS4O
+#include "third_party/ips4o/include/ips4o.hpp"
+#include "third_party/ips4o/include/ips4o/thread_pool.hpp"
+#endif
+#if HAVE_PDQSORT
+#include "third_party/boost/allowed/sort/sort.hpp"
+#endif
+#if HAVE_SORT512
+#include "sort512.h" //NOLINT
+#endif
+
+// vxsort is difficult to compile for multiple targets because it also uses
+// .cpp files, and we'd also have to #undef its include guards. Instead, compile
+// only for AVX2 or AVX3 depending on this macro.
+#define VXSORT_AVX3 1
+#if HAVE_VXSORT
+// inlined from vxsort_targets_enable_avx512 (must close before end of header)
+#ifdef __GNUC__
+#ifdef __clang__
+#if VXSORT_AVX3
+#pragma clang attribute push(__attribute__((target("avx512f,avx512dq"))), \
+ apply_to = any(function))
+#else
+#pragma clang attribute push(__attribute__((target("avx2"))), \
+ apply_to = any(function))
+#endif // VXSORT_AVX3
+
+#else
+#pragma GCC push_options
+#if VXSORT_AVX3
+#pragma GCC target("avx512f,avx512dq")
+#else
+#pragma GCC target("avx2")
+#endif // VXSORT_AVX3
+#endif
+#endif
+
+#if VXSORT_AVX3
+#include "vxsort/machine_traits.avx512.h"
+#else
+#include "vxsort/machine_traits.avx2.h"
+#endif // VXSORT_AVX3
+#include "vxsort/vxsort.h"
+#ifdef __GNUC__
+#ifdef __clang__
+#pragma clang attribute pop
+#else
+#pragma GCC pop_options
+#endif
+#endif
+#endif // HAVE_VXSORT
+
+namespace hwy {
+
+enum class Dist { kUniform8, kUniform16, kUniform32 };
+
+static inline std::vector<Dist> AllDist() {
+ return {/*Dist::kUniform8, Dist::kUniform16,*/ Dist::kUniform32};
+}
+
+static inline const char* DistName(Dist dist) {
+ switch (dist) {
+ case Dist::kUniform8:
+ return "uniform8";
+ case Dist::kUniform16:
+ return "uniform16";
+ case Dist::kUniform32:
+ return "uniform32";
+ }
+ return "unreachable";
+}
+
+template <typename T>
+class InputStats {
+ public:
+ void Notify(T value) {
+ min_ = std::min(min_, value);
+ max_ = std::max(max_, value);
+ // Converting to integer would truncate floats, multiplying to save digits
+ // risks overflow especially when casting, so instead take the sum of the
+ // bit representations as the checksum.
+ uint64_t bits = 0;
+ static_assert(sizeof(T) <= 8, "Expected a built-in type");
+ CopyBytes<sizeof(T)>(&value, &bits); // value may be narrower than bits
+ sum_ += bits;
+ count_ += 1;
+ }
+
+ bool operator==(const InputStats& other) const {
+ char type_name[100];
+ detail::TypeName(hwy::detail::MakeTypeInfo<T>(), 1, type_name);
+
+ if (count_ != other.count_) {
+ HWY_ABORT("Sort %s: count %d vs %d\n", type_name,
+ static_cast<int>(count_), static_cast<int>(other.count_));
+ }
+
+ if (min_ != other.min_ || max_ != other.max_) {
+ HWY_ABORT("Sort %s: minmax %f/%f vs %f/%f\n", type_name,
+ static_cast<double>(min_), static_cast<double>(max_),
+ static_cast<double>(other.min_),
+ static_cast<double>(other.max_));
+ }
+
+ // Sum helps detect duplicated/lost values
+ if (sum_ != other.sum_) {
+ HWY_ABORT("Sort %s: Sum mismatch %g %g; min %g max %g\n", type_name,
+ static_cast<double>(sum_), static_cast<double>(other.sum_),
+ static_cast<double>(min_), static_cast<double>(max_));
+ }
+
+ return true;
+ }
+
+ private:
+ T min_ = hwy::HighestValue<T>();
+ T max_ = hwy::LowestValue<T>();
+ uint64_t sum_ = 0;
+ size_t count_ = 0;
+};
+
+enum class Algo {
+#if HAVE_INTEL
+ kIntel,
+#endif
+#if HAVE_AVX2SORT
+ kSEA,
+#endif
+#if HAVE_IPS4O
+ kIPS4O,
+#endif
+#if HAVE_PARALLEL_IPS4O
+ kParallelIPS4O,
+#endif
+#if HAVE_PDQSORT
+ kPDQ,
+#endif
+#if HAVE_SORT512
+ kSort512,
+#endif
+#if HAVE_VXSORT
+ kVXSort,
+#endif
+ kStd,
+ kVQSort,
+ kHeap,
+};
+
+static inline const char* AlgoName(Algo algo) {
+ switch (algo) {
+#if HAVE_INTEL
+ case Algo::kIntel:
+ return "intel";
+#endif
+#if HAVE_AVX2SORT
+ case Algo::kSEA:
+ return "sea";
+#endif
+#if HAVE_IPS4O
+ case Algo::kIPS4O:
+ return "ips4o";
+#endif
+#if HAVE_PARALLEL_IPS4O
+ case Algo::kParallelIPS4O:
+ return "par_ips4o";
+#endif
+#if HAVE_PDQSORT
+ case Algo::kPDQ:
+ return "pdq";
+#endif
+#if HAVE_SORT512
+ case Algo::kSort512:
+ return "sort512";
+#endif
+#if HAVE_VXSORT
+ case Algo::kVXSort:
+ return "vxsort";
+#endif
+ case Algo::kStd:
+ return "std";
+ case Algo::kVQSort:
+ return "vq";
+ case Algo::kHeap:
+ return "heap";
+ }
+ return "unreachable";
+}
+
+} // namespace hwy
+#endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
+
+// Per-target
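+// (This toggle idiom lets foreach_target.h re-include the file once per
+// target: HWY_TARGET_TOGGLE flips before each re-inclusion, and the local
+// toggle below flips in lockstep, so the guard opens exactly once each time.)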
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == \
+ defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
+#endif
+
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h" // HeapSort
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+
+// Requires target pragma set by HWY_BEFORE_NAMESPACE
+#if HAVE_INTEL && HWY_TARGET <= HWY_AVX3
+// #include "avx512-16bit-qsort.hpp" // requires vbmi2
+#include "avx512-32bit-qsort.hpp"
+#include "avx512-64bit-qsort.hpp"
+#endif
+
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+#if HAVE_INTEL // only supports ascending order
+template <typename T>
+using OtherOrder = detail::OrderAscending<T>;
+#else
+template <typename T>
+using OtherOrder = detail::OrderDescending<T>;
+#endif
+
+class Xorshift128Plus {
+ static HWY_INLINE uint64_t SplitMix64(uint64_t z) {
+ z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
+ z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
+ return z ^ (z >> 31);
+ }
+
+ public:
+ // Generates two vectors of 64-bit seeds via SplitMix64 and stores into
+ // `seeds`. Generating these afresh in each ChoosePivot is too expensive.
+ template <class DU64>
+ static void GenerateSeeds(DU64 du64, TFromD<DU64>* HWY_RESTRICT seeds) {
+ seeds[0] = SplitMix64(0x9E3779B97F4A7C15ull);
+ for (size_t i = 1; i < 2 * Lanes(du64); ++i) {
+ seeds[i] = SplitMix64(seeds[i - 1]);
+ }
+ }
+
+ // Need to pass in the state because vectors cannot be class members.
+ template <class VU64>
+ static VU64 RandomBits(VU64& state0, VU64& state1) {
+ VU64 s1 = state0;
+ VU64 s0 = state1;
+ const VU64 bits = Add(s1, s0);
+ state0 = s0;
+ s1 = Xor(s1, ShiftLeft<23>(s1));
+ state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
+ return bits;
+ }
+};
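+
+// For reference, each u64 lane above evolves independently per the scalar
+// xorshift128+ step (a model of RandomBits, not used by the code):
+//   uint64_t s1 = state0, s0 = state1;
+//   const uint64_t bits = s1 + s0;
+//   state0 = s0;
+//   s1 ^= s1 << 23;
+//   state1 = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5);
+//   return bits;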
+
+template <class D, class VU64, HWY_IF_NOT_FLOAT_D(D)>
+Vec<D> RandomValues(D d, VU64& s0, VU64& s1, const VU64 mask) {
+ const VU64 bits = Xorshift128Plus::RandomBits(s0, s1);
+ return BitCast(d, And(bits, mask));
+}
+
+// It is important to avoid denormals, which are flushed to zero by SIMD but not
+// scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD.
+template <class DF, class VU64, HWY_IF_FLOAT_D(DF)>
+Vec<DF> RandomValues(DF df, VU64& s0, VU64& s1, const VU64 mask) {
+ using TF = TFromD<DF>;
+ const RebindToUnsigned<decltype(df)> du;
+ using VU = Vec<decltype(du)>;
+
+ const VU64 bits64 = And(Xorshift128Plus::RandomBits(s0, s1), mask);
+
+#if HWY_TARGET == HWY_SCALAR // Cannot repartition u64 to smaller types
+ using TU = MakeUnsigned<TF>;
+ const VU bits = Set(du, static_cast<TU>(GetLane(bits64) & LimitsMax<TU>()));
+#else
+ const VU bits = BitCast(du, bits64);
+#endif
+ // Avoid NaN/denormal by only generating values in [1, 2), i.e. random
+ // mantissas with the exponent taken from the representation of 1.0. For
+ // float, 1.0f is 0x3F800000, so OR-ing random mantissa bits into it yields
+ // bit patterns in [0x3F800000, 0x3FFFFFFF], i.e. values in [1.0f, 2.0f).
+ const VU k1 = BitCast(du, Set(df, TF{1.0}));
+ const VU mantissa_mask = Set(du, MantissaMask<TF>());
+ const VU representation = OrAnd(k1, bits, mantissa_mask);
+ return BitCast(df, representation);
+}
+
+template <class DU64>
+Vec<DU64> MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t) {
+ switch (sizeof_t) {
+ case 2:
+ return Set(du64, (dist == Dist::kUniform8) ? 0x00FF00FF00FF00FFull
+ : 0xFFFFFFFFFFFFFFFFull);
+ case 4:
+ return Set(du64, (dist == Dist::kUniform8) ? 0x000000FF000000FFull
+ : (dist == Dist::kUniform16) ? 0x0000FFFF0000FFFFull
+ : 0xFFFFFFFFFFFFFFFFull);
+ case 8:
+ return Set(du64, (dist == Dist::kUniform8) ? 0x00000000000000FFull
+ : (dist == Dist::kUniform16) ? 0x000000000000FFFFull
+ : 0x00000000FFFFFFFFull);
+ default:
+ HWY_ABORT("Logic error");
+ return Zero(du64);
+ }
+}
+
+template <typename T>
+InputStats<T> GenerateInput(const Dist dist, T* v, size_t num) {
+ SortTag<uint64_t> du64;
+ using VU64 = Vec<decltype(du64)>;
+ const size_t N64 = Lanes(du64);
+ auto seeds = hwy::AllocateAligned<uint64_t>(2 * N64);
+ Xorshift128Plus::GenerateSeeds(du64, seeds.get());
+ VU64 s0 = Load(du64, seeds.get());
+ VU64 s1 = Load(du64, seeds.get() + N64);
+
+#if HWY_TARGET == HWY_SCALAR
+ const Sisd<T> d;
+#else
+ const Repartition<T, decltype(du64)> d;
+#endif
+ using V = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU64 mask = MaskForDist(du64, dist, sizeof(T));
+ auto buf = hwy::AllocateAligned<T>(N);
+
+ size_t i = 0;
+ for (; i + N <= num; i += N) {
+ const V values = RandomValues(d, s0, s1, mask);
+ StoreU(values, d, v + i);
+ }
+ if (i < num) {
+ const V values = RandomValues(d, s0, s1, mask);
+ StoreU(values, d, buf.get());
+ memcpy(v + i, buf.get(), (num - i) * sizeof(T));
+ }
+
+ InputStats<T> input_stats;
+ for (size_t i = 0; i < num; ++i) {
+ input_stats.Notify(v[i]);
+ }
+ return input_stats;
+}
+
+struct SharedState {
+#if HAVE_PARALLEL_IPS4O
+ const unsigned max_threads = hwy::LimitsMax<unsigned>(); // 16 for Table 1a
+ ips4o::StdThreadPool pool{static_cast<int>(
+ HWY_MIN(max_threads, std::thread::hardware_concurrency() / 2))};
+#endif
+};
+
+// Bridge from keys (passed to Run) to lanes as expected by HeapSort. For
+// non-128-bit keys they are the same:
+template <class Order, typename KeyType, HWY_IF_NOT_T_SIZE(KeyType, 16)>
+void CallHeapSort(KeyType* HWY_RESTRICT keys, const size_t num_keys) {
+ using detail::TraitsLane;
+ using detail::SharedTraits;
+ if (Order().IsAscending()) {
+ const SharedTraits<TraitsLane<detail::OrderAscending<KeyType>>> st;
+ return detail::HeapSort(st, keys, num_keys);
+ } else {
+ const SharedTraits<TraitsLane<detail::OrderDescending<KeyType>>> st;
+ return detail::HeapSort(st, keys, num_keys);
+ }
+}
+
+#if VQSORT_ENABLED
+template <class Order>
+void CallHeapSort(hwy::uint128_t* HWY_RESTRICT keys, const size_t num_keys) {
+ using detail::SharedTraits;
+ using detail::Traits128;
+ uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
+ const size_t num_lanes = num_keys * 2;
+ if (Order().IsAscending()) {
+ const SharedTraits<Traits128<detail::OrderAscending128>> st;
+ return detail::HeapSort(st, lanes, num_lanes);
+ } else {
+ const SharedTraits<Traits128<detail::OrderDescending128>> st;
+ return detail::HeapSort(st, lanes, num_lanes);
+ }
+}
+
+template <class Order>
+void CallHeapSort(K64V64* HWY_RESTRICT keys, const size_t num_keys) {
+ using detail::SharedTraits;
+ using detail::Traits128;
+ uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
+ const size_t num_lanes = num_keys * 2;
+ if (Order().IsAscending()) {
+ const SharedTraits<Traits128<detail::OrderAscendingKV128>> st;
+ return detail::HeapSort(st, lanes, num_lanes);
+ } else {
+ const SharedTraits<Traits128<detail::OrderDescendingKV128>> st;
+ return detail::HeapSort(st, lanes, num_lanes);
+ }
+}
+#endif // VQSORT_ENABLED
+
+template <class Order, typename KeyType>
+void Run(Algo algo, KeyType* HWY_RESTRICT inout, size_t num,
+ SharedState& shared, size_t /*thread*/) {
+ const std::less<KeyType> less;
+ const std::greater<KeyType> greater;
+
+#if !HAVE_PARALLEL_IPS4O
+ (void)shared;
+#endif
+
+ switch (algo) {
+#if HAVE_INTEL && HWY_TARGET <= HWY_AVX3
+ case Algo::kIntel:
+ return avx512_qsort<KeyType>(inout, static_cast<int64_t>(num));
+#endif
+
+#if HAVE_AVX2SORT
+ case Algo::kSEA:
+ return avx2::quicksort(inout, static_cast<int>(num));
+#endif
+
+#if HAVE_IPS4O
+ case Algo::kIPS4O:
+ if (Order().IsAscending()) {
+ return ips4o::sort(inout, inout + num, less);
+ } else {
+ return ips4o::sort(inout, inout + num, greater);
+ }
+#endif
+
+#if HAVE_PARALLEL_IPS4O
+ case Algo::kParallelIPS4O:
+ if (Order().IsAscending()) {
+ return ips4o::parallel::sort(inout, inout + num, less, shared.pool);
+ } else {
+ return ips4o::parallel::sort(inout, inout + num, greater, shared.pool);
+ }
+#endif
+
+#if HAVE_SORT512
+ case Algo::kSort512:
+ HWY_ABORT("not supported");
+ // return Sort512::Sort(inout, num);
+#endif
+
+#if HAVE_PDQSORT
+ case Algo::kPDQ:
+ if (Order().IsAscending()) {
+ return boost::sort::pdqsort_branchless(inout, inout + num, less);
+ } else {
+ return boost::sort::pdqsort_branchless(inout, inout + num, greater);
+ }
+#endif
+
+#if HAVE_VXSORT
+ case Algo::kVXSort: {
+#if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \
+ (!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2)
+ fprintf(stderr, "Do not call for target %s\n",
+ hwy::TargetName(HWY_TARGET));
+ return;
+#else
+#if VXSORT_AVX3
+ vxsort::vxsort<KeyType, vxsort::AVX512> vx;
+#else
+ vxsort::vxsort<KeyType, vxsort::AVX2> vx;
+#endif
+ if (Order().IsAscending()) {
+ return vx.sort(inout, inout + num - 1);
+ } else {
+ fprintf(stderr, "Skipping VX - does not support descending order\n");
+ return;
+ }
+#endif // enabled for this target
+ }
+#endif // HAVE_VXSORT
+
+ case Algo::kStd:
+ if (Order().IsAscending()) {
+ return std::sort(inout, inout + num, less);
+ } else {
+ return std::sort(inout, inout + num, greater);
+ }
+
+ case Algo::kVQSort:
+ return VQSort(inout, num, Order());
+
+ case Algo::kHeap:
+ return CallHeapSort<Order>(inout, num);
+
+ default:
+ HWY_ABORT("Not implemented");
+ }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
diff --git a/third_party/highway/hwy/contrib/sort/bench_parallel.cc b/third_party/highway/hwy/contrib/sort/bench_parallel.cc
new file mode 100644
index 0000000000..113061bab3
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/bench_parallel.cc
@@ -0,0 +1,238 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Concurrent, independent sorts for generating more memory traffic and testing
+// scalability.
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <condition_variable> //NOLINT
+#include <functional>
+#include <memory>
+#include <mutex> //NOLINT
+#include <thread> //NOLINT
+#include <utility>
+#include <vector>
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc" //NOLINT
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/algo-inl.h"
+#include "hwy/contrib/sort/result-inl.h"
+#include "hwy/aligned_allocator.h"
+// Last
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+namespace {
+
+class ThreadPool {
+ public:
+ // Starts the given number of worker threads and blocks until they are ready.
+ explicit ThreadPool(
+ const size_t num_threads = std::thread::hardware_concurrency())
+ : num_threads_(num_threads) {
+ HWY_ASSERT(num_threads_ > 0);
+ threads_.reserve(num_threads_);
+ for (size_t i = 0; i < num_threads_; ++i) {
+ threads_.emplace_back(ThreadFunc, this, i);
+ }
+
+ WorkersReadyBarrier();
+ }
+
+ ThreadPool(const ThreadPool&) = delete;
+ ThreadPool& operator=(const ThreadPool&) = delete;
+
+ // Waits for all threads to exit.
+ ~ThreadPool() {
+ StartWorkers(kWorkerExit);
+
+ for (std::thread& thread : threads_) {
+ thread.join();
+ }
+ }
+
+ size_t NumThreads() const { return threads_.size(); }
+
+ template <class Func>
+ void RunOnThreads(size_t max_threads, const Func& func) {
+ task_ = &CallClosure<Func>;
+ data_ = &func;
+ StartWorkers(max_threads);
+ WorkersReadyBarrier();
+ }
+
+ private:
+ // After construction and between calls to Run, workers are "ready", i.e.
+ // waiting on worker_start_cv_. They are "started" by sending a "command"
+ // and notifying all worker_start_cv_ waiters. (That is why all workers
+ // must be ready/waiting - otherwise, the notification will not reach all of
+ // them and the main thread waits in vain for them to report readiness.)
+ using WorkerCommand = uint64_t;
+
+ static constexpr WorkerCommand kWorkerWait = ~1ULL;
+ static constexpr WorkerCommand kWorkerExit = ~2ULL;
+
+ // Calls a closure (lambda with captures).
+ template <class Closure>
+ static void CallClosure(const void* f, size_t thread) {
+ (*reinterpret_cast<const Closure*>(f))(thread);
+ }
+
+ void WorkersReadyBarrier() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ // Typically only a single iteration.
+ while (workers_ready_ != threads_.size()) {
+ workers_ready_cv_.wait(lock);
+ }
+ workers_ready_ = 0;
+
+ // Safely handle spurious worker wakeups.
+ worker_start_command_ = kWorkerWait;
+ }
+
+ // Precondition: all workers are ready.
+ void StartWorkers(const WorkerCommand worker_command) {
+ std::unique_lock<std::mutex> lock(mutex_);
+ worker_start_command_ = worker_command;
+ // Workers will need this lock, so release it before they wake up.
+ lock.unlock();
+ worker_start_cv_.notify_all();
+ }
+
+ static void ThreadFunc(ThreadPool* self, size_t thread) {
+ // Until kWorkerExit command received:
+ for (;;) {
+ std::unique_lock<std::mutex> lock(self->mutex_);
+ // Notify main thread that this thread is ready.
+ if (++self->workers_ready_ == self->num_threads_) {
+ self->workers_ready_cv_.notify_one();
+ }
+ RESUME_WAIT:
+ // Wait for a command.
+ self->worker_start_cv_.wait(lock);
+ const WorkerCommand command = self->worker_start_command_;
+ switch (command) {
+ case kWorkerWait: // spurious wakeup:
+ goto RESUME_WAIT; // lock still held, avoid incrementing ready.
+ case kWorkerExit:
+ return; // exits thread
+ default:
+ break;
+ }
+
+ lock.unlock();
+ // Command is the maximum number of threads that should run the task.
+ HWY_ASSERT(command < self->NumThreads());
+ if (thread < command) {
+ self->task_(self->data_, thread);
+ }
+ }
+ }
+
+ const size_t num_threads_;
+
+ // Unmodified after ctor, but cannot be const because we call thread::join().
+ std::vector<std::thread> threads_;
+
+ std::mutex mutex_; // guards both cv and their variables.
+ std::condition_variable workers_ready_cv_;
+ size_t workers_ready_ = 0;
+ std::condition_variable worker_start_cv_;
+ WorkerCommand worker_start_command_;
+
+ // Written by main thread, read by workers (after mutex lock/unlock).
+ std::function<void(const void*, size_t)> task_; // points to CallClosure
+ const void* data_; // points to caller's Func
+};
+
+template <class Traits>
+void RunWithoutVerify(Traits st, const Dist dist, const size_t num_keys,
+ const Algo algo, SharedState& shared, size_t thread) {
+ using LaneType = typename Traits::LaneType;
+ using KeyType = typename Traits::KeyType;
+ using Order = typename Traits::Order;
+ const size_t num_lanes = num_keys * st.LanesPerKey();
+ auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
+
+ (void)GenerateInput(dist, aligned.get(), num_lanes);
+
+ const Timestamp t0;
+ Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys, shared,
+ thread);
+ HWY_ASSERT(aligned[0] < aligned[num_lanes - 1]);
+}
+
+void BenchParallel() {
+ // Not interested in benchmark results for other targets on x86
+ if (HWY_ARCH_X86 && (HWY_TARGET != HWY_AVX2 && HWY_TARGET != HWY_AVX3 &&
+ HWY_TARGET != HWY_AVX3_ZEN4)) {
+ return;
+ }
+
+ ThreadPool pool;
+ const size_t NT = pool.NumThreads();
+
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;
+ using KeyType = typename decltype(st)::KeyType;
+ const size_t num_keys = size_t{100} * 1000 * 1000;
+
+#if HAVE_IPS4O
+ const Algo algo = Algo::kIPS4O;
+#else
+ const Algo algo = Algo::kVQSort;
+#endif
+ const Dist dist = Dist::kUniform32;
+
+ SharedState shared;
+
+ std::vector<Result> results;
+ for (size_t nt = 1; nt < NT; nt += HWY_MAX(1, NT / 16)) {
+ Timestamp t0;
+ // Default capture because MSVC wants algo/dist but clang does not.
+ pool.RunOnThreads(nt, [=, &shared](size_t thread) {
+ RunWithoutVerify(st, dist, num_keys, algo, shared, thread);
+ });
+ const double sec = SecondsSince(t0);
+ results.emplace_back(algo, dist, num_keys, nt, sec, sizeof(KeyType),
+ st.KeyString());
+ results.back().Print();
+ }
+}
+
+} // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+namespace {
+HWY_BEFORE_TEST(BenchParallel);
+HWY_EXPORT_AND_TEST_P(BenchParallel, BenchParallel);
+} // namespace
+} // namespace hwy
+
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/bench_sort.cc b/third_party/highway/hwy/contrib/sort/bench_sort.cc
new file mode 100644
index 0000000000..13025aa26b
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/bench_sort.cc
@@ -0,0 +1,367 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <vector>
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/algo-inl.h"
+#include "hwy/contrib/sort/vqsort.h"
+#include "hwy/contrib/sort/result-inl.h"
+#include "hwy/contrib/sort/sorting_networks-inl.h" // SharedTraits
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+// Mode for larger sorts, because the Apple M1 can access more than its
+// per-core share of L2, so 1M elements might still fit in cache.
+#define SORT_100M 0
+
+#define SORT_BENCH_BASE_AND_PARTITION 0
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+// Defined within HWY_ONCE, used by BenchAllSort.
+extern int64_t first_sort_target;
+
+namespace HWY_NAMESPACE {
+namespace {
+using detail::TraitsLane;
+using detail::OrderAscending;
+using detail::OrderDescending;
+using detail::SharedTraits;
+
+#if VQSORT_ENABLED
+using detail::OrderAscending128;
+using detail::OrderAscendingKV128;
+using detail::Traits128;
+#endif
+
+#if (VQSORT_ENABLED && SORT_BENCH_BASE_AND_PARTITION) || HWY_IDE
+
+template <class Traits>
+HWY_NOINLINE void BenchPartition() {
+ using LaneType = typename Traits::LaneType;
+ using KeyType = typename Traits::KeyType;
+ const SortTag<LaneType> d;
+ detail::SharedTraits<Traits> st;
+ const Dist dist = Dist::kUniform8;
+ double sum = 0.0;
+
+ constexpr size_t kLPK = st.LanesPerKey();
+ HWY_ALIGN LaneType
+ buf[SortConstants::BufBytes<LaneType, kLPK>(HWY_MAX_BYTES) /
+ sizeof(LaneType)];
+ uint64_t* HWY_RESTRICT state = GetGeneratorState();
+
+ const size_t max_log2 = AdjustedLog2Reps(20);
+ for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) {
+ const size_t num_lanes = 1ull << log2;
+ const size_t num_keys = num_lanes / kLPK;
+ auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
+
+ std::vector<double> seconds;
+ const size_t num_reps = (1ull << (14 - log2 / 2)) * 30;
+ for (size_t rep = 0; rep < num_reps; ++rep) {
+ (void)GenerateInput(dist, aligned.get(), num_lanes);
+
+ // The pivot value can influence performance. Do exactly what vqsort will
+ // do so that the performance (influenced by prefetching and branch
+ // prediction) is likely to predict the actual performance inside vqsort.
+ detail::DrawSamples(d, st, aligned.get(), num_lanes, buf, state);
+ detail::SortSamples(d, st, buf);
+ auto pivot = detail::ChoosePivotByRank(d, st, buf);
+
+ const Timestamp t0;
+ detail::Partition(d, st, aligned.get(), num_lanes - 1, pivot, buf);
+ seconds.push_back(SecondsSince(t0));
+ // 'Use' the result to prevent optimizing out the partition.
+ sum += static_cast<double>(aligned.get()[num_lanes / 2]);
+ }
+
+ Result(Algo::kVQSort, dist, num_keys, 1, SummarizeMeasurements(seconds),
+ sizeof(KeyType), st.KeyString())
+ .Print();
+ }
+ HWY_ASSERT(sum != 999999); // Prevent optimizing out
+}
+
+HWY_NOINLINE void BenchAllPartition() {
+ // Not interested in benchmark results for these targets
+ if (HWY_TARGET == HWY_SSSE3) {
+ return;
+ }
+
+ BenchPartition<TraitsLane<OrderDescending<float>>>();
+ BenchPartition<TraitsLane<OrderDescending<int32_t>>>();
+ BenchPartition<TraitsLane<OrderDescending<int64_t>>>();
+ BenchPartition<Traits128<OrderAscending128>>();
+ // BenchPartition<Traits128<OrderDescending128>>();
+ BenchPartition<Traits128<OrderAscendingKV128>>();
+}
+
+template <class Traits>
+HWY_NOINLINE void BenchBase(std::vector<Result>& results) {
+ // Not interested in benchmark results for these targets
+ if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
+ return;
+ }
+
+ using LaneType = typename Traits::LaneType;
+ using KeyType = typename Traits::KeyType;
+ const SortTag<LaneType> d;
+ detail::SharedTraits<Traits> st;
+ const Dist dist = Dist::kUniform32;
+
+ const size_t N = Lanes(d);
+ constexpr size_t kLPK = st.LanesPerKey();
+ const size_t num_lanes = SortConstants::BaseCaseNumLanes<kLPK>(N);
+ const size_t num_keys = num_lanes / kLPK;
+ auto keys = hwy::AllocateAligned<LaneType>(num_lanes);
+ auto buf = hwy::AllocateAligned<LaneType>(num_lanes + N);
+
+ std::vector<double> seconds;
+ double sum = 0; // prevents elision
+ constexpr size_t kMul = AdjustedReps(600); // ensures long enough to measure
+
+ for (size_t rep = 0; rep < 30; ++rep) {
+ InputStats<LaneType> input_stats =
+ GenerateInput(dist, keys.get(), num_lanes);
+
+ const Timestamp t0;
+ for (size_t i = 0; i < kMul; ++i) {
+ detail::BaseCase(d, st, keys.get(), keys.get() + num_lanes, num_lanes,
+ buf.get());
+ sum += static_cast<double>(keys[0]);
+ }
+ seconds.push_back(SecondsSince(t0));
+ // printf("%f\n", seconds.back());
+
+ HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num_lanes, "BenchBase"));
+ }
+ HWY_ASSERT(sum < 1E99);
+ results.emplace_back(Algo::kVQSort, dist, num_keys * kMul, 1,
+ SummarizeMeasurements(seconds), sizeof(KeyType),
+ st.KeyString());
+}
+
+HWY_NOINLINE void BenchAllBase() {
+ // Not interested in benchmark results for these targets
+ if (HWY_TARGET == HWY_SSSE3) {
+ return;
+ }
+
+ std::vector<Result> results;
+ BenchBase<TraitsLane<OrderAscending<float>>>(results);
+ BenchBase<TraitsLane<OrderDescending<int64_t>>>(results);
+ BenchBase<Traits128<OrderAscending128>>(results);
+ for (const Result& r : results) {
+ r.Print();
+ }
+}
+
+#endif // VQSORT_ENABLED && SORT_BENCH_BASE_AND_PARTITION
+
+std::vector<Algo> AlgoForBench() {
+ return {
+#if HAVE_AVX2SORT
+ Algo::kSEA,
+#endif
+#if HAVE_PARALLEL_IPS4O
+ Algo::kParallelIPS4O,
+#elif HAVE_IPS4O
+ Algo::kIPS4O,
+#endif
+#if HAVE_PDQSORT
+ Algo::kPDQ,
+#endif
+#if HAVE_SORT512
+ Algo::kSort512,
+#endif
+// Only include if we're compiling for the target it supports.
+#if HAVE_VXSORT && ((VXSORT_AVX3 && HWY_TARGET == HWY_AVX3) || \
+ (!VXSORT_AVX3 && HWY_TARGET == HWY_AVX2))
+ Algo::kVXSort,
+#endif
+// Only include if we're compiling for the target it supports.
+#if HAVE_INTEL && HWY_TARGET <= HWY_AVX3
+ Algo::kIntel,
+#endif
+
+#if !HAVE_PARALLEL_IPS4O
+#if !SORT_100M
+ // 10-20x slower, but that's OK for the default size when we are not
+ // testing the parallel nor 100M modes.
+ Algo::kStd,
+#endif
+
+ Algo::kVQSort, // only ~4x slower, but not required for Table 1a
+#endif // !HAVE_PARALLEL_IPS4O
+ };
+}
+
+template <class Traits>
+HWY_NOINLINE void BenchSort(size_t num_keys) {
+ if (first_sort_target == 0) first_sort_target = HWY_TARGET;
+
+ SharedState shared;
+ detail::SharedTraits<Traits> st;
+ using Order = typename Traits::Order;
+ using LaneType = typename Traits::LaneType;
+ using KeyType = typename Traits::KeyType;
+ const size_t num_lanes = num_keys * st.LanesPerKey();
+ auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
+
+ const size_t reps = num_keys > 1000 * 1000 ? 10 : 30;
+
+ for (Algo algo : AlgoForBench()) {
+ // Other algorithms don't depend on the vector instructions, so only run
+ // them for the first target.
+#if !HAVE_VXSORT
+ if (algo != Algo::kVQSort && HWY_TARGET != first_sort_target) {
+ continue;
+ }
+#endif
+
+ for (Dist dist : AllDist()) {
+ std::vector<double> seconds;
+ for (size_t rep = 0; rep < reps; ++rep) {
+ InputStats<LaneType> input_stats =
+ GenerateInput(dist, aligned.get(), num_lanes);
+
+ const Timestamp t0;
+ Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys,
+ shared, /*thread=*/0);
+ seconds.push_back(SecondsSince(t0));
+ // printf("%f\n", seconds.back());
+
+ HWY_ASSERT(
+ VerifySort(st, input_stats, aligned.get(), num_lanes, "BenchSort"));
+ }
+ Result(algo, dist, num_keys, 1, SummarizeMeasurements(seconds),
+ sizeof(KeyType), st.KeyString())
+ .Print();
+ } // dist
+ } // algo
+}
+
+enum class BenchmarkModes {
+ kDefault,
+ k1M,
+ kAllSmall,
+ kSmallPow2,
+ kPow4,
+ kPow10
+};
+
+std::vector<size_t> SizesToBenchmark(BenchmarkModes mode) {
+ std::vector<size_t> sizes;
+ switch (mode) {
+ default:
+ case BenchmarkModes::kDefault:
+#if HAVE_PARALLEL_IPS4O || SORT_100M
+ sizes.push_back(100 * 1000 * size_t{1000});
+#else
+ sizes.push_back(100);
+ sizes.push_back(100 * 1000);
+#endif
+ break;
+ case BenchmarkModes::k1M:
+ sizes.push_back(1000 * 1000);
+ break;
+
+ case BenchmarkModes::kAllSmall:
+ sizes.reserve(128);
+ for (size_t i = 1; i <= 128; ++i) {
+ sizes.push_back(i);
+ }
+ break;
+ case BenchmarkModes::kSmallPow2:
+ for (size_t size = 2; size <= 128; size *= 2) {
+ sizes.push_back(size);
+ }
+ break;
+ case BenchmarkModes::kPow4:
+ for (size_t size = 4; size <= 256 * 1024; size *= 4) {
+ sizes.push_back(size);
+ }
+ break;
+ case BenchmarkModes::kPow10:
+ for (size_t size = 10; size <= 100 * 1000; size *= 10) {
+ sizes.push_back(size);
+ }
+ break;
+ }
+ return sizes;
+}
+
+HWY_NOINLINE void BenchAllSort() {
+ // Not interested in benchmark results for these targets. Note that SSE4 is
+ // numerically less than SSE2, hence it is the lower bound.
+ if (HWY_SSE4 <= HWY_TARGET && HWY_TARGET <= HWY_SSE2) {
+ return;
+ }
+#if HAVE_INTEL
+ if (HWY_TARGET > HWY_AVX3) return;
+#endif
+
+ for (size_t num_keys : SizesToBenchmark(BenchmarkModes::kSmallPow2)) {
+#if !HAVE_INTEL
+ BenchSort<TraitsLane<OrderAscending<float>>>(num_keys);
+#endif
+ // BenchSort<TraitsLane<OtherOrder<double>>>(num_keys);
+ // BenchSort<TraitsLane<OrderAscending<int16_t>>>(num_keys);
+ BenchSort<TraitsLane<OtherOrder<int32_t>>>(num_keys);
+ BenchSort<TraitsLane<OrderAscending<int64_t>>>(num_keys);
+ // BenchSort<TraitsLane<OtherOrder<uint16_t>>>(num_keys);
+ // BenchSort<TraitsLane<OtherOrder<uint32_t>>>(num_keys);
+ // BenchSort<TraitsLane<OrderAscending<uint64_t>>>(num_keys);
+
+#if !HAVE_VXSORT && !HAVE_INTEL && VQSORT_ENABLED
+ BenchSort<Traits128<OrderAscending128>>(num_keys);
+ BenchSort<Traits128<OrderAscendingKV128>>(num_keys);
+#endif
+ }
+}
+
+} // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+int64_t first_sort_target = 0; // none run yet
+namespace {
+HWY_BEFORE_TEST(BenchSort);
+#if SORT_BENCH_BASE_AND_PARTITION
+HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition);
+HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllBase);
+#endif
+HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllSort);
+} // namespace
+} // namespace hwy
+
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/print_network.cc b/third_party/highway/hwy/contrib/sort/print_network.cc
new file mode 100644
index 0000000000..0760696e79
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/print_network.cc
@@ -0,0 +1,90 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+
+#include <vector>
+
+#include "hwy/base.h"
+
+// Based on A.7 in "Entwurf und Implementierung vektorisierter
+// Sortieralgorithmen" and code by Mark Blacher.
+void PrintMergeNetwork(int rows, int cols) {
+ printf("\n%d x %d:\n", rows, cols);
+ // Powers of two
+ HWY_ASSERT(rows != 0 && (rows & (rows - 1)) == 0);
+ HWY_ASSERT(cols != 0 && (cols & (cols - 1)) == 0);
+ HWY_ASSERT(rows >= 4);
+ HWY_ASSERT(cols >= 2); // otherwise no cross-column merging required
+ HWY_ASSERT(cols <= 16); // SortTraits lacks Reverse32
+
+ // Log(rows) times: sort half of the vectors with reversed groups of the
+ // other half. Group size halves until we are sorting adjacent vectors.
+ int group_size = rows;
+ int num_groups = 1;
+ for (; group_size >= 2; group_size /= 2, num_groups *= 2) {
+ // All vectors except those being reversed. Allows us to group the
+ // ReverseKeys and Sort2 operations, which is easier to read and may help
+ // in-order machines with high-latency ReverseKeys.
+ std::vector<int> all_vi;
+ for (int group = 0; group < num_groups; ++group) {
+ for (int i = 0; i < group_size / 2; ++i) {
+ all_vi.push_back(group * group_size + i);
+ }
+ }
+ for (int vi : all_vi) {
+ const int vr = vi ^ (group_size - 1);
+ printf("v%x = st.ReverseKeys%d(d, v%x);\n", vr, cols, vr);
+ }
+ for (int vi : all_vi) {
+ const int vr = vi ^ (group_size - 1);
+ printf("st.Sort2(d, v%x, v%x);\n", vi, vr);
+ }
+ printf("\n");
+ }
+
+ // Now merge across columns in all vectors.
+ if (cols > 2) {
+ for (int i = 0; i < rows; ++i) {
+ printf("v%x = st.SortPairsReverse%d(d, v%x);\n", i, cols, i);
+ }
+ printf("\n");
+ }
+ if (cols >= 16) {
+ for (int i = 0; i < rows; ++i) {
+ printf("v%x = st.SortPairsDistance4(d, v%x);\n", i, i);
+ }
+ printf("\n");
+ }
+ if (cols >= 8) {
+ for (int i = 0; i < rows; ++i) {
+ printf("v%x = st.SortPairsDistance2(d, v%x);\n", i, i);
+ }
+ printf("\n");
+ }
+ for (int i = 0; i < rows; ++i) {
+ printf("v%x = st.SortPairsDistance1(d, v%x);\n", i, i);
+ }
+ printf("\n");
+}
+
+int main(int argc, char** argv) {
+ PrintMergeNetwork(8, 2);
+ PrintMergeNetwork(8, 4);
+ PrintMergeNetwork(16, 4);
+ PrintMergeNetwork(16, 8);
+ PrintMergeNetwork(16, 16);
+ return 0;
+}
diff --git a/third_party/highway/hwy/contrib/sort/result-inl.h b/third_party/highway/hwy/contrib/sort/result-inl.h
new file mode 100644
index 0000000000..34365a1669
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/result-inl.h
@@ -0,0 +1,140 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/algo-inl.h"
+
+// Normal include guard for non-SIMD parts
+#ifndef HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
+#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
+
+#include <time.h>
+
+#include <algorithm> // std::sort
+#include <string>
+
+#include "hwy/base.h"
+#include "hwy/nanobenchmark.h"
+
+namespace hwy {
+
+struct Timestamp {
+ Timestamp() { t = platform::Now(); }
+ double t;
+};
+
+static inline double SecondsSince(const Timestamp& t0) {
+ const Timestamp t1;
+ return t1.t - t0.t;
+}
+
+// Returns the mean of the measurements between the 25th and 50th percentiles
+// (a trimmed mean; we don't want to run an out-of-L3-cache sort often enough
+// for the mode to be reliable).
+static inline double SummarizeMeasurements(std::vector<double>& seconds) {
+ std::sort(seconds.begin(), seconds.end());
+ double sum = 0;
+ int count = 0;
+ const size_t num = seconds.size();
+ for (size_t i = num / 4; i < num / 2; ++i) {
+ sum += seconds[i];
+ count += 1;
+ }
+ return sum / count;
+}
+
+} // namespace hwy
+#endif // HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE) == \
+ defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct Result {
+ Result() {}
+ Result(const Algo algo, Dist dist, size_t num_keys, size_t num_threads,
+ double sec, size_t sizeof_key, const char* key_name)
+ : target(HWY_TARGET),
+ algo(algo),
+ dist(dist),
+ num_keys(num_keys),
+ num_threads(num_threads),
+ sec(sec),
+ sizeof_key(sizeof_key),
+ key_name(key_name) {}
+
+ void Print() const {
+ const double bytes = static_cast<double>(num_keys) *
+ static_cast<double>(num_threads) *
+ static_cast<double>(sizeof_key);
+ printf("%10s: %12s: %7s: %9s: %05g %4.0f MB/s (%2zu threads)\n",
+ hwy::TargetName(target), AlgoName(algo), key_name.c_str(),
+ DistName(dist), static_cast<double>(num_keys), bytes * 1E-6 / sec,
+ num_threads);
+ }
+
+ int64_t target;
+ Algo algo;
+ Dist dist;
+ size_t num_keys = 0;
+ size_t num_threads = 0;
+ double sec = 0.0;
+ size_t sizeof_key = 0;
+ std::string key_name;
+};
+
+template <class Traits, typename LaneType>
+bool VerifySort(Traits st, const InputStats<LaneType>& input_stats,
+ const LaneType* out, size_t num_lanes, const char* caller) {
+ constexpr size_t N1 = st.LanesPerKey();
+ HWY_ASSERT(num_lanes >= N1);
+
+ InputStats<LaneType> output_stats;
+ // Ensure it matches the sort order
+ for (size_t i = 0; i < num_lanes - N1; i += N1) {
+ output_stats.Notify(out[i]);
+ if (N1 == 2) output_stats.Notify(out[i + 1]);
+ // Reverse order instead of checking !Compare1 so we accept equal keys.
+ if (st.Compare1(out + i + N1, out + i)) {
+ fprintf(stderr, "%s: i=%d of %d lanes: N1=%d", caller,
+ static_cast<int>(i), static_cast<int>(num_lanes),
+ static_cast<int>(N1));
+ fprintf(stderr, "%5.0f %5.0f vs. %5.0f %5.0f\n\n",
+ static_cast<double>(out[i + 1]), static_cast<double>(out[i + 0]),
+ static_cast<double>(out[i + N1 + 1]),
+ static_cast<double>(out[i + N1]));
+ HWY_ABORT("%d-bit sort is incorrect\n",
+ static_cast<int>(sizeof(LaneType) * 8 * N1));
+ }
+ }
+ output_stats.Notify(out[num_lanes - N1]);
+ if (N1 == 2) output_stats.Notify(out[num_lanes - N1 + 1]);
+
+ return input_stats == output_stats;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif // HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
diff --git a/third_party/highway/hwy/contrib/sort/shared-inl.h b/third_party/highway/hwy/contrib/sort/shared-inl.h
new file mode 100644
index 0000000000..18cb58d78b
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/shared-inl.h
@@ -0,0 +1,154 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Definitions shared between vqsort-inl and sorting_networks-inl.
+
+// Normal include guard for target-independent parts
+#ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
+#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
+
+#include "hwy/base.h"
+
+namespace hwy {
+
+// Internal constants - these are to avoid magic numbers/literals and cannot be
+// changed without also changing the associated code.
+struct SortConstants {
+ // SortingNetwork reshapes its input into a matrix. This is the maximum number
+ // of *lanes* per vector. Must be at least 8 because SortSamples assumes the
+ // sorting network can handle 128 bytes with 8 rows, so 16 bytes per vector,
+ // which means 8 lanes for 16-bit types.
+#if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
+ static constexpr size_t kMaxCols = 8; // avoid build timeout/stack overflow
+#else
+ static constexpr size_t kMaxCols = 16; // enough for u32 in 512-bit vector
+#endif
+
+ // 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers,
+ // fitting within 16 AVX2 registers with only a few spills, keeping BaseCase
+ // code size reasonable, and minimizing the extra logN factor for larger
+ // networks (for which only loose upper bounds on size are known).
+ static constexpr size_t kMaxRows = 16;
+
+ // Template argument ensures there is no actual division instruction.
+ template <size_t kLPK>
+ static constexpr HWY_INLINE size_t BaseCaseNumLanes(size_t N) {
+ // We use 8, 8x2, 8x4, and 16x{4..} networks, in units of keys. For N/kLPK
+ // < 4, we cannot use the 16-row networks.
+ return (((N / kLPK) >= 4) ? kMaxRows : 8) * HWY_MIN(N, kMaxCols);
+ }
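+ // Example (illustrative, assuming kMaxCols = 16): u32 with N = 16 and
+ // kLPK = 1 gives 16*16 = 256 lanes; u64 key-pairs with N = 2 and kLPK = 2
+ // have N/kLPK < 4, hence 8*2 = 16 lanes (8 keys).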
+
+ // Unrolling is important (pipelining and amortizing branch mispredictions);
+ // 2x is sufficient to reach full memory bandwidth on SKX in Partition, but
+ // somewhat slower for sorting than 4x.
+ //
+ // To change, must also update left + 3 * N etc. in the loop.
+ static constexpr size_t kPartitionUnroll = 4;
+
+ // Chunk := group of keys loaded for sampling a pivot. Matches the typical
+ // cache line size of 64 bytes to get maximum benefit per L2 miss. Sort()
+ // ensures vectors are no larger than that, so this can be independent of the
+ // vector size and thus constexpr.
+ static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t) {
+ return 64 / sizeof_t;
+ }
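+ // (E.g. 16 lanes for 4-byte T, 8 lanes for 8-byte T.)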
+
+ template <typename T>
+ static constexpr HWY_INLINE size_t SampleLanes() {
+ return 2 * LanesPerChunk(sizeof(T)); // Stored samples
+ }
+
+ static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) {
+ // The main loop reads kPartitionUnroll vectors, and first loads from
+ // both left and right beforehand, so it requires min = 2 *
+ // kPartitionUnroll vectors. To handle smaller amounts (only guaranteed
+ // >= BaseCaseNumLanes), we partition the right side into a buffer. We need
+ // another vector at the end so CompressStore does not overwrite anything.
+ return (2 * kPartitionUnroll + 1) * N;
+ }
+
+ // Max across the three buffer usages.
+ template <typename T, size_t kLPK>
+ static constexpr HWY_INLINE size_t BufNum(size_t N) {
+ // BaseCase may write one padding vector, and SortSamples uses the space
+ // after samples as the buffer.
+ return HWY_MAX(SampleLanes<T>() + BaseCaseNumLanes<kLPK>(N) + N,
+ PartitionBufNum(N));
+ }
+
+ // Translates vector_size to lanes and returns size in bytes.
+ template <typename T, size_t kLPK>
+ static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) {
+ return BufNum<T, kLPK>(vector_size / sizeof(T)) * sizeof(T);
+ }
+
+ // Returns max for any type.
+ template <size_t kLPK>
+ static constexpr HWY_INLINE size_t MaxBufBytes(size_t vector_size) {
+ // If 2 lanes per key, it's a 128-bit key with u64 lanes.
+ return kLPK == 2 ? BufBytes<uint64_t, 2>(vector_size)
+ : HWY_MAX((BufBytes<uint16_t, 1>(vector_size)),
+ HWY_MAX((BufBytes<uint32_t, 1>(vector_size)),
+ (BufBytes<uint64_t, 1>(vector_size))));
+ }
+};
+
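+// Worked example (illustrative, assuming kMaxCols = 16): for T = uint32_t,
+// kLPK = 1 and 512-bit vectors (N = 16): SampleLanes = 2*(64/4) = 32,
+// BaseCaseNumLanes = 16*16 = 256 and PartitionBufNum = (2*4+1)*16 = 144, so
+// BufNum = HWY_MAX(32 + 256 + 16, 144) = 304 lanes = 1216 bytes, consistent
+// with the bounds asserted below.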
+static_assert(SortConstants::MaxBufBytes<1>(64) <= 1280, "Unexpectedly high");
+static_assert(SortConstants::MaxBufBytes<2>(64) <= 1280, "Unexpectedly high");
+
+} // namespace hwy
+
+#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == \
+ defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
+#endif
+
+#include "hwy/highway.h"
+
+// vqsort isn't available on HWY_SCALAR, and builds time out on MSVC opt and
+// Armv7 debug.
+#undef VQSORT_ENABLED
+#if (HWY_TARGET == HWY_SCALAR) || \
+ (HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \
+ (HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD)
+#define VQSORT_ENABLED 0
+#else
+#define VQSORT_ENABLED 1
+#endif
+
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Default tag / vector width selector.
+#if HWY_TARGET == HWY_RVV
+// Use LMUL = 1/2; for SEW=64 this ends up emulated via vsetvl.
+template <typename T>
+using SortTag = ScalableTag<T, -1>;
+#else
+template <typename T>
+using SortTag = ScalableTag<T>;
+#endif
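+
+// Example usage (a sketch): `const SortTag<uint32_t> d;` followed by
+// `const size_t N = Lanes(d);` yields the lane count the sorter operates on
+// for the current target.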
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+
+#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
diff --git a/third_party/highway/hwy/contrib/sort/sort_test.cc b/third_party/highway/hwy/contrib/sort/sort_test.cc
new file mode 100644
index 0000000000..b38a42d214
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/sort_test.cc
@@ -0,0 +1,650 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS // before inttypes.h
+#endif
+#include <inttypes.h> // IWYU pragma: keep
+#include <stdio.h>
+#include <string.h> // memcpy
+
+#include <unordered_map>
+#include <vector>
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/contrib/sort/vqsort.h"
+// After foreach_target
+#include "hwy/contrib/sort/algo-inl.h"
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/contrib/sort/result-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h" // BaseCase
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+namespace {
+
+using detail::OrderAscending;
+using detail::SharedTraits;
+using detail::TraitsLane;
+#if VQSORT_ENABLED || HWY_IDE
+#if !HAVE_INTEL
+using detail::OrderAscending128;
+using detail::OrderAscendingKV128;
+using detail::OrderAscendingKV64;
+using detail::OrderDescending128;
+using detail::OrderDescendingKV128;
+using detail::OrderDescendingKV64;
+using detail::Traits128;
+#endif
+
+template <class Traits>
+static HWY_NOINLINE void TestMedian3() {
+ using LaneType = typename Traits::LaneType;
+ using D = CappedTag<LaneType, 1>;
+ SharedTraits<Traits> st;
+ const D d;
+ using V = Vec<D>;
+ for (uint32_t bits = 0; bits < 8; ++bits) {
+ const V v0 = Set(d, LaneType{(bits & (1u << 0)) ? 1u : 0u});
+ const V v1 = Set(d, LaneType{(bits & (1u << 1)) ? 1u : 0u});
+ const V v2 = Set(d, LaneType{(bits & (1u << 2)) ? 1u : 0u});
+ const LaneType m = GetLane(detail::MedianOf3(st, v0, v1, v2));
+ // If at least half (rounded up) of the bits are 1, so is the median.
+ const size_t count = PopCount(bits);
+ HWY_ASSERT_EQ((count >= 2) ? static_cast<LaneType>(1) : 0, m);
+ }
+}
+
+HWY_NOINLINE void TestAllMedian() {
+ TestMedian3<TraitsLane<OrderAscending<uint64_t> > >();
+}
+
+template <class Traits>
+static HWY_NOINLINE void TestBaseCaseAscDesc() {
+ using LaneType = typename Traits::LaneType;
+ SharedTraits<Traits> st;
+ const SortTag<LaneType> d;
+ const size_t N = Lanes(d);
+ constexpr size_t N1 = st.LanesPerKey();
+ const size_t base_case_num = SortConstants::BaseCaseNumLanes<N1>(N);
+
+ constexpr int kDebug = 0;
+ auto aligned_lanes = hwy::AllocateAligned<LaneType>(N + base_case_num + N);
+ auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);
+ HWY_ASSERT(aligned_lanes && buf);
+
+ std::vector<size_t> lengths;
+ lengths.push_back(HWY_MAX(1, N1));
+ lengths.push_back(3 * N1);
+ lengths.push_back(base_case_num / 2);
+ lengths.push_back(base_case_num / 2 + N1);
+ lengths.push_back(base_case_num - N1);
+ lengths.push_back(base_case_num);
+
+ std::vector<size_t> misalignments;
+ misalignments.push_back(0);
+ misalignments.push_back(1);
+ if (N >= 6) misalignments.push_back(N / 2 - 1);
+ misalignments.push_back(N / 2);
+ misalignments.push_back(N / 2 + 1);
+ misalignments.push_back(HWY_MIN(2 * N / 3 + 3, size_t{N - 1}));
+
+ for (bool asc : {false, true}) {
+ for (size_t len : lengths) {
+ for (size_t misalign : misalignments) {
+ LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
+ if (kDebug) {
+ printf("============%s asc %d N1 %d len %d misalign %d\n",
+ st.KeyString(), asc, static_cast<int>(N1),
+ static_cast<int>(len), static_cast<int>(misalign));
+ }
+
+ for (size_t i = 0; i < misalign; ++i) {
+ aligned_lanes[i] = hwy::LowestValue<LaneType>();
+ }
+ InputStats<LaneType> input_stats;
+ for (size_t i = 0; i < len; ++i) {
+ lanes[i] = asc ? static_cast<LaneType>(LaneType(i) + 1)
+ : static_cast<LaneType>(LaneType(len) - LaneType(i));
+ input_stats.Notify(lanes[i]);
+ if (kDebug >= 2) {
+ printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+ }
+ }
+ for (size_t i = len; i < base_case_num + N; ++i) {
+ lanes[i] = hwy::LowestValue<LaneType>();
+ }
+
+ detail::BaseCase(d, st, lanes, len, buf.get());
+
+ if (kDebug >= 2) {
+ printf("out>>>>>>\n");
+ for (size_t i = 0; i < len; ++i) {
+ printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+ }
+ }
+
+ HWY_ASSERT(VerifySort(st, input_stats, lanes, len, "BaseAscDesc"));
+ for (size_t i = 0; i < misalign; ++i) {
+ if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
+ HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
+ }
+ for (size_t i = len; i < base_case_num + N; ++i) {
+ if (lanes[i] != hwy::LowestValue<LaneType>())
+ HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
+ }
+ } // misalign
+ } // len
+ } // asc
+}
+
+template <class Traits>
+static HWY_NOINLINE void TestBaseCase01() {
+ using LaneType = typename Traits::LaneType;
+ SharedTraits<Traits> st;
+ const SortTag<LaneType> d;
+ const size_t N = Lanes(d);
+ constexpr size_t N1 = st.LanesPerKey();
+ const size_t base_case_num = SortConstants::BaseCaseNumLanes<N1>(N);
+
+ constexpr int kDebug = 0;
+ auto lanes = hwy::AllocateAligned<LaneType>(base_case_num + N);
+ auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);
+ HWY_ASSERT(lanes && buf);
+
+ std::vector<size_t> lengths;
+ lengths.push_back(HWY_MAX(1, N1));
+ lengths.push_back(3 * N1);
+ lengths.push_back(base_case_num / 2);
+ lengths.push_back(base_case_num / 2 + N1);
+ lengths.push_back(base_case_num - N1);
+ lengths.push_back(base_case_num);
+
+ for (size_t len : lengths) {
+ if (kDebug) {
+ printf("============%s 01 N1 %d len %d\n", st.KeyString(),
+ static_cast<int>(N1), static_cast<int>(len));
+ }
+ const uint64_t kMaxBits = AdjustedLog2Reps(HWY_MIN(len, size_t{14}));
+ for (uint64_t bits = 0; bits < ((1ull << kMaxBits) - 1); ++bits) {
+ InputStats<LaneType> input_stats;
+ for (size_t i = 0; i < len; ++i) {
+ lanes[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0;
+ input_stats.Notify(lanes[i]);
+ if (kDebug >= 2) {
+ printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+ }
+ }
+ for (size_t i = len; i < base_case_num + N; ++i) {
+ lanes[i] = hwy::LowestValue<LaneType>();
+ }
+
+ detail::BaseCase(d, st, lanes.get(), len, buf.get());
+
+ if (kDebug >= 2) {
+ printf("out>>>>>>\n");
+ for (size_t i = 0; i < len; ++i) {
+ printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+ }
+ }
+
+ HWY_ASSERT(VerifySort(st, input_stats, lanes.get(), len, "Base01"));
+ for (size_t i = len; i < base_case_num + N; ++i) {
+ if (lanes[i] != hwy::LowestValue<LaneType>())
+ HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
+ }
+ } // bits
+ } // len
+}
+
+template <class Traits>
+static HWY_NOINLINE void TestBaseCase() {
+ TestBaseCaseAscDesc<Traits>();
+ TestBaseCase01<Traits>();
+}
+
+HWY_NOINLINE void TestAllBaseCase() {
+ // Workaround for stack overflow on MSVC debug.
+#if defined(_MSC_VER)
+ return;
+#endif
+ TestBaseCase<TraitsLane<OrderAscending<int32_t> > >();
+ TestBaseCase<TraitsLane<OtherOrder<int64_t> > >();
+#if !HAVE_INTEL
+ TestBaseCase<Traits128<OrderAscending128> >();
+ TestBaseCase<Traits128<OrderDescending128> >();
+#endif
+}
+
+template <class Traits>
+static HWY_NOINLINE void VerifyPartition(
+ Traits st, typename Traits::LaneType* HWY_RESTRICT lanes, size_t left,
+ size_t border, size_t right, const size_t N1,
+ const typename Traits::LaneType* pivot) {
+ /* for (size_t i = left; i < right; ++i) {
+ if (i == border) printf("--\n");
+ printf("%4zu: %3d\n", i, lanes[i]);
+ }*/
+
+ HWY_ASSERT(left % N1 == 0);
+ HWY_ASSERT(border % N1 == 0);
+ HWY_ASSERT(right % N1 == 0);
+ const bool asc = typename Traits::Order().IsAscending();
+ for (size_t i = left; i < border; i += N1) {
+ if (st.Compare1(pivot, lanes + i)) {
+ HWY_ABORT(
+ "%s: asc %d left[%d] piv %.0f %.0f compares before %.0f %.0f "
+ "border %d",
+ st.KeyString(), asc, static_cast<int>(i),
+ static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
+ static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i + 0]),
+ static_cast<int>(border));
+ }
+ }
+ for (size_t i = border; i < right; i += N1) {
+ if (!st.Compare1(pivot, lanes + i)) {
+ HWY_ABORT(
+ "%s: asc %d right[%d] piv %.0f %.0f compares after %.0f %.0f "
+ "border %d",
+ st.KeyString(), asc, static_cast<int>(i),
+ static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
+ static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i]),
+ static_cast<int>(border));
+ }
+ }
+}
+
+template <class Traits>
+static HWY_NOINLINE void TestPartition() {
+ using LaneType = typename Traits::LaneType;
+ const SortTag<LaneType> d;
+ SharedTraits<Traits> st;
+ const bool asc = typename Traits::Order().IsAscending();
+ const size_t N = Lanes(d);
+ constexpr int kDebug = 0;
+ constexpr size_t N1 = st.LanesPerKey();
+ const size_t base_case_num = SortConstants::BaseCaseNumLanes<N1>(N);
+ // left + len + align
+ const size_t total = 32 + (base_case_num + 4 * HWY_MAX(N, 4)) + 2 * N;
+ auto aligned_lanes = hwy::AllocateAligned<LaneType>(total);
+ HWY_ALIGN LaneType buf[SortConstants::BufBytes<LaneType, N1>(HWY_MAX_BYTES) /
+ sizeof(LaneType)];
+
+ for (bool in_asc : {false, true}) {
+ for (int left_i : {0, 1, 7, 8, 30, 31}) {
+ const size_t left = static_cast<size_t>(left_i) & ~(N1 - 1);
+ for (size_t ofs :
+ {N, N + 3, 2 * N, 2 * N + 2, 2 * N + 3, 3 * N - 1, 4 * N - 2}) {
+ const size_t len = (base_case_num + ofs) & ~(N1 - 1);
+ for (LaneType pivot1 : {LaneType(0), LaneType(len / 3),
+ LaneType(2 * len / 3), LaneType(len)}) {
+ const LaneType pivot2[2] = {pivot1, 0};
+ const auto pivot = st.SetKey(d, pivot2);
+ for (size_t misalign = 0; misalign < N;
+ misalign += st.LanesPerKey()) {
+ LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
+ const size_t right = left + len;
+ if (kDebug) {
+ printf(
+ "=========%s asc %d left %d len %d right %d piv %.0f %.0f\n",
+ st.KeyString(), asc, static_cast<int>(left),
+ static_cast<int>(len), static_cast<int>(right),
+ static_cast<double>(pivot2[1]),
+ static_cast<double>(pivot2[0]));
+ }
+
+ for (size_t i = 0; i < misalign; ++i) {
+ aligned_lanes[i] = hwy::LowestValue<LaneType>();
+ }
+ for (size_t i = 0; i < left; ++i) {
+ lanes[i] = hwy::LowestValue<LaneType>();
+ }
+ std::unordered_map<LaneType, int> counts;
+ for (size_t i = left; i < right; ++i) {
+ lanes[i] = static_cast<LaneType>(
+ in_asc ? LaneType(i + 1) - static_cast<LaneType>(left)
+ : static_cast<LaneType>(right) - LaneType(i));
+ ++counts[lanes[i]];
+ if (kDebug >= 2) {
+ printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+ }
+ }
+ for (size_t i = right; i < total - misalign; ++i) {
+ lanes[i] = hwy::LowestValue<LaneType>();
+ }
+
+ size_t border = left + detail::Partition(d, st, lanes + left,
+ right - left, pivot, buf);
+
+ if (kDebug >= 2) {
+ printf("out>>>>>>\n");
+ for (size_t i = left; i < right; ++i) {
+ printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+ }
+ for (size_t i = right; i < total - misalign; ++i) {
+ printf("%3zu: sentinel %f\n", i, static_cast<double>(lanes[i]));
+ }
+ }
+ for (size_t i = left; i < right; ++i) {
+ --counts[lanes[i]];
+ }
+ for (auto kv : counts) {
+ if (kv.second != 0) {
+ PrintValue(kv.first);
+ HWY_ABORT("Incorrect count %d\n", kv.second);
+ }
+ }
+ VerifyPartition(st, lanes, left, border, right, N1, pivot2);
+ for (size_t i = 0; i < misalign; ++i) {
+ if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
+ HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
+ }
+ for (size_t i = 0; i < left; ++i) {
+ if (lanes[i] != hwy::LowestValue<LaneType>())
+ HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
+ }
+ for (size_t i = right; i < total - misalign; ++i) {
+ if (lanes[i] != hwy::LowestValue<LaneType>())
+ HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
+ }
+ } // misalign
+ } // pivot
+ } // len
+ } // left
+ } // asc
+}
+
+HWY_NOINLINE void TestAllPartition() {
+ TestPartition<TraitsLane<OtherOrder<int32_t> > >();
+#if !HAVE_INTEL
+ TestPartition<Traits128<OrderAscending128> >();
+#endif
+
+#if !HWY_IS_DEBUG_BUILD
+ TestPartition<TraitsLane<OrderAscending<int16_t> > >();
+ TestPartition<TraitsLane<OrderAscending<int64_t> > >();
+ TestPartition<TraitsLane<OtherOrder<float> > >();
+#if HWY_HAVE_FLOAT64
+ TestPartition<TraitsLane<OtherOrder<double> > >();
+#endif
+#if !HAVE_INTEL
+ TestPartition<Traits128<OrderDescending128> >();
+#endif
+#endif
+}
+
+// (used to select samples when choosing a pivot)
+template <typename TU>
+static HWY_NOINLINE void TestRandomGenerator() {
+ static_assert(!hwy::IsSigned<TU>(), "");
+ SortTag<TU> du;
+ const size_t N = Lanes(du);
+
+ uint64_t* state = GetGeneratorState();
+
+ // Ensure lower and upper 32 bits are uniformly distributed.
+ uint64_t sum_lo = 0, sum_hi = 0;
+ for (size_t i = 0; i < 1000; ++i) {
+ const uint64_t bits = detail::RandomBits(state);
+ sum_lo += bits & 0xFFFFFFFF;
+ sum_hi += bits >> 32;
+ }
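+ // The mean of a uniform draw from [0, 2^32) is 2^31, so the sum of 1000
+ // draws should be near 1000 * 2^31.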
+ const double expected = 1000 * (1ULL << 31);
+ HWY_ASSERT(0.9 * expected <= static_cast<double>(sum_lo) &&
+ static_cast<double>(sum_lo) <= 1.1 * expected);
+ HWY_ASSERT(0.9 * expected <= static_cast<double>(sum_hi) &&
+ static_cast<double>(sum_hi) <= 1.1 * expected);
+
+ const size_t lanes_per_block = HWY_MAX(64 / sizeof(TU), N); // power of two
+
+ for (uint32_t num_blocks = 2; num_blocks < 100000;
+ num_blocks = 3 * num_blocks / 2) {
+ // Generate some numbers and ensure all are in range
+ uint64_t sum = 0;
+ constexpr size_t kReps = 10000;
+ for (size_t rep = 0; rep < kReps; ++rep) {
+ const uint32_t bits = detail::RandomBits(state) & 0xFFFFFFFF;
+ const size_t index = detail::RandomChunkIndex(num_blocks, bits);
+ HWY_ASSERT(((index + 1) * lanes_per_block) <=
+ num_blocks * lanes_per_block);
+
+ sum += index;
+ }
+
+ // Also ensure the mean is near the middle of the range
+ const double expected = (num_blocks - 1) / 2.0;
+ const double actual = static_cast<double>(sum) / kReps;
+ HWY_ASSERT(0.9 * expected <= actual && actual <= 1.1 * expected);
+ }
+}
+
+HWY_NOINLINE void TestAllGenerator() {
+ TestRandomGenerator<uint32_t>();
+ TestRandomGenerator<uint64_t>();
+}
+
+#else
+static void TestAllMedian() {}
+static void TestAllBaseCase() {}
+static void TestAllPartition() {}
+static void TestAllGenerator() {}
+#endif // VQSORT_ENABLED
+
+// Remembers input, and compares results to that of a reference algorithm.
+template <class Traits>
+class CompareResults {
+ using LaneType = typename Traits::LaneType;
+ using KeyType = typename Traits::KeyType;
+
+ public:
+ CompareResults(const LaneType* in, size_t num_lanes) {
+ copy_.resize(num_lanes);
+ memcpy(copy_.data(), in, num_lanes * sizeof(LaneType));
+ }
+
+ bool Verify(const LaneType* output) {
+#if HAVE_PDQSORT
+ const Algo reference = Algo::kPDQ;
+#else
+ const Algo reference = Algo::kStd;
+#endif
+ SharedState shared;
+ using Order = typename Traits::Order;
+ const Traits st;
+ const size_t num_keys = copy_.size() / st.LanesPerKey();
+ Run<Order>(reference, reinterpret_cast<KeyType*>(copy_.data()), num_keys,
+ shared, /*thread=*/0);
+#if VQSORT_PRINT >= 3
+ fprintf(stderr, "\nExpected:\n");
+ for (size_t i = 0; i < copy_.size(); ++i) {
+ PrintValue(copy_[i]);
+ }
+ fprintf(stderr, "\n");
+#endif
+ for (size_t i = 0; i < copy_.size(); ++i) {
+ if (copy_[i] != output[i]) {
+ if (sizeof(KeyType) == 16) {
+ fprintf(stderr,
+ "%s Asc %d mismatch at %d of %d: %" PRIu64 " %" PRIu64 "\n",
+ st.KeyString(), Order().IsAscending(), static_cast<int>(i),
+ static_cast<int>(copy_.size()),
+ static_cast<uint64_t>(copy_[i]),
+ static_cast<uint64_t>(output[i]));
+ } else {
+ fprintf(stderr,
+ "Type %s Asc %d mismatch at %d of %d: ", st.KeyString(),
+ Order().IsAscending(), static_cast<int>(i),
+ static_cast<int>(copy_.size()));
+ PrintValue(copy_[i]);
+ PrintValue(output[i]);
+ fprintf(stderr, "\n");
+ }
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private:
+ std::vector<LaneType> copy_;
+};
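+
+// Usage sketch (mirrors TestSort below): snapshot the input, sort it with the
+// algorithm under test, then check against the reference:
+//   CompareResults<Traits> compare(lanes, num_lanes);
+//   Run<Order>(algo, reinterpret_cast<KeyType*>(lanes), num_keys, shared,
+//              /*thread=*/0);
+//   HWY_ASSERT(compare.Verify(lanes));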
+
+std::vector<Algo> AlgoForTest() {
+ return {
+#if HAVE_AVX2SORT
+ Algo::kSEA,
+#endif
+#if HAVE_IPS4O
+ Algo::kIPS4O,
+#endif
+#if HAVE_PDQSORT
+ Algo::kPDQ,
+#endif
+#if HAVE_SORT512
+ Algo::kSort512,
+#endif
+ Algo::kHeap, Algo::kVQSort,
+ };
+}
+
+template <class Traits>
+void TestSort(size_t num_lanes) {
+// Workaround for stack overflow on clang-cl (/F 8388608 does not help).
+#if defined(_MSC_VER)
+ return;
+#endif
+ using Order = typename Traits::Order;
+ using LaneType = typename Traits::LaneType;
+ using KeyType = typename Traits::KeyType;
+ SharedState shared;
+ SharedTraits<Traits> st;
+
+ // Round up to a whole number of keys.
+ num_lanes += (st.Is128() && (num_lanes & 1));
+ const size_t num_keys = num_lanes / st.LanesPerKey();
+
+ constexpr size_t kMaxMisalign = 16;
+ auto aligned =
+ hwy::AllocateAligned<LaneType>(kMaxMisalign + num_lanes + kMaxMisalign);
+ HWY_ASSERT(aligned);
+ for (Algo algo : AlgoForTest()) {
+ for (Dist dist : AllDist()) {
+ for (size_t misalign : {size_t{0}, size_t{st.LanesPerKey()},
+ size_t{3 * st.LanesPerKey()}, kMaxMisalign / 2}) {
+ LaneType* lanes = aligned.get() + misalign;
+
+ // Set up red zones before/after the keys to sort
+ for (size_t i = 0; i < misalign; ++i) {
+ aligned[i] = hwy::LowestValue<LaneType>();
+ }
+ for (size_t i = 0; i < kMaxMisalign; ++i) {
+ lanes[num_lanes + i] = hwy::HighestValue<LaneType>();
+ }
+#if HWY_IS_MSAN
+ __msan_poison(aligned.get(), misalign * sizeof(LaneType));
+ __msan_poison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
+#endif
+ InputStats<LaneType> input_stats =
+ GenerateInput(dist, lanes, num_lanes);
+
+ CompareResults<Traits> compare(lanes, num_lanes);
+ Run<Order>(algo, reinterpret_cast<KeyType*>(lanes), num_keys, shared,
+ /*thread=*/0);
+ HWY_ASSERT(compare.Verify(lanes));
+ HWY_ASSERT(VerifySort(st, input_stats, lanes, num_lanes, "TestSort"));
+
+ // Check red zones
+#if HWY_IS_MSAN
+ __msan_unpoison(aligned.get(), misalign * sizeof(LaneType));
+ __msan_unpoison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
+#endif
+ for (size_t i = 0; i < misalign; ++i) {
+ if (aligned[i] != hwy::LowestValue<LaneType>())
+ HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
+ }
+ for (size_t i = num_lanes; i < num_lanes + kMaxMisalign; ++i) {
+ if (lanes[i] != hwy::HighestValue<LaneType>())
+ HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
+ }
+ } // misalign
+ } // dist
+ } // algo
+}
+
+void TestAllSort() {
+ for (int num : {129, 504, 3 * 1000, 34567}) {
+ const size_t num_lanes = AdjustedReps(static_cast<size_t>(num));
+#if !HAVE_INTEL
+ TestSort<TraitsLane<OrderAscending<int16_t> > >(num_lanes);
+ TestSort<TraitsLane<OtherOrder<uint16_t> > >(num_lanes);
+#endif
+
+ TestSort<TraitsLane<OtherOrder<int32_t> > >(num_lanes);
+ TestSort<TraitsLane<OtherOrder<uint32_t> > >(num_lanes);
+
+ TestSort<TraitsLane<OrderAscending<int64_t> > >(num_lanes);
+ TestSort<TraitsLane<OrderAscending<uint64_t> > >(num_lanes);
+
+ // WARNING: for float types, SIMD comparisons will flush denormals to
+ // zero, causing mismatches with scalar sorts. In this test, we avoid
+ // generating denormal inputs.
+ TestSort<TraitsLane<OrderAscending<float> > >(num_lanes);
+#if HWY_HAVE_FLOAT64 // protects algo-inl's GenerateRandom
+ if (HWY_HAVE_FLOAT64) {
+ TestSort<TraitsLane<OtherOrder<double> > >(num_lanes);
+ }
+#endif
+
+// Other algorithms do not support 128-bit keys.
+#if !HAVE_VXSORT && !HAVE_INTEL && VQSORT_ENABLED
+ TestSort<Traits128<OrderAscending128> >(num_lanes);
+ TestSort<Traits128<OrderDescending128> >(num_lanes);
+
+ TestSort<TraitsLane<OrderAscendingKV64> >(num_lanes);
+ TestSort<TraitsLane<OrderDescendingKV64> >(num_lanes);
+
+ TestSort<Traits128<OrderAscendingKV128> >(num_lanes);
+ TestSort<Traits128<OrderDescendingKV128> >(num_lanes);
+#endif
+ }
+}
+
+} // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+namespace {
+HWY_BEFORE_TEST(SortTest);
+HWY_EXPORT_AND_TEST_P(SortTest, TestAllMedian);
+HWY_EXPORT_AND_TEST_P(SortTest, TestAllBaseCase);
+HWY_EXPORT_AND_TEST_P(SortTest, TestAllPartition);
+HWY_EXPORT_AND_TEST_P(SortTest, TestAllGenerator);
+HWY_EXPORT_AND_TEST_P(SortTest, TestAllSort);
+} // namespace
+} // namespace hwy
+
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/sorting_networks-inl.h b/third_party/highway/hwy/contrib/sort/sorting_networks-inl.h
new file mode 100644
index 0000000000..c47fd8da7d
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/sorting_networks-inl.h
@@ -0,0 +1,898 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE) == \
+ defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
+#endif
+
+#include "hwy/contrib/sort/shared-inl.h" // SortConstants
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+namespace detail {
+
+#if VQSORT_ENABLED
+
+using Constants = hwy::SortConstants;
+
+// ------------------------------ SharedTraits
+
+// Code shared between all traits. It's unclear whether these can profitably be
+// specialized for Lane vs Block, or optimized like SortPairsDistance1 using
+// Compare/DupOdd.
+template <class Base>
+struct SharedTraits : public Base {
+ // Conditionally swaps lane 0 with 2, 1 with 3 etc.
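+ // E.g. ascending with four u32 lanes: [3,9,1,7] -> [1,7,3,9].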
+ template <class D>
+ HWY_INLINE Vec<D> SortPairsDistance2(D d, Vec<D> v) const {
+ const Base* base = static_cast<const Base*>(this);
+ Vec<D> swapped = base->SwapAdjacentPairs(d, v);
+ base->Sort2(d, v, swapped);
+ return base->OddEvenPairs(d, swapped, v);
+ }
+
+ // Swaps with the vector formed by reversing contiguous groups of 8 keys.
+ template <class D>
+ HWY_INLINE Vec<D> SortPairsReverse8(D d, Vec<D> v) const {
+ const Base* base = static_cast<const Base*>(this);
+ Vec<D> swapped = base->ReverseKeys8(d, v);
+ base->Sort2(d, v, swapped);
+ return base->OddEvenQuads(d, swapped, v);
+ }
+
+ // Swaps with the vector formed by reversing contiguous groups of 16 keys.
+ template <class D>
+ HWY_INLINE Vec<D> SortPairsReverse16(D d, Vec<D> v) const {
+ const Base* base = static_cast<const Base*>(this);
+ static_assert(Constants::kMaxCols <= 16, "Need actual Reverse16");
+ Vec<D> swapped = base->ReverseKeys(d, v);
+ base->Sort2(d, v, swapped);
+ return ConcatUpperLower(d, swapped, v); // 8 = half of the vector
+ }
+};
+
+// ------------------------------ Sorting network
+
+// Sorting networks for independent columns in 2, 4 and 8 vectors from
+// https://bertdobbelaere.github.io/sorting_networks.html.
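+//
+// Each st.Sort2(d, a, b) is a vectorized compare-exchange: afterwards, a given
+// lane of `a` holds whichever key sorts first and the same lane of `b` the
+// other, so every lane (column) is sorted independently. E.g. ascending with
+// a=[3,1] and b=[2,4] yields a=[2,1], b=[3,4].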
+
+template <class D, class Traits, class V = Vec<D>>
+HWY_INLINE void Sort2(D d, Traits st, V& v0, V& v1) {
+ st.Sort2(d, v0, v1);
+}
+
+template <class D, class Traits, class V = Vec<D>>
+HWY_INLINE void Sort4(D d, Traits st, V& v0, V& v1, V& v2, V& v3) {
+ st.Sort2(d, v0, v2);
+ st.Sort2(d, v1, v3);
+ st.Sort2(d, v0, v1);
+ st.Sort2(d, v2, v3);
+ st.Sort2(d, v1, v2);
+}
+
+template <class D, class Traits, class V = Vec<D>>
+HWY_INLINE void Sort8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
+ V& v6, V& v7) {
+ st.Sort2(d, v0, v2);
+ st.Sort2(d, v1, v3);
+ st.Sort2(d, v4, v6);
+ st.Sort2(d, v5, v7);
+
+ st.Sort2(d, v0, v4);
+ st.Sort2(d, v1, v5);
+ st.Sort2(d, v2, v6);
+ st.Sort2(d, v3, v7);
+
+ st.Sort2(d, v0, v1);
+ st.Sort2(d, v2, v3);
+ st.Sort2(d, v4, v5);
+ st.Sort2(d, v6, v7);
+
+ st.Sort2(d, v2, v4);
+ st.Sort2(d, v3, v5);
+
+ st.Sort2(d, v1, v4);
+ st.Sort2(d, v3, v6);
+
+ st.Sort2(d, v1, v2);
+ st.Sort2(d, v3, v4);
+ st.Sort2(d, v5, v6);
+}
+
+// (Green's irregular) sorting network for independent columns in 16 vectors.
+template <class D, class Traits, class V = Vec<D>>
+HWY_INLINE void Sort16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
+ V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
+ V& ve, V& vf) {
+ st.Sort2(d, v0, v1);
+ st.Sort2(d, v2, v3);
+ st.Sort2(d, v4, v5);
+ st.Sort2(d, v6, v7);
+ st.Sort2(d, v8, v9);
+ st.Sort2(d, va, vb);
+ st.Sort2(d, vc, vd);
+ st.Sort2(d, ve, vf);
+ st.Sort2(d, v0, v2);
+ st.Sort2(d, v1, v3);
+ st.Sort2(d, v4, v6);
+ st.Sort2(d, v5, v7);
+ st.Sort2(d, v8, va);
+ st.Sort2(d, v9, vb);
+ st.Sort2(d, vc, ve);
+ st.Sort2(d, vd, vf);
+ st.Sort2(d, v0, v4);
+ st.Sort2(d, v1, v5);
+ st.Sort2(d, v2, v6);
+ st.Sort2(d, v3, v7);
+ st.Sort2(d, v8, vc);
+ st.Sort2(d, v9, vd);
+ st.Sort2(d, va, ve);
+ st.Sort2(d, vb, vf);
+ st.Sort2(d, v0, v8);
+ st.Sort2(d, v1, v9);
+ st.Sort2(d, v2, va);
+ st.Sort2(d, v3, vb);
+ st.Sort2(d, v4, vc);
+ st.Sort2(d, v5, vd);
+ st.Sort2(d, v6, ve);
+ st.Sort2(d, v7, vf);
+ st.Sort2(d, v5, va);
+ st.Sort2(d, v6, v9);
+ st.Sort2(d, v3, vc);
+ st.Sort2(d, v7, vb);
+ st.Sort2(d, vd, ve);
+ st.Sort2(d, v4, v8);
+ st.Sort2(d, v1, v2);
+ st.Sort2(d, v1, v4);
+ st.Sort2(d, v7, vd);
+ st.Sort2(d, v2, v8);
+ st.Sort2(d, vb, ve);
+ st.Sort2(d, v2, v4);
+ st.Sort2(d, v5, v6);
+ st.Sort2(d, v9, va);
+ st.Sort2(d, vb, vd);
+ st.Sort2(d, v3, v8);
+ st.Sort2(d, v7, vc);
+ st.Sort2(d, v3, v5);
+ st.Sort2(d, v6, v8);
+ st.Sort2(d, v7, v9);
+ st.Sort2(d, va, vc);
+ st.Sort2(d, v3, v4);
+ st.Sort2(d, v5, v6);
+ st.Sort2(d, v7, v8);
+ st.Sort2(d, v9, va);
+ st.Sort2(d, vb, vc);
+ st.Sort2(d, v6, v7);
+ st.Sort2(d, v8, v9);
+}
+
+// ------------------------------ Merging networks
+
+// Blacher's hybrid bitonic/odd-even networks, generated by print_network.cc.
+// For acceptable performance, these must be inlined, otherwise vectors are
+// loaded from the stack. The kKeysPerVector allows calling from generic code
+// but skipping the functions when vectors have too few lanes for
+// st.SortPairsDistance1 to compile. `if constexpr` in the caller would also
+// work, but is not available in C++11. We write out the (unused) argument types
+// rather than `...` because GCC 9 (but not 10) fails to compile with `...`.
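+//
+// Overall structure (a sketch of the pattern below): reverse one operand of
+// each vector pair so their concatenation is bitonic, compare-exchange across
+// vectors, then finish within each vector via SortPairsReverse* and
+// SortPairsDistance* with halving distances.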
+
+template <size_t kKeysPerVector, class D, class Traits, class V,
+ HWY_IF_LANES_LE(kKeysPerVector, 1)>
+HWY_INLINE void Merge8x2(D, Traits, V, V, V, V, V, V, V, V) {}
+template <size_t kKeysPerVector, class D, class Traits, class V,
+ HWY_IF_LANES_LE(kKeysPerVector, 2)>
+HWY_INLINE void Merge8x4(D, Traits, V, V, V, V, V, V, V, V) {}
+
+template <size_t kKeysPerVector, class D, class Traits, class V,
+ HWY_IF_LANES_LE(kKeysPerVector, 1)>
+HWY_INLINE void Merge16x2(D, Traits, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+ V, V) {}
+template <size_t kKeysPerVector, class D, class Traits, class V,
+ HWY_IF_LANES_LE(kKeysPerVector, 2)>
+HWY_INLINE void Merge16x4(D, Traits, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+ V, V) {}
+template <size_t kKeysPerVector, class D, class Traits, class V,
+ HWY_IF_LANES_LE(kKeysPerVector, 4)>
+HWY_INLINE void Merge16x8(D, Traits, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+ V, V) {}
+template <size_t kKeysPerVector, class D, class Traits, class V,
+ HWY_IF_LANES_LE(kKeysPerVector, 8)>
+HWY_INLINE void Merge16x16(D, Traits, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+ V, V) {}
+
+template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>,
+ HWY_IF_LANES_GT(kKeysPerVector, 1)>
+HWY_INLINE void Merge8x2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
+ V& v5, V& v6, V& v7) {
+ v7 = st.ReverseKeys2(d, v7);
+ v6 = st.ReverseKeys2(d, v6);
+ v5 = st.ReverseKeys2(d, v5);
+ v4 = st.ReverseKeys2(d, v4);
+ st.Sort2(d, v0, v7);
+ st.Sort2(d, v1, v6);
+ st.Sort2(d, v2, v5);
+ st.Sort2(d, v3, v4);
+
+ v3 = st.ReverseKeys2(d, v3);
+ v2 = st.ReverseKeys2(d, v2);
+ v7 = st.ReverseKeys2(d, v7);
+ v6 = st.ReverseKeys2(d, v6);
+ st.Sort2(d, v0, v3);
+ st.Sort2(d, v1, v2);
+ st.Sort2(d, v4, v7);
+ st.Sort2(d, v5, v6);
+
+ v1 = st.ReverseKeys2(d, v1);
+ v3 = st.ReverseKeys2(d, v3);
+ v5 = st.ReverseKeys2(d, v5);
+ v7 = st.ReverseKeys2(d, v7);
+ st.Sort2(d, v0, v1);
+ st.Sort2(d, v2, v3);
+ st.Sort2(d, v4, v5);
+ st.Sort2(d, v6, v7);
+
+ v0 = st.SortPairsDistance1(d, v0);
+ v1 = st.SortPairsDistance1(d, v1);
+ v2 = st.SortPairsDistance1(d, v2);
+ v3 = st.SortPairsDistance1(d, v3);
+ v4 = st.SortPairsDistance1(d, v4);
+ v5 = st.SortPairsDistance1(d, v5);
+ v6 = st.SortPairsDistance1(d, v6);
+ v7 = st.SortPairsDistance1(d, v7);
+}
+
+template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>,
+ HWY_IF_LANES_GT(kKeysPerVector, 2)>
+HWY_INLINE void Merge8x4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
+ V& v5, V& v6, V& v7) {
+ v7 = st.ReverseKeys4(d, v7);
+ v6 = st.ReverseKeys4(d, v6);
+ v5 = st.ReverseKeys4(d, v5);
+ v4 = st.ReverseKeys4(d, v4);
+ st.Sort2(d, v0, v7);
+ st.Sort2(d, v1, v6);
+ st.Sort2(d, v2, v5);
+ st.Sort2(d, v3, v4);
+
+ v3 = st.ReverseKeys4(d, v3);
+ v2 = st.ReverseKeys4(d, v2);
+ v7 = st.ReverseKeys4(d, v7);
+ v6 = st.ReverseKeys4(d, v6);
+ st.Sort2(d, v0, v3);
+ st.Sort2(d, v1, v2);
+ st.Sort2(d, v4, v7);
+ st.Sort2(d, v5, v6);
+
+ v1 = st.ReverseKeys4(d, v1);
+ v3 = st.ReverseKeys4(d, v3);
+ v5 = st.ReverseKeys4(d, v5);
+ v7 = st.ReverseKeys4(d, v7);
+ st.Sort2(d, v0, v1);
+ st.Sort2(d, v2, v3);
+ st.Sort2(d, v4, v5);
+ st.Sort2(d, v6, v7);
+
+ v0 = st.SortPairsReverse4(d, v0);
+ v1 = st.SortPairsReverse4(d, v1);
+ v2 = st.SortPairsReverse4(d, v2);
+ v3 = st.SortPairsReverse4(d, v3);
+ v4 = st.SortPairsReverse4(d, v4);
+ v5 = st.SortPairsReverse4(d, v5);
+ v6 = st.SortPairsReverse4(d, v6);
+ v7 = st.SortPairsReverse4(d, v7);
+
+ v0 = st.SortPairsDistance1(d, v0);
+ v1 = st.SortPairsDistance1(d, v1);
+ v2 = st.SortPairsDistance1(d, v2);
+ v3 = st.SortPairsDistance1(d, v3);
+ v4 = st.SortPairsDistance1(d, v4);
+ v5 = st.SortPairsDistance1(d, v5);
+ v6 = st.SortPairsDistance1(d, v6);
+ v7 = st.SortPairsDistance1(d, v7);
+}
+
+template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>,
+ HWY_IF_LANES_GT(kKeysPerVector, 1)>
+HWY_INLINE void Merge16x2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
+ V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb,
+ V& vc, V& vd, V& ve, V& vf) {
+ vf = st.ReverseKeys2(d, vf);
+ ve = st.ReverseKeys2(d, ve);
+ vd = st.ReverseKeys2(d, vd);
+ vc = st.ReverseKeys2(d, vc);
+ vb = st.ReverseKeys2(d, vb);
+ va = st.ReverseKeys2(d, va);
+ v9 = st.ReverseKeys2(d, v9);
+ v8 = st.ReverseKeys2(d, v8);
+ st.Sort2(d, v0, vf);
+ st.Sort2(d, v1, ve);
+ st.Sort2(d, v2, vd);
+ st.Sort2(d, v3, vc);
+ st.Sort2(d, v4, vb);
+ st.Sort2(d, v5, va);
+ st.Sort2(d, v6, v9);
+ st.Sort2(d, v7, v8);
+
+ v7 = st.ReverseKeys2(d, v7);
+ v6 = st.ReverseKeys2(d, v6);
+ v5 = st.ReverseKeys2(d, v5);
+ v4 = st.ReverseKeys2(d, v4);
+ vf = st.ReverseKeys2(d, vf);
+ ve = st.ReverseKeys2(d, ve);
+ vd = st.ReverseKeys2(d, vd);
+ vc = st.ReverseKeys2(d, vc);
+ st.Sort2(d, v0, v7);
+ st.Sort2(d, v1, v6);
+ st.Sort2(d, v2, v5);
+ st.Sort2(d, v3, v4);
+ st.Sort2(d, v8, vf);
+ st.Sort2(d, v9, ve);
+ st.Sort2(d, va, vd);
+ st.Sort2(d, vb, vc);
+
+ v3 = st.ReverseKeys2(d, v3);
+ v2 = st.ReverseKeys2(d, v2);
+ v7 = st.ReverseKeys2(d, v7);
+ v6 = st.ReverseKeys2(d, v6);
+ vb = st.ReverseKeys2(d, vb);
+ va = st.ReverseKeys2(d, va);
+ vf = st.ReverseKeys2(d, vf);
+ ve = st.ReverseKeys2(d, ve);
+ st.Sort2(d, v0, v3);
+ st.Sort2(d, v1, v2);
+ st.Sort2(d, v4, v7);
+ st.Sort2(d, v5, v6);
+ st.Sort2(d, v8, vb);
+ st.Sort2(d, v9, va);
+ st.Sort2(d, vc, vf);
+ st.Sort2(d, vd, ve);
+
+ v1 = st.ReverseKeys2(d, v1);
+ v3 = st.ReverseKeys2(d, v3);
+ v5 = st.ReverseKeys2(d, v5);
+ v7 = st.ReverseKeys2(d, v7);
+ v9 = st.ReverseKeys2(d, v9);
+ vb = st.ReverseKeys2(d, vb);
+ vd = st.ReverseKeys2(d, vd);
+ vf = st.ReverseKeys2(d, vf);
+ st.Sort2(d, v0, v1);
+ st.Sort2(d, v2, v3);
+ st.Sort2(d, v4, v5);
+ st.Sort2(d, v6, v7);
+ st.Sort2(d, v8, v9);
+ st.Sort2(d, va, vb);
+ st.Sort2(d, vc, vd);
+ st.Sort2(d, ve, vf);
+
+ v0 = st.SortPairsDistance1(d, v0);
+ v1 = st.SortPairsDistance1(d, v1);
+ v2 = st.SortPairsDistance1(d, v2);
+ v3 = st.SortPairsDistance1(d, v3);
+ v4 = st.SortPairsDistance1(d, v4);
+ v5 = st.SortPairsDistance1(d, v5);
+ v6 = st.SortPairsDistance1(d, v6);
+ v7 = st.SortPairsDistance1(d, v7);
+ v8 = st.SortPairsDistance1(d, v8);
+ v9 = st.SortPairsDistance1(d, v9);
+ va = st.SortPairsDistance1(d, va);
+ vb = st.SortPairsDistance1(d, vb);
+ vc = st.SortPairsDistance1(d, vc);
+ vd = st.SortPairsDistance1(d, vd);
+ ve = st.SortPairsDistance1(d, ve);
+ vf = st.SortPairsDistance1(d, vf);
+}
+
+template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>,
+ HWY_IF_LANES_GT(kKeysPerVector, 2)>
+HWY_INLINE void Merge16x4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
+ V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb,
+ V& vc, V& vd, V& ve, V& vf) {
+ vf = st.ReverseKeys4(d, vf);
+ ve = st.ReverseKeys4(d, ve);
+ vd = st.ReverseKeys4(d, vd);
+ vc = st.ReverseKeys4(d, vc);
+ vb = st.ReverseKeys4(d, vb);
+ va = st.ReverseKeys4(d, va);
+ v9 = st.ReverseKeys4(d, v9);
+ v8 = st.ReverseKeys4(d, v8);
+ st.Sort2(d, v0, vf);
+ st.Sort2(d, v1, ve);
+ st.Sort2(d, v2, vd);
+ st.Sort2(d, v3, vc);
+ st.Sort2(d, v4, vb);
+ st.Sort2(d, v5, va);
+ st.Sort2(d, v6, v9);
+ st.Sort2(d, v7, v8);
+
+ v7 = st.ReverseKeys4(d, v7);
+ v6 = st.ReverseKeys4(d, v6);
+ v5 = st.ReverseKeys4(d, v5);
+ v4 = st.ReverseKeys4(d, v4);
+ vf = st.ReverseKeys4(d, vf);
+ ve = st.ReverseKeys4(d, ve);
+ vd = st.ReverseKeys4(d, vd);
+ vc = st.ReverseKeys4(d, vc);
+ st.Sort2(d, v0, v7);
+ st.Sort2(d, v1, v6);
+ st.Sort2(d, v2, v5);
+ st.Sort2(d, v3, v4);
+ st.Sort2(d, v8, vf);
+ st.Sort2(d, v9, ve);
+ st.Sort2(d, va, vd);
+ st.Sort2(d, vb, vc);
+
+ v3 = st.ReverseKeys4(d, v3);
+ v2 = st.ReverseKeys4(d, v2);
+ v7 = st.ReverseKeys4(d, v7);
+ v6 = st.ReverseKeys4(d, v6);
+ vb = st.ReverseKeys4(d, vb);
+ va = st.ReverseKeys4(d, va);
+ vf = st.ReverseKeys4(d, vf);
+ ve = st.ReverseKeys4(d, ve);
+ st.Sort2(d, v0, v3);
+ st.Sort2(d, v1, v2);
+ st.Sort2(d, v4, v7);
+ st.Sort2(d, v5, v6);
+ st.Sort2(d, v8, vb);
+ st.Sort2(d, v9, va);
+ st.Sort2(d, vc, vf);
+ st.Sort2(d, vd, ve);
+
+ v1 = st.ReverseKeys4(d, v1);
+ v3 = st.ReverseKeys4(d, v3);
+ v5 = st.ReverseKeys4(d, v5);
+ v7 = st.ReverseKeys4(d, v7);
+ v9 = st.ReverseKeys4(d, v9);
+ vb = st.ReverseKeys4(d, vb);
+ vd = st.ReverseKeys4(d, vd);
+ vf = st.ReverseKeys4(d, vf);
+ st.Sort2(d, v0, v1);
+ st.Sort2(d, v2, v3);
+ st.Sort2(d, v4, v5);
+ st.Sort2(d, v6, v7);
+ st.Sort2(d, v8, v9);
+ st.Sort2(d, va, vb);
+ st.Sort2(d, vc, vd);
+ st.Sort2(d, ve, vf);
+
+ v0 = st.SortPairsReverse4(d, v0);
+ v1 = st.SortPairsReverse4(d, v1);
+ v2 = st.SortPairsReverse4(d, v2);
+ v3 = st.SortPairsReverse4(d, v3);
+ v4 = st.SortPairsReverse4(d, v4);
+ v5 = st.SortPairsReverse4(d, v5);
+ v6 = st.SortPairsReverse4(d, v6);
+ v7 = st.SortPairsReverse4(d, v7);
+ v8 = st.SortPairsReverse4(d, v8);
+ v9 = st.SortPairsReverse4(d, v9);
+ va = st.SortPairsReverse4(d, va);
+ vb = st.SortPairsReverse4(d, vb);
+ vc = st.SortPairsReverse4(d, vc);
+ vd = st.SortPairsReverse4(d, vd);
+ ve = st.SortPairsReverse4(d, ve);
+ vf = st.SortPairsReverse4(d, vf);
+
+ v0 = st.SortPairsDistance1(d, v0);
+ v1 = st.SortPairsDistance1(d, v1);
+ v2 = st.SortPairsDistance1(d, v2);
+ v3 = st.SortPairsDistance1(d, v3);
+ v4 = st.SortPairsDistance1(d, v4);
+ v5 = st.SortPairsDistance1(d, v5);
+ v6 = st.SortPairsDistance1(d, v6);
+ v7 = st.SortPairsDistance1(d, v7);
+ v8 = st.SortPairsDistance1(d, v8);
+ v9 = st.SortPairsDistance1(d, v9);
+ va = st.SortPairsDistance1(d, va);
+ vb = st.SortPairsDistance1(d, vb);
+ vc = st.SortPairsDistance1(d, vc);
+ vd = st.SortPairsDistance1(d, vd);
+ ve = st.SortPairsDistance1(d, ve);
+ vf = st.SortPairsDistance1(d, vf);
+}
+
+template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>,
+ HWY_IF_LANES_GT(kKeysPerVector, 4)>
+HWY_INLINE void Merge16x8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
+ V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb,
+ V& vc, V& vd, V& ve, V& vf) {
+ vf = st.ReverseKeys8(d, vf);
+ ve = st.ReverseKeys8(d, ve);
+ vd = st.ReverseKeys8(d, vd);
+ vc = st.ReverseKeys8(d, vc);
+ vb = st.ReverseKeys8(d, vb);
+ va = st.ReverseKeys8(d, va);
+ v9 = st.ReverseKeys8(d, v9);
+ v8 = st.ReverseKeys8(d, v8);
+ st.Sort2(d, v0, vf);
+ st.Sort2(d, v1, ve);
+ st.Sort2(d, v2, vd);
+ st.Sort2(d, v3, vc);
+ st.Sort2(d, v4, vb);
+ st.Sort2(d, v5, va);
+ st.Sort2(d, v6, v9);
+ st.Sort2(d, v7, v8);
+
+ v7 = st.ReverseKeys8(d, v7);
+ v6 = st.ReverseKeys8(d, v6);
+ v5 = st.ReverseKeys8(d, v5);
+ v4 = st.ReverseKeys8(d, v4);
+ vf = st.ReverseKeys8(d, vf);
+ ve = st.ReverseKeys8(d, ve);
+ vd = st.ReverseKeys8(d, vd);
+ vc = st.ReverseKeys8(d, vc);
+ st.Sort2(d, v0, v7);
+ st.Sort2(d, v1, v6);
+ st.Sort2(d, v2, v5);
+ st.Sort2(d, v3, v4);
+ st.Sort2(d, v8, vf);
+ st.Sort2(d, v9, ve);
+ st.Sort2(d, va, vd);
+ st.Sort2(d, vb, vc);
+
+ v3 = st.ReverseKeys8(d, v3);
+ v2 = st.ReverseKeys8(d, v2);
+ v7 = st.ReverseKeys8(d, v7);
+ v6 = st.ReverseKeys8(d, v6);
+ vb = st.ReverseKeys8(d, vb);
+ va = st.ReverseKeys8(d, va);
+ vf = st.ReverseKeys8(d, vf);
+ ve = st.ReverseKeys8(d, ve);
+ st.Sort2(d, v0, v3);
+ st.Sort2(d, v1, v2);
+ st.Sort2(d, v4, v7);
+ st.Sort2(d, v5, v6);
+ st.Sort2(d, v8, vb);
+ st.Sort2(d, v9, va);
+ st.Sort2(d, vc, vf);
+ st.Sort2(d, vd, ve);
+
+ v1 = st.ReverseKeys8(d, v1);
+ v3 = st.ReverseKeys8(d, v3);
+ v5 = st.ReverseKeys8(d, v5);
+ v7 = st.ReverseKeys8(d, v7);
+ v9 = st.ReverseKeys8(d, v9);
+ vb = st.ReverseKeys8(d, vb);
+ vd = st.ReverseKeys8(d, vd);
+ vf = st.ReverseKeys8(d, vf);
+ st.Sort2(d, v0, v1);
+ st.Sort2(d, v2, v3);
+ st.Sort2(d, v4, v5);
+ st.Sort2(d, v6, v7);
+ st.Sort2(d, v8, v9);
+ st.Sort2(d, va, vb);
+ st.Sort2(d, vc, vd);
+ st.Sort2(d, ve, vf);
+
+ v0 = st.SortPairsReverse8(d, v0);
+ v1 = st.SortPairsReverse8(d, v1);
+ v2 = st.SortPairsReverse8(d, v2);
+ v3 = st.SortPairsReverse8(d, v3);
+ v4 = st.SortPairsReverse8(d, v4);
+ v5 = st.SortPairsReverse8(d, v5);
+ v6 = st.SortPairsReverse8(d, v6);
+ v7 = st.SortPairsReverse8(d, v7);
+ v8 = st.SortPairsReverse8(d, v8);
+ v9 = st.SortPairsReverse8(d, v9);
+ va = st.SortPairsReverse8(d, va);
+ vb = st.SortPairsReverse8(d, vb);
+ vc = st.SortPairsReverse8(d, vc);
+ vd = st.SortPairsReverse8(d, vd);
+ ve = st.SortPairsReverse8(d, ve);
+ vf = st.SortPairsReverse8(d, vf);
+
+ v0 = st.SortPairsDistance2(d, v0);
+ v1 = st.SortPairsDistance2(d, v1);
+ v2 = st.SortPairsDistance2(d, v2);
+ v3 = st.SortPairsDistance2(d, v3);
+ v4 = st.SortPairsDistance2(d, v4);
+ v5 = st.SortPairsDistance2(d, v5);
+ v6 = st.SortPairsDistance2(d, v6);
+ v7 = st.SortPairsDistance2(d, v7);
+ v8 = st.SortPairsDistance2(d, v8);
+ v9 = st.SortPairsDistance2(d, v9);
+ va = st.SortPairsDistance2(d, va);
+ vb = st.SortPairsDistance2(d, vb);
+ vc = st.SortPairsDistance2(d, vc);
+ vd = st.SortPairsDistance2(d, vd);
+ ve = st.SortPairsDistance2(d, ve);
+ vf = st.SortPairsDistance2(d, vf);
+
+ v0 = st.SortPairsDistance1(d, v0);
+ v1 = st.SortPairsDistance1(d, v1);
+ v2 = st.SortPairsDistance1(d, v2);
+ v3 = st.SortPairsDistance1(d, v3);
+ v4 = st.SortPairsDistance1(d, v4);
+ v5 = st.SortPairsDistance1(d, v5);
+ v6 = st.SortPairsDistance1(d, v6);
+ v7 = st.SortPairsDistance1(d, v7);
+ v8 = st.SortPairsDistance1(d, v8);
+ v9 = st.SortPairsDistance1(d, v9);
+ va = st.SortPairsDistance1(d, va);
+ vb = st.SortPairsDistance1(d, vb);
+ vc = st.SortPairsDistance1(d, vc);
+ vd = st.SortPairsDistance1(d, vd);
+ ve = st.SortPairsDistance1(d, ve);
+ vf = st.SortPairsDistance1(d, vf);
+}
+
+// Unused on MSVC, see below
+#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
+
+template <size_t kKeysPerVector, class D, class Traits, class V = Vec<D>,
+ HWY_IF_LANES_GT(kKeysPerVector, 8)>
+HWY_INLINE void Merge16x16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
+ V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb,
+ V& vc, V& vd, V& ve, V& vf) {
+ vf = st.ReverseKeys16(d, vf);
+ ve = st.ReverseKeys16(d, ve);
+ vd = st.ReverseKeys16(d, vd);
+ vc = st.ReverseKeys16(d, vc);
+ vb = st.ReverseKeys16(d, vb);
+ va = st.ReverseKeys16(d, va);
+ v9 = st.ReverseKeys16(d, v9);
+ v8 = st.ReverseKeys16(d, v8);
+ st.Sort2(d, v0, vf);
+ st.Sort2(d, v1, ve);
+ st.Sort2(d, v2, vd);
+ st.Sort2(d, v3, vc);
+ st.Sort2(d, v4, vb);
+ st.Sort2(d, v5, va);
+ st.Sort2(d, v6, v9);
+ st.Sort2(d, v7, v8);
+
+ v7 = st.ReverseKeys16(d, v7);
+ v6 = st.ReverseKeys16(d, v6);
+ v5 = st.ReverseKeys16(d, v5);
+ v4 = st.ReverseKeys16(d, v4);
+ vf = st.ReverseKeys16(d, vf);
+ ve = st.ReverseKeys16(d, ve);
+ vd = st.ReverseKeys16(d, vd);
+ vc = st.ReverseKeys16(d, vc);
+ st.Sort2(d, v0, v7);
+ st.Sort2(d, v1, v6);
+ st.Sort2(d, v2, v5);
+ st.Sort2(d, v3, v4);
+ st.Sort2(d, v8, vf);
+ st.Sort2(d, v9, ve);
+ st.Sort2(d, va, vd);
+ st.Sort2(d, vb, vc);
+
+ v3 = st.ReverseKeys16(d, v3);
+ v2 = st.ReverseKeys16(d, v2);
+ v7 = st.ReverseKeys16(d, v7);
+ v6 = st.ReverseKeys16(d, v6);
+ vb = st.ReverseKeys16(d, vb);
+ va = st.ReverseKeys16(d, va);
+ vf = st.ReverseKeys16(d, vf);
+ ve = st.ReverseKeys16(d, ve);
+ st.Sort2(d, v0, v3);
+ st.Sort2(d, v1, v2);
+ st.Sort2(d, v4, v7);
+ st.Sort2(d, v5, v6);
+ st.Sort2(d, v8, vb);
+ st.Sort2(d, v9, va);
+ st.Sort2(d, vc, vf);
+ st.Sort2(d, vd, ve);
+
+ v1 = st.ReverseKeys16(d, v1);
+ v3 = st.ReverseKeys16(d, v3);
+ v5 = st.ReverseKeys16(d, v5);
+ v7 = st.ReverseKeys16(d, v7);
+ v9 = st.ReverseKeys16(d, v9);
+ vb = st.ReverseKeys16(d, vb);
+ vd = st.ReverseKeys16(d, vd);
+ vf = st.ReverseKeys16(d, vf);
+ st.Sort2(d, v0, v1);
+ st.Sort2(d, v2, v3);
+ st.Sort2(d, v4, v5);
+ st.Sort2(d, v6, v7);
+ st.Sort2(d, v8, v9);
+ st.Sort2(d, va, vb);
+ st.Sort2(d, vc, vd);
+ st.Sort2(d, ve, vf);
+
+ v0 = st.SortPairsReverse16(d, v0);
+ v1 = st.SortPairsReverse16(d, v1);
+ v2 = st.SortPairsReverse16(d, v2);
+ v3 = st.SortPairsReverse16(d, v3);
+ v4 = st.SortPairsReverse16(d, v4);
+ v5 = st.SortPairsReverse16(d, v5);
+ v6 = st.SortPairsReverse16(d, v6);
+ v7 = st.SortPairsReverse16(d, v7);
+ v8 = st.SortPairsReverse16(d, v8);
+ v9 = st.SortPairsReverse16(d, v9);
+ va = st.SortPairsReverse16(d, va);
+ vb = st.SortPairsReverse16(d, vb);
+ vc = st.SortPairsReverse16(d, vc);
+ vd = st.SortPairsReverse16(d, vd);
+ ve = st.SortPairsReverse16(d, ve);
+ vf = st.SortPairsReverse16(d, vf);
+
+ v0 = st.SortPairsDistance4(d, v0);
+ v1 = st.SortPairsDistance4(d, v1);
+ v2 = st.SortPairsDistance4(d, v2);
+ v3 = st.SortPairsDistance4(d, v3);
+ v4 = st.SortPairsDistance4(d, v4);
+ v5 = st.SortPairsDistance4(d, v5);
+ v6 = st.SortPairsDistance4(d, v6);
+ v7 = st.SortPairsDistance4(d, v7);
+ v8 = st.SortPairsDistance4(d, v8);
+ v9 = st.SortPairsDistance4(d, v9);
+ va = st.SortPairsDistance4(d, va);
+ vb = st.SortPairsDistance4(d, vb);
+ vc = st.SortPairsDistance4(d, vc);
+ vd = st.SortPairsDistance4(d, vd);
+ ve = st.SortPairsDistance4(d, ve);
+ vf = st.SortPairsDistance4(d, vf);
+
+ v0 = st.SortPairsDistance2(d, v0);
+ v1 = st.SortPairsDistance2(d, v1);
+ v2 = st.SortPairsDistance2(d, v2);
+ v3 = st.SortPairsDistance2(d, v3);
+ v4 = st.SortPairsDistance2(d, v4);
+ v5 = st.SortPairsDistance2(d, v5);
+ v6 = st.SortPairsDistance2(d, v6);
+ v7 = st.SortPairsDistance2(d, v7);
+ v8 = st.SortPairsDistance2(d, v8);
+ v9 = st.SortPairsDistance2(d, v9);
+ va = st.SortPairsDistance2(d, va);
+ vb = st.SortPairsDistance2(d, vb);
+ vc = st.SortPairsDistance2(d, vc);
+ vd = st.SortPairsDistance2(d, vd);
+ ve = st.SortPairsDistance2(d, ve);
+ vf = st.SortPairsDistance2(d, vf);
+
+ v0 = st.SortPairsDistance1(d, v0);
+ v1 = st.SortPairsDistance1(d, v1);
+ v2 = st.SortPairsDistance1(d, v2);
+ v3 = st.SortPairsDistance1(d, v3);
+ v4 = st.SortPairsDistance1(d, v4);
+ v5 = st.SortPairsDistance1(d, v5);
+ v6 = st.SortPairsDistance1(d, v6);
+ v7 = st.SortPairsDistance1(d, v7);
+ v8 = st.SortPairsDistance1(d, v8);
+ v9 = st.SortPairsDistance1(d, v9);
+ va = st.SortPairsDistance1(d, va);
+ vb = st.SortPairsDistance1(d, vb);
+ vc = st.SortPairsDistance1(d, vc);
+ vd = st.SortPairsDistance1(d, vd);
+ ve = st.SortPairsDistance1(d, ve);
+ vf = st.SortPairsDistance1(d, vf);
+}
+
+#endif // !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
+
+// Treats the 16 input vectors as rows of a matrix, sorts each column
+// independently, and then merges them into a sorted 1D array without
+// transposing.
+//
+// DEPRECATED, use BaseCase() instead.
+template <class Traits, class V>
+HWY_INLINE void SortingNetwork(Traits st, size_t cols, V& v0, V& v1, V& v2,
+ V& v3, V& v4, V& v5, V& v6, V& v7, V& v8, V& v9,
+ V& va, V& vb, V& vc, V& vd, V& ve, V& vf) {
+ // traits*-inl assume 'full' vectors (but still capped to kMaxCols).
+ const CappedTag<typename Traits::LaneType, Constants::kMaxCols> d;
+
+ HWY_DASSERT(cols <= Constants::kMaxCols);
+
+ // The network width depends on the number of keys, not lanes.
+ constexpr size_t kLanesPerKey = st.LanesPerKey();
+ const size_t keys = cols / kLanesPerKey;
+ constexpr size_t kMaxKeys = MaxLanes(d) / kLanesPerKey;
+
+ Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, vf);
+
+ // Checking MaxLanes avoids generating HWY_ASSERT code for the unreachable
+ // code paths: if MaxLanes < 2, then keys <= cols < 2.
+ if (HWY_LIKELY(keys >= 2 && kMaxKeys >= 2)) {
+ Merge16x2<kMaxKeys>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb,
+ vc, vd, ve, vf);
+
+ if (HWY_LIKELY(keys >= 4 && kMaxKeys >= 4)) {
+ Merge16x4<kMaxKeys>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb,
+ vc, vd, ve, vf);
+
+ if (HWY_LIKELY(keys >= 8 && kMaxKeys >= 8)) {
+ Merge16x8<kMaxKeys>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va,
+ vb, vc, vd, ve, vf);
+
+ // Avoids build timeout. Must match #if condition in kMaxCols.
+#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
+ if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) {
+ Merge16x16<kMaxKeys>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
+ va, vb, vc, vd, ve, vf);
+
+ static_assert(Constants::kMaxCols <= 16, "Add more branches");
+ }
+#endif
+ }
+ }
+ }
+}
+
+// As above, but loads from/stores to `buf`. This ensures full vectors are
+// aligned, and enables loads/stores without bounds checks.
+//
+// DEPRECATED, use BaseCase() instead.
+template <class Traits, typename T>
+HWY_NOINLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
+ // traits*-inl assume 'full' vectors (but still capped to kMaxCols).
+ // However, for smaller arrays and sub-maximal `cols` we have overlapping
+ // loads where only the lowest `cols` are valid, and we skip Merge16 etc.
+ const CappedTag<T, Constants::kMaxCols> d;
+ using V = decltype(Zero(d));
+
+ HWY_DASSERT(cols <= Constants::kMaxCols);
+
+ // These are aligned iff cols == Lanes(d). We prefer unaligned/non-constexpr
+ // offsets to duplicating this code for every value of cols.
+ static_assert(Constants::kMaxRows == 16, "Update loads/stores/args");
+ V v0 = LoadU(d, buf + 0x0 * cols);
+ V v1 = LoadU(d, buf + 0x1 * cols);
+ V v2 = LoadU(d, buf + 0x2 * cols);
+ V v3 = LoadU(d, buf + 0x3 * cols);
+ V v4 = LoadU(d, buf + 0x4 * cols);
+ V v5 = LoadU(d, buf + 0x5 * cols);
+ V v6 = LoadU(d, buf + 0x6 * cols);
+ V v7 = LoadU(d, buf + 0x7 * cols);
+ V v8 = LoadU(d, buf + 0x8 * cols);
+ V v9 = LoadU(d, buf + 0x9 * cols);
+ V va = LoadU(d, buf + 0xa * cols);
+ V vb = LoadU(d, buf + 0xb * cols);
+ V vc = LoadU(d, buf + 0xc * cols);
+ V vd = LoadU(d, buf + 0xd * cols);
+ V ve = LoadU(d, buf + 0xe * cols);
+ V vf = LoadU(d, buf + 0xf * cols);
+
+ SortingNetwork(st, cols, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc,
+ vd, ve, vf);
+
+ StoreU(v0, d, buf + 0x0 * cols);
+ StoreU(v1, d, buf + 0x1 * cols);
+ StoreU(v2, d, buf + 0x2 * cols);
+ StoreU(v3, d, buf + 0x3 * cols);
+ StoreU(v4, d, buf + 0x4 * cols);
+ StoreU(v5, d, buf + 0x5 * cols);
+ StoreU(v6, d, buf + 0x6 * cols);
+ StoreU(v7, d, buf + 0x7 * cols);
+ StoreU(v8, d, buf + 0x8 * cols);
+ StoreU(v9, d, buf + 0x9 * cols);
+ StoreU(va, d, buf + 0xa * cols);
+ StoreU(vb, d, buf + 0xb * cols);
+ StoreU(vc, d, buf + 0xc * cols);
+ StoreU(vd, d, buf + 0xd * cols);
+ StoreU(ve, d, buf + 0xe * cols);
+ StoreU(vf, d, buf + 0xf * cols);
+}
+
+#else
+template <class Base>
+struct SharedTraits : public Base {};
+#endif // VQSORT_ENABLED
+
+} // namespace detail
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif // HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
diff --git a/third_party/highway/hwy/contrib/sort/traits-inl.h b/third_party/highway/hwy/contrib/sort/traits-inl.h
new file mode 100644
index 0000000000..732f87ee23
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/traits-inl.h
@@ -0,0 +1,561 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE) == \
+ defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
+#endif
+
+#include "hwy/contrib/sort/shared-inl.h" // SortConstants
+#include "hwy/contrib/sort/vqsort.h" // SortDescending
+#include "hwy/highway.h"
+#include "hwy/print.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+namespace detail {
+
+// Base class of both KeyLane (with or without VQSORT_ENABLED)
+template <typename T>
+struct KeyLaneBase {
+ static constexpr bool Is128() { return false; }
+ constexpr size_t LanesPerKey() const { return 1; }
+
+ // What type bench_sort should allocate for generating inputs.
+ using LaneType = T;
+ // What type to pass to VQSort.
+ using KeyType = T;
+
+ const char* KeyString() const {
+ return IsSame<T, float>() ? "f32"
+ : IsSame<T, double>() ? "f64"
+ : IsSame<T, int16_t>() ? "i16"
+ : IsSame<T, int32_t>() ? "i32"
+ : IsSame<T, int64_t>() ? "i64"
+ : IsSame<T, uint16_t>() ? "u16"
+ : IsSame<T, uint32_t>() ? "u32"
+ : IsSame<T, uint64_t>() ? "u64"
+ : "?";
+ }
+};
+
+#if VQSORT_ENABLED || HWY_IDE
+
+// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
+// along with an abstraction layer for single-lane vs. lane-pair, which is
+// independent of the order.
+template <typename T>
+struct KeyLane : public KeyLaneBase<T> {
+ // False indicates the entire key (i.e. lane) should be compared. KV stands
+ // for key-value.
+ static constexpr bool IsKV() { return false; }
+
+ // For HeapSort
+ HWY_INLINE void Swap(T* a, T* b) const {
+ const T temp = *a;
+ *a = *b;
+ *b = temp;
+ }
+
+ template <class V, class M>
+ HWY_INLINE V CompressKeys(V keys, M mask) const {
+ return CompressNot(keys, mask);
+ }
+
+ // Broadcasts one key into a vector
+ template <class D>
+ HWY_INLINE Vec<D> SetKey(D d, const T* key) const {
+ return Set(d, *key);
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
+ return Eq(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> NotEqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
+ return Ne(a, b);
+ }
+
+ // For keys=lanes, any difference counts.
+ template <class D>
+ HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
+ // Must avoid floating-point comparisons (for -0)
+ const RebindToUnsigned<D> du;
+ return AllTrue(du, Eq(BitCast(du, diff), Zero(du)));
+ }
+
+ HWY_INLINE bool Equal1(const T* a, const T* b) const { return *a == *b; }
+
+ template <class D>
+ HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
+ return Reverse(d, v);
+ }
+
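+ // Reverses keys within each adjacent pair of lanes: [a,b,c,d] -> [b,a,d,c].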
+ template <class D>
+ HWY_INLINE Vec<D> ReverseKeys2(D d, Vec<D> v) const {
+ return Reverse2(d, v);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> ReverseKeys4(D d, Vec<D> v) const {
+ return Reverse4(d, v);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> ReverseKeys8(D d, Vec<D> v) const {
+ return Reverse8(d, v);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> ReverseKeys16(D d, Vec<D> v) const {
+ static_assert(SortConstants::kMaxCols <= 16, "Assumes u32x16 = 512 bit");
+ return ReverseKeys(d, v);
+ }
+
+ template <class V>
+ HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
+ return OddEven(odd, even);
+ }
+
+ template <class D, HWY_IF_T_SIZE_D(D, 2)>
+ HWY_INLINE Vec<D> SwapAdjacentPairs(D d, const Vec<D> v) const {
+ const Repartition<uint32_t, D> du32;
+ return BitCast(d, Shuffle2301(BitCast(du32, v)));
+ }
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
+ HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
+ return Shuffle1032(v);
+ }
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
+ HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
+ return SwapAdjacentBlocks(v);
+ }
+
+ template <class D, HWY_IF_NOT_T_SIZE_D(D, 8)>
+ HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
+#if HWY_HAVE_FLOAT64 // in case D is float32
+ const RepartitionToWide<D> dw;
+#else
+ const RepartitionToWide<RebindToUnsigned<D> > dw;
+#endif
+ return BitCast(d, SwapAdjacentPairs(dw, BitCast(dw, v)));
+ }
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
+ HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
+ // Assumes max vector size = 512
+ return ConcatLowerUpper(d, v, v);
+ }
+
+ template <class D, HWY_IF_NOT_T_SIZE_D(D, 8)>
+ HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
+ const Vec<D> even) const {
+#if HWY_HAVE_FLOAT64 // in case D is float32
+ const RepartitionToWide<D> dw;
+#else
+ const RepartitionToWide<RebindToUnsigned<D> > dw;
+#endif
+ return BitCast(d, OddEven(BitCast(dw, odd), BitCast(dw, even)));
+ }
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
+ HWY_INLINE Vec<D> OddEvenPairs(D /* tag */, Vec<D> odd, Vec<D> even) const {
+ return OddEvenBlocks(odd, even);
+ }
+
+ template <class D, HWY_IF_NOT_T_SIZE_D(D, 8)>
+ HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
+#if HWY_HAVE_FLOAT64 // in case D is float32
+ const RepartitionToWide<D> dw;
+#else
+ const RepartitionToWide<RebindToUnsigned<D> > dw;
+#endif
+ return BitCast(d, OddEvenPairs(dw, BitCast(dw, odd), BitCast(dw, even)));
+ }
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
+ HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
+ return ConcatUpperLower(d, odd, even);
+ }
+};
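+
+// For orientation, the effect of the helpers above on the u32 lanes
+// {0,1,2,3,4,5,6,7} of a 256-bit vector (one key per lane, lane 0 first;
+// illustrative only):
+//   ReverseKeys2:      {1,0, 3,2, 5,4, 7,6}  (swap within pairs)
+//   ReverseKeys4:      {3,2,1,0, 7,6,5,4}    (reverse within quads)
+//   SwapAdjacentPairs: {2,3, 0,1, 6,7, 4,5}  (exchange neighboring pairs)
+//   SwapAdjacentQuads: {4,5,6,7, 0,1,2,3}    (exchange neighboring quads)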
+
+// Anything order-related depends on the key traits *and* the order (see
+// FirstOfLanes). We cannot implement just one Compare function because Lt128
+// only compiles if the lane type is u64. Thus we need either overloaded
+// functions with a tag type, class specializations, or separate classes.
+// We avoid overloaded functions because we want all functions to be callable
+// from a SortTraits without per-function wrappers. Specializing would work, but
+// we are anyway going to specialize at a higher level.
+template <typename T>
+struct OrderAscending : public KeyLane<T> {
+ using Order = SortAscending;
+
+ HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; }
+
+ template <class D>
+ HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
+ return Lt(a, b);
+ }
+
+ // Two halves of Sort2, used in ScanMinMax.
+ template <class D>
+ HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+ return Min(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+ return Max(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
+ T* HWY_RESTRICT /* buf */) const {
+ return MinOfLanes(d, v);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
+ T* HWY_RESTRICT /* buf */) const {
+ return MaxOfLanes(d, v);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> FirstValue(D d) const {
+ return Set(d, hwy::LowestValue<T>());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastValue(D d) const {
+ return Set(d, hwy::HighestValue<T>());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+ return Sub(v, Set(d, hwy::Epsilon<T>()));
+ }
+};
+
+template <typename T>
+struct OrderDescending : public KeyLane<T> {
+ using Order = SortDescending;
+
+ HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; }
+
+ template <class D>
+ HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
+ return Lt(b, a);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+ return Max(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+ return Min(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
+ T* HWY_RESTRICT /* buf */) const {
+ return MaxOfLanes(d, v);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
+ T* HWY_RESTRICT /* buf */) const {
+ return MinOfLanes(d, v);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> FirstValue(D d) const {
+ return Set(d, hwy::HighestValue<T>());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastValue(D d) const {
+ return Set(d, hwy::LowestValue<T>());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+ return Add(v, Set(d, hwy::Epsilon<T>()));
+ }
+};
+
+struct KeyValue64 : public KeyLane<uint64_t> {
+ // True indicates only part of each lane (the upper 32 bits, which hold the
+ // key) should be compared. KV stands for key-value.
+ static constexpr bool IsKV() { return true; }
+
+ template <class D>
+ HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
+ return Eq(ShiftRight<32>(a), ShiftRight<32>(b));
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> NotEqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
+ return Ne(ShiftRight<32>(a), ShiftRight<32>(b));
+ }
+
+ HWY_INLINE bool Equal1(const uint64_t* a, const uint64_t* b) const {
+ return (*a >> 32) == (*b >> 32);
+ }
+
+ // Only count differences in the actual key, not the value.
+ template <class D>
+ HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
+ // Must avoid floating-point comparisons (for -0)
+ const RebindToUnsigned<D> du;
+ const Vec<decltype(du)> zero = Zero(du);
+ const Vec<decltype(du)> keys = ShiftRight<32>(diff); // clear values
+ return AllTrue(du, Eq(BitCast(du, keys), zero));
+ }
+};
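+
+// Sketch of the lane layout KeyValue64 assumes: the key is the upper 32 bits
+// of each u64 lane and the value the lower 32 bits. A caller might pack them
+// via a helper such as (hypothetical, for illustration only):
+//   uint64_t MakeKV64(uint32_t key, uint32_t value) {
+//     return (uint64_t{key} << 32) | value;
+//   }
+// The orders below compare only the key; ties may still be broken by value
+// because the sort is not stable.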
+
+struct OrderAscendingKV64 : public KeyValue64 {
+ using Order = SortAscending;
+
+ HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
+ return (*a >> 32) < (*b >> 32);
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
+ return Lt(ShiftRight<32>(a), ShiftRight<32>(b));
+ }
+
+ // Not required to be stable (preserving the order of equivalent keys), so
+ // we can include the value in the comparison.
+ template <class D>
+ HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+ return Min(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+ return Max(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
+ uint64_t* HWY_RESTRICT /* buf */) const {
+ return MinOfLanes(d, v);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
+ uint64_t* HWY_RESTRICT /* buf */) const {
+ return MaxOfLanes(d, v);
+ }
+
+ // Same as for regular lanes.
+ template <class D>
+ HWY_INLINE Vec<D> FirstValue(D d) const {
+ return Set(d, hwy::LowestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastValue(D d) const {
+ return Set(d, hwy::HighestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+ return Sub(v, Set(d, uint64_t{1}));
+ }
+};
+
+struct OrderDescendingKV64 : public KeyValue64 {
+ using Order = SortDescending;
+
+ HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
+ return (*b >> 32) < (*a >> 32);
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
+ return Lt(ShiftRight<32>(b), ShiftRight<32>(a));
+ }
+
+ // Not required to be stable (preserving the order of equivalent keys), so
+ // we can include the value in the comparison.
+ template <class D>
+ HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+ return Max(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+ return Min(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
+ uint64_t* HWY_RESTRICT /* buf */) const {
+ return MaxOfLanes(d, v);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
+ uint64_t* HWY_RESTRICT /* buf */) const {
+ return MinOfLanes(d, v);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> FirstValue(D d) const {
+ return Set(d, hwy::HighestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastValue(D d) const {
+ return Set(d, hwy::LowestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+ return Add(v, Set(d, uint64_t{1}));
+ }
+};
+
+// Shared code that depends on Order.
+template <class Base>
+struct TraitsLane : public Base {
+ // For each lane i: replaces a[i] with the first and b[i] with the second
+ // according to Base.
+ // Corresponds to a conditional swap, which is one "node" of a sorting
+ // network. Min/Max are cheaper than compare + blend at least for integers.
+ template <class D>
+ HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
+ const Base* base = static_cast<const Base*>(this);
+
+ const Vec<D> a_copy = a;
+ // Prior to AVX3, there is no native 64-bit Min/Max, so they compile to 4
+ // instructions. We can reduce it to a compare + 2 IfThenElse.
+#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
+ if (sizeof(TFromD<D>) == 8) {
+ const Mask<D> cmp = base->Compare(d, a, b);
+ a = IfThenElse(cmp, a, b);
+ b = IfThenElse(cmp, b, a_copy);
+ return;
+ }
+#endif
+ a = base->First(d, a, b);
+ b = base->Last(d, a_copy, b);
+ }
+
+ // Conditionally swaps even-numbered lanes with their odd-numbered neighbor.
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
+ HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
+ const Base* base = static_cast<const Base*>(this);
+ Vec<D> swapped = base->ReverseKeys2(d, v);
+ // Further to the above optimization, Sort2+OddEvenKeys compile to four
+ // instructions; we can save one by combining two blends.
+#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
+ const Vec<D> cmp = VecFromMask(d, base->Compare(d, v, swapped));
+ return IfVecThenElse(DupOdd(cmp), swapped, v);
+#else
+ Sort2(d, v, swapped);
+ return base->OddEvenKeys(swapped, v);
+#endif
+ }
+
+ // (See above - we use Sort2 for non-64-bit types.)
+ template <class D, HWY_IF_NOT_T_SIZE_D(D, 8)>
+ HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
+ const Base* base = static_cast<const Base*>(this);
+ Vec<D> swapped = base->ReverseKeys2(d, v);
+ Sort2(d, v, swapped);
+ return base->OddEvenKeys(swapped, v);
+ }
+
+ // Swaps with the vector formed by reversing contiguous groups of 4 keys.
+ template <class D>
+ HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
+ const Base* base = static_cast<const Base*>(this);
+ Vec<D> swapped = base->ReverseKeys4(d, v);
+ Sort2(d, v, swapped);
+ return base->OddEvenPairs(d, swapped, v);
+ }
+
+ // Conditionally swaps lane 0 with 4, 1 with 5 etc.
+ template <class D>
+ HWY_INLINE Vec<D> SortPairsDistance4(D d, Vec<D> v) const {
+ const Base* base = static_cast<const Base*>(this);
+ Vec<D> swapped = base->SwapAdjacentQuads(d, v);
+ // Only used in Merge16, which 64-bit keys on AVX2 (only 4 u64 lanes) never
+ // reach, so the blend optimization from SortPairsDistance1 is unnecessary
+ // here.
+ Sort2(d, v, swapped);
+ return base->OddEvenQuads(d, swapped, v);
+ }
+};
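+
+// Usage sketch, assuming the SharedTraits wrapper referenced by vqsort-inl.h:
+// the Order* class supplies Compare/First/Last, and TraitsLane layers the
+// sorting-network building blocks on top.
+//   const ScalableTag<int32_t> d;
+//   SharedTraits<TraitsLane<OrderAscending<int32_t> > > st;
+//   Vec<decltype(d)> a = ..., b = ...;  // two vectors of keys
+//   st.Sort2(d, a, b);  // afterwards a[i] <= b[i] holds in every lane i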
+
+#else
+
+template <typename T>
+struct OrderAscending : public KeyLaneBase<T> {
+ using Order = SortAscending;
+
+ HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; }
+
+ template <class D>
+ HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) {
+ return Lt(a, b);
+ }
+};
+
+template <typename T>
+struct OrderDescending : public KeyLaneBase<T> {
+ using Order = SortDescending;
+
+ HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; }
+
+ template <class D>
+ HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) {
+ return Lt(b, a);
+ }
+};
+
+template <class Order>
+struct TraitsLane : public Order {
+ // For HeapSort
+ template <typename T> // MSVC doesn't find typename Order::LaneType.
+ HWY_INLINE void Swap(T* a, T* b) const {
+ const T temp = *a;
+ *a = *b;
+ *b = temp;
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
+ return Set(d, *key);
+ }
+};
+
+#endif // VQSORT_ENABLED
+
+} // namespace detail
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif // HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
diff --git a/third_party/highway/hwy/contrib/sort/traits128-inl.h b/third_party/highway/hwy/contrib/sort/traits128-inl.h
new file mode 100644
index 0000000000..ba9207c533
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/traits128-inl.h
@@ -0,0 +1,529 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE) == \
+ defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
+#endif
+
+#include <string>
+
+#include "hwy/contrib/sort/shared-inl.h"
+#include "hwy/contrib/sort/vqsort.h" // SortDescending
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+namespace detail {
+
+#if VQSORT_ENABLED || HWY_IDE
+
+// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
+// along with an abstraction layer for single-lane vs. lane-pair, which is
+// independent of the order.
+struct KeyAny128 {
+ static constexpr bool Is128() { return true; }
+ constexpr size_t LanesPerKey() const { return 2; }
+
+ // What type bench_sort should allocate for generating inputs.
+ using LaneType = uint64_t;
+ // KeyType and KeyString are defined by derived classes.
+
+ HWY_INLINE void Swap(LaneType* a, LaneType* b) const {
+ const FixedTag<LaneType, 2> d;
+ const auto temp = LoadU(d, a);
+ StoreU(LoadU(d, b), d, a);
+ StoreU(temp, d, b);
+ }
+
+ template <class V, class M>
+ HWY_INLINE V CompressKeys(V keys, M mask) const {
+ return CompressBlocksNot(keys, mask);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
+ return LoadDup128(d, key);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
+ return ReverseBlocks(d, v);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> ReverseKeys2(D /* tag */, const Vec<D> v) const {
+ return SwapAdjacentBlocks(v);
+ }
+
+ // Only called for 4 keys because we do not support >512-bit vectors.
+ template <class D>
+ HWY_INLINE Vec<D> ReverseKeys4(D d, const Vec<D> v) const {
+ HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
+ return ReverseKeys(d, v);
+ }
+
+ // Only called for 4 keys because we do not support >512-bit vectors.
+ template <class D>
+ HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
+ const Vec<D> even) const {
+ HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
+ return ConcatUpperLower(d, odd, even);
+ }
+
+ template <class V>
+ HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
+ return OddEvenBlocks(odd, even);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> ReverseKeys8(D, Vec<D>) const {
+ HWY_ASSERT(0); // not supported: would require 1024-bit vectors
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> ReverseKeys16(D, Vec<D>) const {
+ HWY_ASSERT(0); // not supported: would require 2048-bit vectors
+ }
+
+ // This is only called for 8/16 col networks (not supported).
+ template <class D>
+ HWY_INLINE Vec<D> SwapAdjacentPairs(D, Vec<D>) const {
+ HWY_ASSERT(0);
+ }
+
+ // This is only called for 16 col networks (not supported).
+ template <class D>
+ HWY_INLINE Vec<D> SwapAdjacentQuads(D, Vec<D>) const {
+ HWY_ASSERT(0);
+ }
+
+ // This is only called for 8 col networks (not supported).
+ template <class D>
+ HWY_INLINE Vec<D> OddEvenQuads(D, Vec<D>, Vec<D>) const {
+ HWY_ASSERT(0);
+ }
+};
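+
+// Illustration of the lane-pair representation: each 128-bit key occupies two
+// adjacent u64 lanes, low half first in memory, e.g.
+//   uint64_t lanes[4] = {loA, hiA, loB, hiB};  // keys A and B
+//   KeyAny128().Swap(lanes + 0, lanes + 2);    // now {loB, hiB, loA, hiA}
+// SetKey then broadcasts one such pair into every 128-bit block of a vector.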
+
+// Base class shared between OrderAscending128, OrderDescending128.
+struct Key128 : public KeyAny128 {
+ // False indicates the entire key should be compared. KV means key-value.
+ static constexpr bool IsKV() { return false; }
+
+ // What type to pass to VQSort.
+ using KeyType = hwy::uint128_t;
+
+ const char* KeyString() const { return "U128"; }
+
+ template <class D>
+ HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
+ return Eq128(d, a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
+ return Ne128(d, a, b);
+ }
+
+ // For keys=entire 128 bits, any difference counts.
+ template <class D>
+ HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
+ // Must avoid floating-point comparisons (for -0)
+ const RebindToUnsigned<D> du;
+ return AllTrue(du, Eq(BitCast(du, diff), Zero(du)));
+ }
+
+ HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const {
+ return a[0] == b[0] && a[1] == b[1];
+ }
+
+ // Returns vector with only the top half of each block valid. This allows
+ // fusing the "replicate upper to lower half" step with a subsequent permute.
+ template <class Order, class D>
+ HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const {
+ const Mask<D> eqHL = Eq(a, b);
+ const Vec<D> ltHL = VecFromMask(d, Order().CompareLanes(a, b));
+#if HWY_TARGET <= HWY_AVX2 // slightly faster
+ const Vec<D> ltLX = ShiftLeftLanes<1>(ltHL);
+ return OrAnd(ltHL, VecFromMask(d, eqHL), ltLX);
+#else
+ return IfThenElse(eqHL, DupEven(ltHL), ltHL);
+#endif
+ }
+};
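+
+// Worked example of CompareTop with Order = OrderAscending128 (below), lanes
+// per block ordered {lo, hi}: for a = {3, 7} and b = {9, 7}, eqHL =
+// {false, true} and ltHL = {true, false}. The hi lanes are equal, so the top
+// lane of the result takes the lo-lane comparison, i.e. true: a < b as
+// expected. Only the top lane of each block is meaningful.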
+
+// Anything order-related depends on the key traits *and* the order (see
+// FirstOfLanes). We cannot implement just one Compare function because Lt128
+// only compiles if the lane type is u64. Thus we need either overloaded
+// functions with a tag type, class specializations, or separate classes.
+// We avoid overloaded functions because we want all functions to be callable
+// from a SortTraits without per-function wrappers. Specializing would work, but
+// we are anyway going to specialize at a higher level.
+struct OrderAscending128 : public Key128 {
+ using Order = SortAscending;
+
+ HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
+ return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
+ return Lt128(d, a, b);
+ }
+
+ // Used by CompareTop
+ template <class V>
+ HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
+ return Lt(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
+ return Min128(d, a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
+ return Max128(d, a, b);
+ }
+
+ // Same as for regular lanes because 128-bit keys are u64.
+ template <class D>
+ HWY_INLINE Vec<D> FirstValue(D d) const {
+ return Set(d, hwy::LowestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastValue(D d) const {
+ return Set(d, hwy::HighestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+ const Vec<D> k0 = Zero(d);
+ const Vec<D> k1 = OddEven(k0, Set(d, uint64_t{1}));
+ const Mask<D> borrow = Eq(v, k0); // don't-care, lo == 0
+ // lo == 0? 1 : 0, 0
+ const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(borrow, k1));
+ return Sub(Sub(v, k1), adjust);
+ }
+};
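+
+// PrevValue above is a 128-bit decrement with borrow; lanes per block are
+// {lo, hi}. Worked example for v = {0, 5}: k1 = {1, 0}, so Sub(v, k1) wraps
+// the low lane to ~0ull. Because lo was 0, `adjust` becomes {0, 1} and the
+// result is {~0ull, 4}, which is exactly (5 << 64) - 1.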
+
+struct OrderDescending128 : public Key128 {
+ using Order = SortDescending;
+
+ HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
+ return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
+ return Lt128(d, b, a);
+ }
+
+ // Used by CompareTop
+ template <class V>
+ HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
+ return Lt(b, a);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
+ return Max128(d, a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
+ return Min128(d, a, b);
+ }
+
+ // Same as for regular lanes because 128-bit keys are u64.
+ template <class D>
+ HWY_INLINE Vec<D> FirstValue(D d) const {
+ return Set(d, hwy::HighestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastValue(D d) const {
+ return Set(d, hwy::LowestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+ const Vec<D> k1 = OddEven(Zero(d), Set(d, uint64_t{1}));
+ const Vec<D> added = Add(v, k1);
+ const Mask<D> overflowed = Lt(added, v); // false, overflowed
+ // overflowed? 1 : 0, 0
+ const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(overflowed, k1));
+ return Add(added, adjust);
+ }
+};
+
+// Base class shared between OrderAscendingKV128, OrderDescendingKV128.
+struct KeyValue128 : public KeyAny128 {
+ // True indicates only part of the key (the more significant lane) should be
+ // compared. KV stands for key-value.
+ static constexpr bool IsKV() { return true; }
+
+ // What type to pass to VQSort.
+ using KeyType = K64V64;
+
+ const char* KeyString() const { return "KV128"; }
+
+ template <class D>
+ HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
+ return Eq128Upper(d, a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
+ return Ne128Upper(d, a, b);
+ }
+
+ // Only count differences in the actual key, not the value.
+ template <class D>
+ HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
+ // Must avoid floating-point comparisons (for -0)
+ const RebindToUnsigned<D> du;
+ const Vec<decltype(du)> zero = Zero(du);
+ const Vec<decltype(du)> keys = OddEven(diff, zero); // clear values
+ return AllTrue(du, Eq(BitCast(du, keys), zero));
+ }
+
+ HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const {
+ return a[1] == b[1];
+ }
+
+ // Returns vector with only the top half of each block valid. This allows
+ // fusing the "replicate upper to lower half" step with a subsequent permute.
+ template <class Order, class D>
+ HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const {
+ // Only the upper lane of each block is a key, and only that lane is
+ // required to be valid, so comparing all lanes is sufficient.
+ return VecFromMask(d, Order().CompareLanes(a, b));
+ }
+};
+
+struct OrderAscendingKV128 : public KeyValue128 {
+ using Order = SortAscending;
+
+ HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
+ return a[1] < b[1];
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
+ return Lt128Upper(d, a, b);
+ }
+
+ // Used by CompareTop
+ template <class V>
+ HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
+ return Lt(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
+ return Min128Upper(d, a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
+ return Max128Upper(d, a, b);
+ }
+
+ // Same as for regular lanes because 128-bit keys are u64.
+ template <class D>
+ HWY_INLINE Vec<D> FirstValue(D d) const {
+ return Set(d, hwy::LowestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastValue(D d) const {
+ return Set(d, hwy::HighestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+ const Vec<D> k1 = OddEven(Set(d, uint64_t{1}), Zero(d));
+ return Sub(v, k1);
+ }
+};
+
+struct OrderDescendingKV128 : public KeyValue128 {
+ using Order = SortDescending;
+
+ HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
+ return b[1] < a[1];
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
+ return Lt128Upper(d, b, a);
+ }
+
+ // Used by CompareTop
+ template <class V>
+ HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
+ return Lt(b, a);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
+ return Max128Upper(d, a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
+ return Min128Upper(d, a, b);
+ }
+
+ // Same as for regular lanes because 128-bit keys are u64.
+ template <class D>
+ HWY_INLINE Vec<D> FirstValue(D d) const {
+ return Set(d, hwy::HighestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastValue(D d) const {
+ return Set(d, hwy::LowestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+ const Vec<D> k1 = OddEven(Set(d, uint64_t{1}), Zero(d));
+ return Add(v, k1);
+ }
+};
+
+// We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask in
+// the most-significant of those lanes (the result of CompareTop), so
+// replicate it 4x. Only called for >= 256-bit vectors.
+
+#if HWY_TARGET <= HWY_AVX3
+template <class V, HWY_IF_V_SIZE_V(V, 64)>
+HWY_INLINE V ReplicateTop4x(V v) {
+ return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
+}
+#endif // HWY_TARGET <= HWY_AVX3
+
+#if HWY_TARGET <= HWY_AVX2
+
+template <class V, HWY_IF_V_SIZE_V(V, 32)>
+HWY_INLINE V ReplicateTop4x(V v) {
+ return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
+}
+
+#else // HWY_TARGET > HWY_AVX2
+
+template <class V>
+HWY_INLINE V ReplicateTop4x(V v) {
+#if HWY_TARGET == HWY_SVE_256
+ return svdup_lane_u64(v, 3);
+#else
+ alignas(64) static constexpr uint64_t kIndices[8] = {3, 3, 3, 3,
+ 7, 7, 7, 7};
+ const ScalableTag<uint64_t> d;
+ return TableLookupLanes(v, SetTableIndices(d, kIndices));
+#endif
+}
+
+#endif // HWY_TARGET <= HWY_AVX2
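+
+// For example, given 256-bit u64 lanes {a, b, c, d} holding CompareTop
+// results (only the top lane of each block, i.e. b and d, is valid),
+// ReplicateTop4x returns {d, d, d, d}. With 512-bit vectors, lanes 0-3
+// replicate lane 3 and lanes 4-7 replicate lane 7, matching kIndices above.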
+
+// Shared code that depends on Order.
+template <class Base>
+struct Traits128 : public Base {
+ template <class D>
+ HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
+ TFromD<D>* HWY_RESTRICT buf) const {
+ const Base* base = static_cast<const Base*>(this);
+ const size_t N = Lanes(d);
+ Store(v, d, buf);
+ v = base->SetKey(d, buf + 0); // result must be broadcasted
+ for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) {
+ v = base->First(d, v, base->SetKey(d, buf + i));
+ }
+ return v;
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
+ TFromD<D>* HWY_RESTRICT buf) const {
+ const Base* base = static_cast<const Base*>(this);
+ const size_t N = Lanes(d);
+ Store(v, d, buf);
+ v = base->SetKey(d, buf + 0); // result must be broadcasted
+ for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) {
+ v = base->Last(d, v, base->SetKey(d, buf + i));
+ }
+ return v;
+ }
+
+ template <class D>
+ HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
+ const Base* base = static_cast<const Base*>(this);
+
+ const Vec<D> a_copy = a;
+ const auto lt = base->Compare(d, a, b);
+ a = IfThenElse(lt, a, b);
+ b = IfThenElse(lt, b, a_copy);
+ }
+
+ // Conditionally swaps even-numbered keys with their odd-numbered neighbor.
+ template <class D>
+ HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
+ const Base* base = static_cast<const Base*>(this);
+ Vec<D> swapped = base->ReverseKeys2(d, v);
+ const Vec<D> cmpHx = base->template CompareTop<Base>(d, v, swapped);
+ return IfVecThenElse(ReplicateTop4x(cmpHx), swapped, v);
+ }
+
+ // Swaps with the vector formed by reversing contiguous groups of four 128-bit
+ // keys, which implies 512-bit vectors (we do not support more than that).
+ template <class D>
+ HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
+ const Base* base = static_cast<const Base*>(this);
+ Vec<D> swapped = base->ReverseKeys4(d, v);
+
+ const Vec<D> cmpHx = base->template CompareTop<Base>(d, v, swapped);
+ // Similar to ReplicateTop4x, we want to gang together 2 comparison results
+ // (4 lanes). They are not contiguous, so use permute to replicate 4x.
+ alignas(64) static constexpr uint64_t kIndices[8] = {7, 7, 5, 5, 5, 5, 7, 7};
+ const Vec<D> select = TableLookupLanes(cmpHx, SetTableIndices(d, kIndices));
+ return IfVecThenElse(select, swapped, v);
+ }
+
+ // Conditionally swaps lane 0 with 4, 1 with 5 etc.
+ template <class D>
+ HWY_INLINE Vec<D> SortPairsDistance4(D, Vec<D>) const {
+ // Only used by Merge16, which would require 2048 bit vectors (unsupported).
+ HWY_ASSERT(0);
+ }
+};
+
+#endif // VQSORT_ENABLED
+
+} // namespace detail
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif // HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort-inl.h b/third_party/highway/hwy/contrib/sort/vqsort-inl.h
new file mode 100644
index 0000000000..cf827baee5
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort-inl.h
@@ -0,0 +1,1724 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Normal include guard for target-independent parts
+#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
+#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
+
+#include <stdio.h> // unconditional #include so we can use if(VQSORT_PRINT).
+#include <string.h> // memcpy
+
+#include "hwy/base.h"
+#include "hwy/cache_control.h" // Prefetch
+#include "hwy/contrib/sort/vqsort.h" // Fill24Bytes
+
+#ifndef VQSORT_PRINT
+#define VQSORT_PRINT 0
+#endif
+
+#endif // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
+
+// Per-target
+#if defined(HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE) == \
+ defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
+#undef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
+#else
+#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
+#endif
+
+#if VQSORT_PRINT
+#include "hwy/print-inl.h"
+#endif
+
+#include "hwy/contrib/sort/shared-inl.h"
+#include "hwy/contrib/sort/sorting_networks-inl.h"
+// Placeholder for internal instrumentation. Do not remove.
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+namespace detail {
+
+using Constants = hwy::SortConstants;
+
+// Wrapper avoids #if in user code (interferes with code folding)
+template <class D>
+HWY_INLINE void MaybePrintVector(D d, const char* label, Vec<D> v,
+ size_t start = 0, size_t max_lanes = 16) {
+#if VQSORT_PRINT >= 2 // Print is only defined #if
+ Print(d, label, v, start, max_lanes);
+#else
+ (void)d;
+ (void)label;
+ (void)v;
+ (void)start;
+ (void)max_lanes;
+#endif
+}
+
+// ------------------------------ HeapSort
+
+template <class Traits, typename T>
+void SiftDown(Traits st, T* HWY_RESTRICT lanes, const size_t num_lanes,
+ size_t start) {
+ constexpr size_t N1 = st.LanesPerKey();
+ const FixedTag<T, N1> d;
+
+ while (start < num_lanes) {
+ const size_t left = 2 * start + N1;
+ const size_t right = 2 * start + 2 * N1;
+ if (left >= num_lanes) break;
+ size_t idx_larger = start;
+ const auto key_j = st.SetKey(d, lanes + start);
+ if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, lanes + left)))) {
+ idx_larger = left;
+ }
+ if (right < num_lanes &&
+ AllTrue(d, st.Compare(d, st.SetKey(d, lanes + idx_larger),
+ st.SetKey(d, lanes + right)))) {
+ idx_larger = right;
+ }
+ if (idx_larger == start) break;
+ st.Swap(lanes + start, lanes + idx_larger);
+ start = idx_larger;
+ }
+}
+
+// Heapsort: O(1) space, O(N*logN) worst-case comparisons.
+// Based on LLVM sanitizer_common.h, licensed under Apache-2.0.
+template <class Traits, typename T>
+void HeapSort(Traits st, T* HWY_RESTRICT lanes, const size_t num_lanes) {
+ constexpr size_t N1 = st.LanesPerKey();
+
+ if (num_lanes < 2 * N1) return;
+
+ // Build heap.
+ for (size_t i = ((num_lanes - N1) / N1 / 2) * N1; i != (~N1 + 1); i -= N1) {
+ SiftDown(st, lanes, num_lanes, i);
+ }
+
+ for (size_t i = num_lanes - N1; i != 0; i -= N1) {
+ // Swap root with last
+ st.Swap(lanes + 0, lanes + i);
+
+ // Sift down the new root.
+ SiftDown(st, lanes, i, 0);
+ }
+}
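+
+// Usage sketch (illustrative): HeapSort is self-contained and also serves as
+// the fallback when VQSORT_ENABLED is 0.
+//   int32_t lanes[8] = {5, 2, 7, 1, 8, 3, 6, 4};
+//   HeapSort(SharedTraits<TraitsLane<OrderAscending<int32_t> > >(), lanes, 8);
+//   // lanes is now {1, 2, 3, 4, 5, 6, 7, 8}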
+
+#if VQSORT_ENABLED || HWY_IDE
+
+// ------------------------------ BaseCase
+
+// Special cases for short arrays; each function name gives the supported
+// range of num_keys (inclusive).
+template <class Traits, typename T>
+HWY_INLINE void Sort2To2(Traits st, T* HWY_RESTRICT keys, size_t num_lanes,
+ T* HWY_RESTRICT /* buf */) {
+ constexpr size_t kLPK = st.LanesPerKey();
+ const size_t num_keys = num_lanes / kLPK;
+ HWY_DASSERT(num_keys == 2);
+ HWY_ASSUME(num_keys == 2);
+
+ // One key per vector, required to avoid reading past the end of `keys`.
+ const CappedTag<T, kLPK> d;
+ using V = Vec<decltype(d)>;
+
+ V v0 = LoadU(d, keys + 0x0 * kLPK);
+ V v1 = LoadU(d, keys + 0x1 * kLPK);
+
+ Sort2(d, st, v0, v1);
+
+ StoreU(v0, d, keys + 0x0 * kLPK);
+ StoreU(v1, d, keys + 0x1 * kLPK);
+}
+
+template <class Traits, typename T>
+HWY_INLINE void Sort3To4(Traits st, T* HWY_RESTRICT keys, size_t num_lanes,
+ T* HWY_RESTRICT buf) {
+ constexpr size_t kLPK = st.LanesPerKey();
+ const size_t num_keys = num_lanes / kLPK;
+ HWY_DASSERT(3 <= num_keys && num_keys <= 4);
+ HWY_ASSUME(num_keys >= 3);
+ HWY_ASSUME(num_keys <= 4); // reduces branches
+
+ // One key per vector, required to avoid reading past the end of `keys`.
+ const CappedTag<T, kLPK> d;
+ using V = Vec<decltype(d)>;
+
+ // If num_keys == 3, initialize padding for the last sorting network element
+ // so that it does not influence the other elements.
+ Store(st.LastValue(d), d, buf);
+
+ // Points to a valid key, or padding. This avoids special-casing
+ // HWY_MEM_OPS_MIGHT_FAULT because there is only a single key per vector.
+ T* in_out3 = num_keys == 3 ? buf : keys + 0x3 * kLPK;
+
+ V v0 = LoadU(d, keys + 0x0 * kLPK);
+ V v1 = LoadU(d, keys + 0x1 * kLPK);
+ V v2 = LoadU(d, keys + 0x2 * kLPK);
+ V v3 = LoadU(d, in_out3);
+
+ Sort4(d, st, v0, v1, v2, v3);
+
+ StoreU(v0, d, keys + 0x0 * kLPK);
+ StoreU(v1, d, keys + 0x1 * kLPK);
+ StoreU(v2, d, keys + 0x2 * kLPK);
+ StoreU(v3, d, in_out3);
+}
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+
+template <size_t kRows, size_t kLanesPerRow, class D, class Traits,
+ typename T = TFromD<D>>
+HWY_INLINE void CopyHalfToPaddedBuf(D d, Traits st, T* HWY_RESTRICT keys,
+ size_t num_lanes, T* HWY_RESTRICT buf) {
+ constexpr size_t kMinLanes = kRows / 2 * kLanesPerRow;
+ // Must cap for correctness: we will load up to the last valid lane, so
+ // Lanes(dmax) must not exceed `num_lanes` (known to be at least kMinLanes).
+ const CappedTag<T, kMinLanes> dmax;
+ const size_t Nmax = Lanes(dmax);
+ HWY_DASSERT(Nmax < num_lanes);
+ HWY_ASSUME(Nmax <= kMinLanes);
+
+ // Fill with padding - last in sort order, not copied to keys.
+ const Vec<decltype(dmax)> kPadding = st.LastValue(dmax);
+
+ // Rounding down allows aligned stores, which are typically faster.
+ size_t i = num_lanes & ~(Nmax - 1);
+ HWY_ASSUME(i != 0); // because Nmax <= num_lanes; avoids branch
+ do {
+ Store(kPadding, dmax, buf + i);
+ i += Nmax;
+ // Initialize enough for the last vector even if Nmax > kLanesPerRow.
+ } while (i < (kRows - 1) * kLanesPerRow + Lanes(d));
+
+ // Ensure buf contains all we will read, and perhaps more before.
+ ptrdiff_t end = static_cast<ptrdiff_t>(num_lanes);
+ do {
+ end -= static_cast<ptrdiff_t>(Nmax);
+ StoreU(LoadU(dmax, keys + end), dmax, buf + end);
+ } while (end > static_cast<ptrdiff_t>(kRows / 2 * kLanesPerRow));
+}
+
+#endif // HWY_MEM_OPS_MIGHT_FAULT
+
+template <size_t kKeysPerRow, class Traits, typename T>
+HWY_NOINLINE void Sort8Rows(Traits st, T* HWY_RESTRICT keys, size_t num_lanes,
+ T* HWY_RESTRICT buf) {
+ // kKeysPerRow <= 4 because eight 64-bit keys per row would imply 512-bit
+ // vectors, which are likely slower than the 16x4 shape; thus 8x4 is the
+ // largest shape handled here.
+ static_assert(kKeysPerRow <= 4, "");
+
+ constexpr size_t kLPK = st.LanesPerKey();
+
+ // We reshape the 1D keys into kRows x kKeysPerRow.
+ constexpr size_t kRows = 8;
+ constexpr size_t kLanesPerRow = kKeysPerRow * kLPK;
+ constexpr size_t kMinLanes = kRows / 2 * kLanesPerRow;
+ HWY_DASSERT(kMinLanes < num_lanes && num_lanes <= kRows * kLanesPerRow);
+
+ const CappedTag<T, kLanesPerRow> d;
+ using V = Vec<decltype(d)>;
+ V v4, v5, v6, v7;
+
+ // At least half the kRows are valid, otherwise a different function would
+ // have been called to handle this num_lanes.
+ V v0 = LoadU(d, keys + 0x0 * kLanesPerRow);
+ V v1 = LoadU(d, keys + 0x1 * kLanesPerRow);
+ V v2 = LoadU(d, keys + 0x2 * kLanesPerRow);
+ V v3 = LoadU(d, keys + 0x3 * kLanesPerRow);
+#if HWY_MEM_OPS_MIGHT_FAULT
+ CopyHalfToPaddedBuf<kRows, kLanesPerRow>(d, st, keys, num_lanes, buf);
+ v4 = LoadU(d, buf + 0x4 * kLanesPerRow);
+ v5 = LoadU(d, buf + 0x5 * kLanesPerRow);
+ v6 = LoadU(d, buf + 0x6 * kLanesPerRow);
+ v7 = LoadU(d, buf + 0x7 * kLanesPerRow);
+#endif // HWY_MEM_OPS_MIGHT_FAULT
+#if !HWY_MEM_OPS_MIGHT_FAULT || HWY_IDE
+ (void)buf;
+ const V vnum_lanes = Set(d, static_cast<T>(num_lanes));
+ // First offset where not all vectors are guaranteed valid.
+ const V kIota = Iota(d, static_cast<T>(kMinLanes));
+ const V k1 = Set(d, static_cast<T>(kLanesPerRow));
+ const V k2 = Add(k1, k1);
+
+ using M = Mask<decltype(d)>;
+ const M m4 = Gt(vnum_lanes, kIota);
+ const M m5 = Gt(vnum_lanes, Add(kIota, k1));
+ const M m6 = Gt(vnum_lanes, Add(kIota, k2));
+ const M m7 = Gt(vnum_lanes, Add(kIota, Add(k2, k1)));
+
+ const V kPadding = st.LastValue(d); // Not copied to keys.
+ v4 = MaskedLoadOr(kPadding, m4, d, keys + 0x4 * kLanesPerRow);
+ v5 = MaskedLoadOr(kPadding, m5, d, keys + 0x5 * kLanesPerRow);
+ v6 = MaskedLoadOr(kPadding, m6, d, keys + 0x6 * kLanesPerRow);
+ v7 = MaskedLoadOr(kPadding, m7, d, keys + 0x7 * kLanesPerRow);
+#endif // !HWY_MEM_OPS_MIGHT_FAULT
+
+ Sort8(d, st, v0, v1, v2, v3, v4, v5, v6, v7);
+
+ // Merge8x2 is a no-op if kKeysPerRow < 2 etc.
+ Merge8x2<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7);
+ Merge8x4<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7);
+
+ StoreU(v0, d, keys + 0x0 * kLanesPerRow);
+ StoreU(v1, d, keys + 0x1 * kLanesPerRow);
+ StoreU(v2, d, keys + 0x2 * kLanesPerRow);
+ StoreU(v3, d, keys + 0x3 * kLanesPerRow);
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+ // Store remaining vectors into buf and safely copy them into keys.
+ StoreU(v4, d, buf + 0x4 * kLanesPerRow);
+ StoreU(v5, d, buf + 0x5 * kLanesPerRow);
+ StoreU(v6, d, buf + 0x6 * kLanesPerRow);
+ StoreU(v7, d, buf + 0x7 * kLanesPerRow);
+
+ const ScalableTag<T> dmax;
+ const size_t Nmax = Lanes(dmax);
+
+ // The first half of vectors have already been stored unconditionally into
+ // `keys`, so we do not copy them.
+ size_t i = kMinLanes;
+ HWY_UNROLL(1)
+ for (; i + Nmax <= num_lanes; i += Nmax) {
+ StoreU(LoadU(dmax, buf + i), dmax, keys + i);
+ }
+
+ // Last iteration: copy partial vector
+ const size_t remaining = num_lanes - i;
+ HWY_ASSUME(remaining < 256); // helps FirstN
+ SafeCopyN(remaining, dmax, buf + i, keys + i);
+#endif // HWY_MEM_OPS_MIGHT_FAULT
+#if !HWY_MEM_OPS_MIGHT_FAULT || HWY_IDE
+ BlendedStore(v4, m4, d, keys + 0x4 * kLanesPerRow);
+ BlendedStore(v5, m5, d, keys + 0x5 * kLanesPerRow);
+ BlendedStore(v6, m6, d, keys + 0x6 * kLanesPerRow);
+ BlendedStore(v7, m7, d, keys + 0x7 * kLanesPerRow);
+#endif // !HWY_MEM_OPS_MIGHT_FAULT
+}
+
+template <size_t kKeysPerRow, class Traits, typename T>
+HWY_NOINLINE void Sort16Rows(Traits st, T* HWY_RESTRICT keys, size_t num_lanes,
+ T* HWY_RESTRICT buf) {
+ static_assert(kKeysPerRow <= SortConstants::kMaxCols, "");
+
+ constexpr size_t kLPK = st.LanesPerKey();
+
+ // We reshape the 1D keys into kRows x kKeysPerRow.
+ constexpr size_t kRows = 16;
+ constexpr size_t kLanesPerRow = kKeysPerRow * kLPK;
+ constexpr size_t kMinLanes = kRows / 2 * kLanesPerRow;
+ HWY_DASSERT(kMinLanes < num_lanes && num_lanes <= kRows * kLanesPerRow);
+
+ const CappedTag<T, kLanesPerRow> d;
+ using V = Vec<decltype(d)>;
+ V v8, v9, va, vb, vc, vd, ve, vf;
+
+ // At least half the kRows are valid, otherwise a different function would
+ // have been called to handle this num_lanes.
+ V v0 = LoadU(d, keys + 0x0 * kLanesPerRow);
+ V v1 = LoadU(d, keys + 0x1 * kLanesPerRow);
+ V v2 = LoadU(d, keys + 0x2 * kLanesPerRow);
+ V v3 = LoadU(d, keys + 0x3 * kLanesPerRow);
+ V v4 = LoadU(d, keys + 0x4 * kLanesPerRow);
+ V v5 = LoadU(d, keys + 0x5 * kLanesPerRow);
+ V v6 = LoadU(d, keys + 0x6 * kLanesPerRow);
+ V v7 = LoadU(d, keys + 0x7 * kLanesPerRow);
+#if HWY_MEM_OPS_MIGHT_FAULT
+ CopyHalfToPaddedBuf<kRows, kLanesPerRow>(d, st, keys, num_lanes, buf);
+ v8 = LoadU(d, buf + 0x8 * kLanesPerRow);
+ v9 = LoadU(d, buf + 0x9 * kLanesPerRow);
+ va = LoadU(d, buf + 0xa * kLanesPerRow);
+ vb = LoadU(d, buf + 0xb * kLanesPerRow);
+ vc = LoadU(d, buf + 0xc * kLanesPerRow);
+ vd = LoadU(d, buf + 0xd * kLanesPerRow);
+ ve = LoadU(d, buf + 0xe * kLanesPerRow);
+ vf = LoadU(d, buf + 0xf * kLanesPerRow);
+#endif // HWY_MEM_OPS_MIGHT_FAULT
+#if !HWY_MEM_OPS_MIGHT_FAULT || HWY_IDE
+ (void)buf;
+ const V vnum_lanes = Set(d, static_cast<T>(num_lanes));
+ // First offset where not all vectors are guaranteed valid.
+ const V kIota = Iota(d, static_cast<T>(kMinLanes));
+ const V k1 = Set(d, static_cast<T>(kLanesPerRow));
+ const V k2 = Add(k1, k1);
+ const V k4 = Add(k2, k2);
+ const V k8 = Add(k4, k4);
+
+ using M = Mask<decltype(d)>;
+ const M m8 = Gt(vnum_lanes, kIota);
+ const M m9 = Gt(vnum_lanes, Add(kIota, k1));
+ const M ma = Gt(vnum_lanes, Add(kIota, k2));
+ const M mb = Gt(vnum_lanes, Add(kIota, Sub(k4, k1)));
+ const M mc = Gt(vnum_lanes, Add(kIota, k4));
+ const M md = Gt(vnum_lanes, Add(kIota, Add(k4, k1)));
+ const M me = Gt(vnum_lanes, Add(kIota, Add(k4, k2)));
+ const M mf = Gt(vnum_lanes, Add(kIota, Sub(k8, k1)));
+
+ const V kPadding = st.LastValue(d); // Not copied to keys.
+ v8 = MaskedLoadOr(kPadding, m8, d, keys + 0x8 * kLanesPerRow);
+ v9 = MaskedLoadOr(kPadding, m9, d, keys + 0x9 * kLanesPerRow);
+ va = MaskedLoadOr(kPadding, ma, d, keys + 0xa * kLanesPerRow);
+ vb = MaskedLoadOr(kPadding, mb, d, keys + 0xb * kLanesPerRow);
+ vc = MaskedLoadOr(kPadding, mc, d, keys + 0xc * kLanesPerRow);
+ vd = MaskedLoadOr(kPadding, md, d, keys + 0xd * kLanesPerRow);
+ ve = MaskedLoadOr(kPadding, me, d, keys + 0xe * kLanesPerRow);
+ vf = MaskedLoadOr(kPadding, mf, d, keys + 0xf * kLanesPerRow);
+#endif // !HWY_MEM_OPS_MIGHT_FAULT
+
+ Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, vf);
+
+ // Merge16x4 is a no-op if kKeysPerRow < 4 etc.
+ Merge16x2<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb,
+ vc, vd, ve, vf);
+ Merge16x4<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb,
+ vc, vd, ve, vf);
+ Merge16x8<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb,
+ vc, vd, ve, vf);
+#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
+ Merge16x16<kKeysPerRow>(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb,
+ vc, vd, ve, vf);
+#endif
+
+ StoreU(v0, d, keys + 0x0 * kLanesPerRow);
+ StoreU(v1, d, keys + 0x1 * kLanesPerRow);
+ StoreU(v2, d, keys + 0x2 * kLanesPerRow);
+ StoreU(v3, d, keys + 0x3 * kLanesPerRow);
+ StoreU(v4, d, keys + 0x4 * kLanesPerRow);
+ StoreU(v5, d, keys + 0x5 * kLanesPerRow);
+ StoreU(v6, d, keys + 0x6 * kLanesPerRow);
+ StoreU(v7, d, keys + 0x7 * kLanesPerRow);
+
+#if HWY_MEM_OPS_MIGHT_FAULT
+ // Store remaining vectors into buf and safely copy them into keys.
+ StoreU(v8, d, buf + 0x8 * kLanesPerRow);
+ StoreU(v9, d, buf + 0x9 * kLanesPerRow);
+ StoreU(va, d, buf + 0xa * kLanesPerRow);
+ StoreU(vb, d, buf + 0xb * kLanesPerRow);
+ StoreU(vc, d, buf + 0xc * kLanesPerRow);
+ StoreU(vd, d, buf + 0xd * kLanesPerRow);
+ StoreU(ve, d, buf + 0xe * kLanesPerRow);
+ StoreU(vf, d, buf + 0xf * kLanesPerRow);
+
+ const ScalableTag<T> dmax;
+ const size_t Nmax = Lanes(dmax);
+
+ // The first half of vectors have already been stored unconditionally into
+ // `keys`, so we do not copy them.
+ size_t i = kMinLanes;
+ HWY_UNROLL(1)
+ for (; i + Nmax <= num_lanes; i += Nmax) {
+ StoreU(LoadU(dmax, buf + i), dmax, keys + i);
+ }
+
+ // Last iteration: copy partial vector
+ const size_t remaining = num_lanes - i;
+ HWY_ASSUME(remaining < 256); // helps FirstN
+ SafeCopyN(remaining, dmax, buf + i, keys + i);
+#endif // HWY_MEM_OPS_MIGHT_FAULT
+#if !HWY_MEM_OPS_MIGHT_FAULT || HWY_IDE
+ BlendedStore(v8, m8, d, keys + 0x8 * kLanesPerRow);
+ BlendedStore(v9, m9, d, keys + 0x9 * kLanesPerRow);
+ BlendedStore(va, ma, d, keys + 0xa * kLanesPerRow);
+ BlendedStore(vb, mb, d, keys + 0xb * kLanesPerRow);
+ BlendedStore(vc, mc, d, keys + 0xc * kLanesPerRow);
+ BlendedStore(vd, md, d, keys + 0xd * kLanesPerRow);
+ BlendedStore(ve, me, d, keys + 0xe * kLanesPerRow);
+ BlendedStore(vf, mf, d, keys + 0xf * kLanesPerRow);
+#endif // !HWY_MEM_OPS_MIGHT_FAULT
+}
+
+// Sorts `keys` within the range [0, num_lanes) via sorting network.
+// Reshapes into a matrix, sorts columns independently, and then merges
+// into a sorted 1D array without transposing.
+//
+// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
+// differences in sort order and single-lane vs 128-bit keys.
+//
+// See M. Blacher's thesis: https://github.com/mark-blacher/masterthesis
+template <class D, class Traits, typename T>
+HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys,
+ size_t num_lanes, T* buf) {
+ constexpr size_t kLPK = st.LanesPerKey();
+ HWY_DASSERT(num_lanes <= Constants::BaseCaseNumLanes<kLPK>(Lanes(d)));
+ const size_t num_keys = num_lanes / kLPK;
+
+ // num_keys can be zero when called through HandleSpecialCases, and 1 means
+ // the array is already sorted. Returning early also guarantees that the
+ // num_keys - 1 below is nonzero, as Num0BitsAboveMS1Bit_Nonzero32 requires.
+ if (HWY_UNLIKELY(num_keys <= 1)) return;
+
+ const size_t ceil_log2 =
+ 32 - Num0BitsAboveMS1Bit_Nonzero32(static_cast<uint32_t>(num_keys - 1));
+
+ // Checking kMaxKeysPerVector avoids generating unreachable codepaths.
+ constexpr size_t kMaxKeysPerVector = MaxLanes(d) / kLPK;
+
+ using FuncPtr = decltype(&Sort2To2<Traits, T>);
+ const FuncPtr funcs[9] = {
+ /* <= 1 */ nullptr, // We ensured num_keys > 1.
+ /* <= 2 */ &Sort2To2<Traits, T>,
+ /* <= 4 */ &Sort3To4<Traits, T>,
+ /* <= 8 */ &Sort8Rows<1, Traits, T>, // 1 key per row
+ /* <= 16 */ kMaxKeysPerVector >= 2 ? &Sort8Rows<2, Traits, T> : nullptr,
+ /* <= 32 */ kMaxKeysPerVector >= 4 ? &Sort8Rows<4, Traits, T> : nullptr,
+ /* <= 64 */ kMaxKeysPerVector >= 4 ? &Sort16Rows<4, Traits, T> : nullptr,
+ /* <= 128 */ kMaxKeysPerVector >= 8 ? &Sort16Rows<8, Traits, T> : nullptr,
+#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
+ /* <= 256 */ kMaxKeysPerVector >= 16 ? &Sort16Rows<16, Traits, T> : nullptr,
+#endif
+ };
+ funcs[ceil_log2](st, keys, num_lanes, buf);
+}
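+
+// Dispatch example: for num_keys = 5, num_keys - 1 = 4 has its highest set
+// bit at position 2, so ceil_log2 = 3 and funcs[3] = Sort8Rows<1> handles
+// this "<= 8 keys" bucket, padding the unused rows with LastValue.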
+
+// ------------------------------ Partition
+
+// Consumes from `keys` until a multiple of kUnroll*N remains.
+// Temporarily stores the right side into `buf`, then moves behind `num`.
+// Returns the number of keys consumed from the left side.
+template <class D, class Traits, class T>
+HWY_INLINE size_t PartitionToMultipleOfUnroll(D d, Traits st,
+ T* HWY_RESTRICT keys, size_t& num,
+ const Vec<D> pivot,
+ T* HWY_RESTRICT buf) {
+ constexpr size_t kUnroll = Constants::kPartitionUnroll;
+ const size_t N = Lanes(d);
+ size_t readL = 0;
+ T* HWY_RESTRICT posL = keys;
+ size_t bufR = 0;
+ // Partition requires both a multiple of kUnroll*N and at least
+ // 2*kUnroll*N for the initial loads. If less, consume all here.
+ const size_t num_rem =
+ (num < 2 * kUnroll * N) ? num : (num & (kUnroll * N - 1));
+ size_t i = 0;
+ for (; i + N <= num_rem; i += N) {
+ const Vec<D> vL = LoadU(d, keys + readL);
+ readL += N;
+
+ const auto comp = st.Compare(d, pivot, vL);
+ posL += CompressBlendedStore(vL, Not(comp), d, posL);
+ bufR += CompressStore(vL, comp, d, buf + bufR);
+ }
+ // Last iteration: only use valid lanes.
+ if (HWY_LIKELY(i != num_rem)) {
+ const auto mask = FirstN(d, num_rem - i);
+ const Vec<D> vL = LoadU(d, keys + readL);
+
+ const auto comp = st.Compare(d, pivot, vL);
+ posL += CompressBlendedStore(vL, AndNot(comp, mask), d, posL);
+ bufR += CompressStore(vL, And(comp, mask), d, buf + bufR);
+ }
+
+ // MSAN seems not to understand CompressStore. buf[0, bufR) are valid.
+ detail::MaybeUnpoison(buf, bufR);
+
+ // Everything we loaded was put into buf, or behind the current `posL`, after
+ // which there is space for bufR items. First move items from `keys + num` to
+ // `posL` to free up space, then copy `buf` into the vacated `keys + num`.
+ // A loop with masked loads from `buf` would be insufficient: we would also
+ // need to mask loads from `keys + num`. Combining a loop with memcpy for
+ // the remainders is slower than plain memcpy, so we use the latter for
+ // simplicity.
+ num -= bufR;
+ memcpy(posL, keys + num, bufR * sizeof(T));
+ memcpy(keys + num, buf, bufR * sizeof(T));
+ return static_cast<size_t>(posL - keys); // caller will shrink num by this.
+}
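+
+// Arithmetic example with hypothetical sizes kUnroll * N = 32 and num = 100:
+// num >= 2 * 32, so num_rem = 100 & 31 = 4. Those 4 lanes are partitioned
+// here (left side compressed in place, right side staged in buf and moved to
+// keys + num), leaving 96 lanes, a multiple of kUnroll * N, for Partition.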
+
+template <class V>
+V OrXor(const V o, const V x1, const V x2) {
+ return Or(o, Xor(x1, x2)); // ternlog on AVX3
+}
+
+// Note: we could track the OrXor of v and pivot to see if the entire left
+// partition is equal, but that happens rarely and thus is a net loss.
+template <class D, class Traits, typename T>
+HWY_INLINE void StoreLeftRight(D d, Traits st, const Vec<D> v,
+ const Vec<D> pivot, T* HWY_RESTRICT keys,
+ size_t& writeL, size_t& remaining) {
+ const size_t N = Lanes(d);
+
+ const auto comp = st.Compare(d, pivot, v);
+
+ remaining -= N;
+ if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value ||
+ (HWY_MAX_BYTES == 16 && st.Is128())) {
+ // Non-native Compress (e.g. AVX2): we are able to partition a vector using
+ // a single Compress+two StoreU instead of two Compress[Blended]Store. The
+ // latter are more expensive. Because we store entire vectors, the contents
+ // between the updated writeL and writeR are ignored and will be overwritten
+ // by subsequent calls. This works because writeL and writeR are at least
+ // two vectors apart.
+ const auto lr = st.CompressKeys(v, comp);
+ const size_t num_left = N - CountTrue(d, comp);
+ StoreU(lr, d, keys + writeL);
+ // Now write the right-side elements (if any), such that the previous writeR
+ // is one past the end of the newly written right elements, then advance.
+ StoreU(lr, d, keys + remaining + writeL);
+ writeL += num_left;
+ } else {
+ // Native Compress[Store] (e.g. AVX3) keeps only the left or the right
+ // side, not both, hence we require two calls.
+ const size_t num_left = CompressStore(v, Not(comp), d, keys + writeL);
+ writeL += num_left;
+
+ (void)CompressBlendedStore(v, comp, d, keys + remaining + writeL);
+ }
+}
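+
+// Worked example of the single-Compress path for N = 4, ascending order,
+// pivot = 3 and v = {5, 1, 7, 2}: comp = (pivot < v) = {1, 0, 1, 0}, so
+// lr = CompressNot(v, comp) = {1, 2, 5, 7} and num_left = 2. Both StoreU
+// calls write all four lanes: the left store contributes {1, 2} (its upper
+// lanes are overwritten later) and the right store ends at the previous
+// writeR, contributing {5, 7}.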
+
+template <class D, class Traits, typename T>
+HWY_INLINE void StoreLeftRight4(D d, Traits st, const Vec<D> v0,
+ const Vec<D> v1, const Vec<D> v2,
+ const Vec<D> v3, const Vec<D> pivot,
+ T* HWY_RESTRICT keys, size_t& writeL,
+ size_t& remaining) {
+ StoreLeftRight(d, st, v0, pivot, keys, writeL, remaining);
+ StoreLeftRight(d, st, v1, pivot, keys, writeL, remaining);
+ StoreLeftRight(d, st, v2, pivot, keys, writeL, remaining);
+ StoreLeftRight(d, st, v3, pivot, keys, writeL, remaining);
+}
+
+// Moves "<= pivot" keys to the front, and others to the back. pivot is
+// broadcasted. Time-critical!
+//
+// Aligned loads do not seem to be worthwhile (not bottlenecked by load ports).
+template <class D, class Traits, typename T>
+HWY_INLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
+ const Vec<D> pivot, T* HWY_RESTRICT buf) {
+ using V = decltype(Zero(d));
+ const size_t N = Lanes(d);
+
+ // StoreLeftRight will CompressBlendedStore ending at `writeR`. Unless all
+ // lanes happen to be in the right-side partition, this will overrun `keys`,
+ // which triggers asan errors. Avoid by special-casing the last vector.
+ HWY_DASSERT(num > 2 * N); // ensured by HandleSpecialCases
+ num -= N;
+ size_t last = num;
+ const V vlast = LoadU(d, keys + last);
+
+ const size_t consumedL =
+ PartitionToMultipleOfUnroll(d, st, keys, num, pivot, buf);
+ keys += consumedL;
+ last -= consumedL;
+ num -= consumedL;
+ constexpr size_t kUnroll = Constants::kPartitionUnroll;
+
+ // Partition splits the vector into 3 sections, left to right: Elements
+ // smaller or equal to the pivot, unpartitioned elements and elements larger
+ // than the pivot. To write elements unconditionally in the loop body without
+ // overwriting existing data, we maintain two regions of the array whose
+ // elements have already been copied elsewhere (e.g. into vector registers).
+ // We call these bufferL and bufferR, for left and right respectively.
+ //
+ // These regions are tracked by the indices (writeL, writeR, left, right) as
+ // presented in the diagram below.
+ //
+ // writeL writeR
+ // \/ \/
+ // | <= pivot | bufferL | unpartitioned | bufferR | > pivot |
+ // \/ \/
+ // left right
+ //
+ // In the main loop body below we choose a side, load some elements out of the
+ // vector and move either `left` or `right`. Next we call into StoreLeftRight
+ // to partition the data, and the partitioned elements will be written either
+ // to writeR or writeL and the corresponding index will be moved accordingly.
+ //
+ // Note that writeR is not explicitly tracked as an optimization for platforms
+ // with conditional operations. Instead we track writeL and the number of
+ // elements left to process (`remaining`). From the diagram above we can see
+ // that:
+ // writeR - writeL = remaining => writeR = remaining + writeL
+ //
+ // Tracking `remaining` is advantageous because each iteration reduces the
+ // number of unpartitioned elements by a fixed amount, so we can compute
+ // `remaining` without data dependencies.
+ //
+ size_t writeL = 0;
+ size_t remaining = num;
+
+ const T* HWY_RESTRICT readL = keys;
+ const T* HWY_RESTRICT readR = keys + num;
+ // If there were fewer than 2 * kUnroll * N lanes,
+ // PartitionToMultipleOfUnroll already consumed them all and num is now
+ // zero; in that case we cannot load anything here.
+ if (HWY_LIKELY(num != 0)) {
+ HWY_DASSERT(num >= 2 * kUnroll * N);
+ HWY_DASSERT((num & (kUnroll * N - 1)) == 0);
+
+ // Make space for writing in-place by reading from readL/readR.
+ const V vL0 = LoadU(d, readL + 0 * N);
+ const V vL1 = LoadU(d, readL + 1 * N);
+ const V vL2 = LoadU(d, readL + 2 * N);
+ const V vL3 = LoadU(d, readL + 3 * N);
+ readL += kUnroll * N;
+ readR -= kUnroll * N;
+ const V vR0 = LoadU(d, readR + 0 * N);
+ const V vR1 = LoadU(d, readR + 1 * N);
+ const V vR2 = LoadU(d, readR + 2 * N);
+ const V vR3 = LoadU(d, readR + 3 * N);
+
+ // readL/readR changed above, so check again before the loop.
+ while (readL != readR) {
+ V v0, v1, v2, v3;
+
+ // Data-dependent but branching is faster than forcing branch-free.
+ const size_t capacityL =
+ static_cast<size_t>((readL - keys) - static_cast<ptrdiff_t>(writeL));
+ HWY_DASSERT(capacityL <= num); // >= 0
+ // Load data from the end whose buffer region is currently smaller (front
+ // or back); this keeps both write regions from running out of space. The
+ // next paragraphs explain how this works.
+ //
+ // let block_size = (kUnroll * N)
+ // In the loop prelude we load block_size elements from the front of the
+ // array and an additional block_size elements from the back. On each
+ // iteration, k elements are written to the front of the array and
+ // (block_size - k) to the back.
+ //
+ // This creates a loop invariant where the capacity on the front
+ // (capacityL) and on the back (capacityR) always add to 2 * block_size.
+ // In other words:
+ // capacityL + capacityR = 2 * block_size
+ // capacityR = 2 * block_size - capacityL
+ //
+ // This means that:
+ // capacityL < capacityR <=>
+ // capacityL < 2 * block_size - capacityL <=>
+ // 2 * capacityL < 2 * block_size <=>
+ // capacityL < block_size
+ //
+ // Thus the check on the next line is equivalent to capacityL > capacityR.
+ //
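+      // For example (hypothetical sizes), with block_size = 32: capacityL = 40
+      // implies capacityR = 24, and indeed 32 < 40 detects
+      // capacityL > capacityR.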
+ if (kUnroll * N < capacityL) {
+ readR -= kUnroll * N;
+ v0 = LoadU(d, readR + 0 * N);
+ v1 = LoadU(d, readR + 1 * N);
+ v2 = LoadU(d, readR + 2 * N);
+ v3 = LoadU(d, readR + 3 * N);
+ hwy::Prefetch(readR - 3 * kUnroll * N);
+ } else {
+ v0 = LoadU(d, readL + 0 * N);
+ v1 = LoadU(d, readL + 1 * N);
+ v2 = LoadU(d, readL + 2 * N);
+ v3 = LoadU(d, readL + 3 * N);
+ readL += kUnroll * N;
+ hwy::Prefetch(readL + 3 * kUnroll * N);
+ }
+
+ StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, remaining);
+ }
+
+ // Now finish writing the saved vectors to the middle.
+ StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, remaining);
+ StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, remaining);
+ }
+
+ // We have partitioned [left, right) such that writeL is the boundary.
+ HWY_DASSERT(remaining == 0);
+ // Make space for inserting vlast: move up to N of the first right-side keys
+ // into the unused space starting at last. If we have fewer, ensure they are
+ // the last items in that vector by subtracting from the *load* address,
+ // which is safe because we have at least two vectors (checked above).
+ const size_t totalR = last - writeL;
+ const size_t startR = totalR < N ? writeL + totalR - N : writeL;
+ StoreU(LoadU(d, keys + startR), d, keys + last);
+
+ // Partition vlast: write L, then R, into the single-vector gap at writeL.
+ const auto comp = st.Compare(d, pivot, vlast);
+ writeL += CompressBlendedStore(vlast, Not(comp), d, keys + writeL);
+ (void)CompressBlendedStore(vlast, comp, d, keys + writeL);
+
+ return consumedL + writeL;
+}
+
+// Returns true and partitions if [keys, keys + num) contains only {valueL,
+// valueR}. Otherwise, sets third to the first differing value; keys may have
+// been reordered and a regular Partition is still necessary.
+// Called from two locations, hence NOINLINE.
+template <class D, class Traits, typename T>
+HWY_NOINLINE bool MaybePartitionTwoValue(D d, Traits st, T* HWY_RESTRICT keys,
+ size_t num, const Vec<D> valueL,
+ const Vec<D> valueR, Vec<D>& third,
+ T* HWY_RESTRICT buf) {
+ const size_t N = Lanes(d);
+
+ size_t i = 0;
+ size_t writeL = 0;
+
+ // As long as all lanes are equal to L or R, we can overwrite with valueL.
+ // This is faster than first counting, then backtracking to fill L and R.
+ for (; i + N <= num; i += N) {
+ const Vec<D> v = LoadU(d, keys + i);
+ // It is not clear how to apply OrXor here - that can check if *both*
+ // comparisons are true, but here we want *either*. Comparing the unsigned
+ // min of differences to zero works, but is expensive for u64 prior to AVX3.
+ const Mask<D> eqL = st.EqualKeys(d, v, valueL);
+ const Mask<D> eqR = st.EqualKeys(d, v, valueR);
+ // At least one other value present; will require a regular partition.
+ // On AVX-512, Or + AllTrue are folded into a single kortest if we are
+ // careful with the FindKnownFirstTrue argument, see below.
+ if (HWY_UNLIKELY(!AllTrue(d, Or(eqL, eqR)))) {
+ // If we repeat Or(eqL, eqR) here, the compiler will hoist it into the
+ // loop, which is a pessimization because this if-true branch is cold.
+ // We can defeat this via Not(Xor), which is equivalent because eqL and
+ // eqR cannot be true at the same time. Can we elide the additional Not?
+ // FindFirstFalse instructions are generally unavailable, but we can
+ // fuse Not and Xor/Or into one ExclusiveNeither.
+ const size_t lane = FindKnownFirstTrue(d, ExclusiveNeither(eqL, eqR));
+ third = st.SetKey(d, keys + i + lane);
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "found 3rd value at vec %zu; writeL %zu\n", i, writeL);
+ }
+ // 'Undo' what we did by filling the remainder of what we read with R.
+ for (; writeL + N <= i; writeL += N) {
+ StoreU(valueR, d, keys + writeL);
+ }
+ BlendedStore(valueR, FirstN(d, i - writeL), d, keys + writeL);
+ return false;
+ }
+ StoreU(valueL, d, keys + writeL);
+ writeL += CountTrue(d, eqL);
+ }
+
+ // Final vector, masked comparison (no effect if i == num)
+ const size_t remaining = num - i;
+ SafeCopyN(remaining, d, keys + i, buf);
+ const Vec<D> v = Load(d, buf);
+ const Mask<D> valid = FirstN(d, remaining);
+ const Mask<D> eqL = And(st.EqualKeys(d, v, valueL), valid);
+ const Mask<D> eqR = st.EqualKeys(d, v, valueR);
+ // Invalid lanes are considered equal.
+ const Mask<D> eq = Or(Or(eqL, eqR), Not(valid));
+ // At least one other value present; will require a regular partition.
+ if (HWY_UNLIKELY(!AllTrue(d, eq))) {
+ const size_t lane = FindKnownFirstTrue(d, Not(eq));
+ third = st.SetKey(d, keys + i + lane);
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "found 3rd value at partial vec %zu; writeL %zu\n", i,
+ writeL);
+ }
+ // 'Undo' what we did by filling the remainder of what we read with R.
+ for (; writeL + N <= i; writeL += N) {
+ StoreU(valueR, d, keys + writeL);
+ }
+ BlendedStore(valueR, FirstN(d, i - writeL), d, keys + writeL);
+ return false;
+ }
+ BlendedStore(valueL, valid, d, keys + writeL);
+ writeL += CountTrue(d, eqL);
+
+ // Fill right side
+ i = writeL;
+ for (; i + N <= num; i += N) {
+ StoreU(valueR, d, keys + i);
+ }
+ BlendedStore(valueR, FirstN(d, num - i), d, keys + i);
+
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Successful MaybePartitionTwoValue\n");
+ }
+ return true;
+}
+
+// Same as above, except that the pivot equals valueR, so scan right to left.
+template <class D, class Traits, typename T>
+HWY_INLINE bool MaybePartitionTwoValueR(D d, Traits st, T* HWY_RESTRICT keys,
+ size_t num, const Vec<D> valueL,
+ const Vec<D> valueR, Vec<D>& third,
+ T* HWY_RESTRICT buf) {
+ const size_t N = Lanes(d);
+
+ HWY_DASSERT(num >= N);
+ size_t pos = num - N; // current read/write position
+ size_t countR = 0; // number of valueR found
+
+ // For whole vectors, in descending address order: as long as all lanes are
+ // equal to L or R, overwrite with valueR. This is faster than counting, then
+ // filling both L and R. Loop terminates after unsigned wraparound.
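+  // For example (hypothetical sizes), with num = 24 and N = 8 and no third
+  // value found: pos takes the values 16, 8, 0; the next pos -= N wraps around
+  // to a huge value >= num, which exits the loop.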
+ for (; pos < num; pos -= N) {
+ const Vec<D> v = LoadU(d, keys + pos);
+ // It is not clear how to apply OrXor here - that can check if *both*
+ // comparisons are true, but here we want *either*. Comparing the unsigned
+ // min of differences to zero works, but is expensive for u64 prior to AVX3.
+ const Mask<D> eqL = st.EqualKeys(d, v, valueL);
+ const Mask<D> eqR = st.EqualKeys(d, v, valueR);
+ // If there is a third value, stop and undo what we've done. On AVX-512,
+ // Or + AllTrue are folded into a single kortest, but only if we are
+ // careful with the FindKnownFirstTrue argument - see prior comment on that.
+ if (HWY_UNLIKELY(!AllTrue(d, Or(eqL, eqR)))) {
+ const size_t lane = FindKnownFirstTrue(d, ExclusiveNeither(eqL, eqR));
+ third = st.SetKey(d, keys + pos + lane);
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "found 3rd value at vec %zu; countR %zu\n", pos,
+ countR);
+ MaybePrintVector(d, "third", third, 0, st.LanesPerKey());
+ }
+ pos += N; // rewind: we haven't yet committed changes in this iteration.
+ // We have filled [pos, num) with R, but only countR of them should have
+ // been written. Rewrite [pos, num - countR) to L.
+ HWY_DASSERT(countR <= num - pos);
+ const size_t endL = num - countR;
+ for (; pos + N <= endL; pos += N) {
+ StoreU(valueL, d, keys + pos);
+ }
+ BlendedStore(valueL, FirstN(d, endL - pos), d, keys + pos);
+ return false;
+ }
+ StoreU(valueR, d, keys + pos);
+ countR += CountTrue(d, eqR);
+ }
+
+ // Final partial (or empty) vector, masked comparison.
+ const size_t remaining = pos + N;
+ HWY_DASSERT(remaining <= N);
+ const Vec<D> v = LoadU(d, keys); // Safe because num >= N.
+ const Mask<D> valid = FirstN(d, remaining);
+ const Mask<D> eqL = st.EqualKeys(d, v, valueL);
+ const Mask<D> eqR = And(st.EqualKeys(d, v, valueR), valid);
+ // Invalid lanes are considered equal.
+ const Mask<D> eq = Or(Or(eqL, eqR), Not(valid));
+ // At least one other value present; will require a regular partition.
+ if (HWY_UNLIKELY(!AllTrue(d, eq))) {
+ const size_t lane = FindKnownFirstTrue(d, Not(eq));
+ third = st.SetKey(d, keys + lane);
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "found 3rd value at partial vec %zu; writeR %zu\n", pos,
+ countR);
+ MaybePrintVector(d, "third", third, 0, st.LanesPerKey());
+ }
+ pos += N; // rewind: we haven't yet committed changes in this iteration.
+ // We have filled [pos, num) with R, but only countR of them should have
+ // been written. Rewrite [pos, num - countR) to L.
+ HWY_DASSERT(countR <= num - pos);
+ const size_t endL = num - countR;
+ for (; pos + N <= endL; pos += N) {
+ StoreU(valueL, d, keys + pos);
+ }
+ BlendedStore(valueL, FirstN(d, endL - pos), d, keys + pos);
+ return false;
+ }
+ const size_t lastR = CountTrue(d, eqR);
+ countR += lastR;
+
+  // First finish writing valueR: lanes [0, N) were not yet written.
+ StoreU(valueR, d, keys); // Safe because num >= N.
+
+ // Fill left side (ascending order for clarity)
+ const size_t endL = num - countR;
+ size_t i = 0;
+ for (; i + N <= endL; i += N) {
+ StoreU(valueL, d, keys + i);
+ }
+ Store(valueL, d, buf);
+ SafeCopyN(endL - i, d, buf, keys + i); // avoids asan overrun
+
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr,
+ "MaybePartitionTwoValueR countR %zu pos %zu i %zu endL %zu\n",
+ countR, pos, i, endL);
+ }
+
+ return true;
+}
+
+// `idx_second` is `first_mismatch` from `AllEqual` and thus the index of the
+// second key. This is the first path into `MaybePartitionTwoValue`, called
+// when all samples are equal. Returns false, and sets `third`, if there is at
+// least a third distinct value. Otherwise, partitions the array and returns
+// true.
+template <class D, class Traits, typename T>
+HWY_INLINE bool PartitionIfTwoKeys(D d, Traits st, const Vec<D> pivot,
+ T* HWY_RESTRICT keys, size_t num,
+ const size_t idx_second, const Vec<D> second,
+ Vec<D>& third, T* HWY_RESTRICT buf) {
+ // True if second comes before pivot.
+ const bool is_pivotR = AllFalse(d, st.Compare(d, pivot, second));
+ if (VQSORT_PRINT >= 1) {
+ fprintf(stderr, "Samples all equal, diff at %zu, isPivotR %d\n", idx_second,
+ is_pivotR);
+ }
+ HWY_DASSERT(AllFalse(d, st.EqualKeys(d, second, pivot)));
+
+ // If pivot is R, we scan backwards over the entire array. Otherwise,
+ // we already scanned up to idx_second and can leave those in place.
+ return is_pivotR ? MaybePartitionTwoValueR(d, st, keys, num, second, pivot,
+ third, buf)
+ : MaybePartitionTwoValue(d, st, keys + idx_second,
+ num - idx_second, pivot, second,
+ third, buf);
+}
+
+// Second path into `MaybePartitionTwoValue`, called when not all samples are
+// equal. `samples` is sorted.
+template <class D, class Traits, typename T>
+HWY_INLINE bool PartitionIfTwoSamples(D d, Traits st, T* HWY_RESTRICT keys,
+ size_t num, T* HWY_RESTRICT samples) {
+ constexpr size_t kSampleLanes = Constants::SampleLanes<T>();
+ constexpr size_t N1 = st.LanesPerKey();
+ const Vec<D> valueL = st.SetKey(d, samples);
+ const Vec<D> valueR = st.SetKey(d, samples + kSampleLanes - N1);
+ HWY_DASSERT(AllTrue(d, st.Compare(d, valueL, valueR)));
+ HWY_DASSERT(AllFalse(d, st.EqualKeys(d, valueL, valueR)));
+ const Vec<D> prev = st.PrevValue(d, valueR);
+ // If the sample has more than two values, then the keys have at least that
+ // many, and thus this special case is inapplicable.
+ if (HWY_UNLIKELY(!AllTrue(d, st.EqualKeys(d, valueL, prev)))) {
+ return false;
+ }
+
+ // Must not overwrite samples because if this returns false, caller wants to
+ // read the original samples again.
+ T* HWY_RESTRICT buf = samples + kSampleLanes;
+ Vec<D> third; // unused
+ return MaybePartitionTwoValue(d, st, keys, num, valueL, valueR, third, buf);
+}
+
+// ------------------------------ Pivot sampling
+
+template <class Traits, class V>
+HWY_INLINE V MedianOf3(Traits st, V v0, V v1, V v2) {
+ const DFromV<V> d;
+ // Slightly faster for 128-bit, apparently because not serially dependent.
+ if (st.Is128()) {
+ // Median = XOR-sum 'minus' the first and last. Calling First twice is
+ // slightly faster than Compare + 2 IfThenElse or even IfThenElse + XOR.
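+    // Worked example with scalar stand-ins: for keys {5, 9, 3}, sum = 5^9^3,
+    // first = 3, last = 9; XOR-ing sum with first and last cancels the min
+    // and max, leaving the median 5.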
+ const auto sum = Xor(Xor(v0, v1), v2);
+ const auto first = st.First(d, st.First(d, v0, v1), v2);
+ const auto last = st.Last(d, st.Last(d, v0, v1), v2);
+ return Xor(Xor(sum, first), last);
+ }
+ st.Sort2(d, v0, v2);
+ v1 = st.Last(d, v0, v1);
+ v1 = st.First(d, v1, v2);
+ return v1;
+}
+
+// Based on https://github.com/numpy/numpy/issues/16313#issuecomment-641897028
+HWY_INLINE uint64_t RandomBits(uint64_t* HWY_RESTRICT state) {
+ const uint64_t a = state[0];
+ const uint64_t b = state[1];
+ const uint64_t w = state[2] + 1;
+ const uint64_t next = a ^ w;
+ state[0] = (b + (b << 3)) ^ (b >> 11);
+ const uint64_t rot = (b << 24) | (b >> 40);
+ state[1] = rot + next;
+ state[2] = w;
+ return next;
+}
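+// A minimal sketch of standalone use (seed values hypothetical; production
+// code seeds via GetGeneratorState):
+//   uint64_t state[3] = {0x123456789ABCDEF0ull, 0xFEDCBA9876543210ull, 1};
+//   const uint64_t r = RandomBits(state);  // returns bits, advances state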
+
+// Returns slightly biased random index of a chunk in [0, num_chunks).
+// See https://www.pcg-random.org/posts/bounded-rands.html.
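+// Example: bits = 0x80000000 (i.e. 2^31, half the u32 range) and
+// num_chunks = 10 yield (2^31 * 10) >> 32 = 5, the middle chunk.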
+HWY_INLINE size_t RandomChunkIndex(const uint32_t num_chunks, uint32_t bits) {
+ const uint64_t chunk_index = (static_cast<uint64_t>(bits) * num_chunks) >> 32;
+ HWY_DASSERT(chunk_index < num_chunks);
+ return static_cast<size_t>(chunk_index);
+}
+
+// Writes samples from `keys[0, num)` into `buf`.
+template <class D, class Traits, typename T>
+HWY_INLINE void DrawSamples(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
+ T* HWY_RESTRICT buf, uint64_t* HWY_RESTRICT state) {
+ using V = decltype(Zero(d));
+ const size_t N = Lanes(d);
+
+ // Power of two
+ constexpr size_t kLanesPerChunk = Constants::LanesPerChunk(sizeof(T));
+
+ // Align start of keys to chunks. We have at least 2 chunks (x 64 bytes)
+ // because the base case handles anything up to 8 vectors (x 16 bytes).
+ HWY_DASSERT(num >= Constants::SampleLanes<T>());
+ const size_t misalign =
+ (reinterpret_cast<uintptr_t>(keys) / sizeof(T)) & (kLanesPerChunk - 1);
+ if (misalign != 0) {
+ const size_t consume = kLanesPerChunk - misalign;
+ keys += consume;
+ num -= consume;
+ }
+
+  // Generate enough random bits for 6 uint32 values.
+ uint32_t bits[6];
+ for (size_t i = 0; i < 6; i += 2) {
+ const uint64_t bits64 = RandomBits(state);
+ CopyBytes<8>(&bits64, bits + i);
+ }
+
+ const size_t num_chunks64 = num / kLanesPerChunk;
+ // Clamp to uint32 for RandomChunkIndex
+ const uint32_t num_chunks =
+ static_cast<uint32_t>(HWY_MIN(num_chunks64, 0xFFFFFFFFull));
+
+ const size_t offset0 = RandomChunkIndex(num_chunks, bits[0]) * kLanesPerChunk;
+ const size_t offset1 = RandomChunkIndex(num_chunks, bits[1]) * kLanesPerChunk;
+ const size_t offset2 = RandomChunkIndex(num_chunks, bits[2]) * kLanesPerChunk;
+ const size_t offset3 = RandomChunkIndex(num_chunks, bits[3]) * kLanesPerChunk;
+ const size_t offset4 = RandomChunkIndex(num_chunks, bits[4]) * kLanesPerChunk;
+ const size_t offset5 = RandomChunkIndex(num_chunks, bits[5]) * kLanesPerChunk;
+ for (size_t i = 0; i < kLanesPerChunk; i += N) {
+ const V v0 = Load(d, keys + offset0 + i);
+ const V v1 = Load(d, keys + offset1 + i);
+ const V v2 = Load(d, keys + offset2 + i);
+ const V medians0 = MedianOf3(st, v0, v1, v2);
+ Store(medians0, d, buf + i);
+
+ const V v3 = Load(d, keys + offset3 + i);
+ const V v4 = Load(d, keys + offset4 + i);
+ const V v5 = Load(d, keys + offset5 + i);
+ const V medians1 = MedianOf3(st, v3, v4, v5);
+ Store(medians1, d, buf + i + kLanesPerChunk);
+ }
+}
+
+// For detecting inputs where (almost) all keys are equal.
+template <class D, class Traits>
+HWY_INLINE bool UnsortedSampleEqual(D d, Traits st,
+ const TFromD<D>* HWY_RESTRICT samples) {
+ constexpr size_t kSampleLanes = Constants::SampleLanes<TFromD<D>>();
+ const size_t N = Lanes(d);
+ // Both are powers of two, so there will be no remainders.
+ HWY_DASSERT(N < kSampleLanes);
+ using V = Vec<D>;
+
+ const V first = st.SetKey(d, samples);
+ // OR of XOR-difference may be faster than comparison.
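+  // OrXor(diff, first, v) accumulates Or(diff, Xor(first, v)), so diff stays
+  // all-zero iff every loaded key matches `first`.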
+ V diff = Zero(d);
+ for (size_t i = 0; i < kSampleLanes; i += N) {
+ const V v = Load(d, samples + i);
+ diff = OrXor(diff, first, v);
+ }
+
+ return st.NoKeyDifference(d, diff);
+}
+
+template <class D, class Traits, typename T>
+HWY_INLINE void SortSamples(D d, Traits st, T* HWY_RESTRICT buf) {
+ const size_t N = Lanes(d);
+ constexpr size_t kSampleLanes = Constants::SampleLanes<T>();
+ // Network must be large enough to sort two chunks.
+ HWY_DASSERT(Constants::BaseCaseNumLanes<st.LanesPerKey()>(N) >= kSampleLanes);
+
+ BaseCase(d, st, buf, kSampleLanes, buf + kSampleLanes);
+
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Samples:\n");
+ for (size_t i = 0; i < kSampleLanes; i += N) {
+ MaybePrintVector(d, "", Load(d, buf + i), 0, N);
+ }
+ }
+}
+
+// ------------------------------ Pivot selection
+
+enum class PivotResult {
+ kDone, // stop without partitioning (all equal, or two-value partition)
+ kNormal, // partition and recurse left and right
+ kIsFirst, // partition but skip left recursion
+ kWasLast, // partition but skip right recursion
+};
+
+HWY_INLINE const char* PivotResultString(PivotResult result) {
+ switch (result) {
+ case PivotResult::kDone:
+ return "done";
+ case PivotResult::kNormal:
+ return "normal";
+ case PivotResult::kIsFirst:
+ return "first";
+ case PivotResult::kWasLast:
+ return "last";
+ }
+ return "unknown";
+}
+
+// (Could vectorize, but only 0.2% of total time)
+template <class Traits, typename T>
+HWY_INLINE size_t PivotRank(Traits st, const T* HWY_RESTRICT samples) {
+ constexpr size_t kSampleLanes = Constants::SampleLanes<T>();
+ constexpr size_t N1 = st.LanesPerKey();
+
+ constexpr size_t kRankMid = kSampleLanes / 2;
+ static_assert(kRankMid % N1 == 0, "Mid is not an aligned key");
+
+ // Find the previous value not equal to the median.
+ size_t rank_prev = kRankMid - N1;
+ for (; st.Equal1(samples + rank_prev, samples + kRankMid); rank_prev -= N1) {
+ // All previous samples are equal to the median.
+ if (rank_prev == 0) return 0;
+ }
+
+ size_t rank_next = rank_prev + N1;
+ for (; st.Equal1(samples + rank_next, samples + kRankMid); rank_next += N1) {
+ // The median is also the largest sample. If it is also the largest key,
+ // we'd end up with an empty right partition, so choose the previous key.
+ if (rank_next == kSampleLanes - N1) return rank_prev;
+ }
+
+  // If we choose the median as pivot, the ratio of keys ending up in the left
+  // partition will likely be rank_next/kSampleLanes (if the sample is
+  // representative). This is because equal-to-pivot values also land in the
+  // left partition - an in-place vectorized 3-way partition is infeasible.
+  // Check whether prev would lead to a more balanced partition.
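+  // For example (hypothetical counts), with kSampleLanes = 64 (kRankMid = 32),
+  // rank_prev = 20 and rank_next = 40: excess_if_median = 8 beats
+  // excess_if_prev = 12, so the median is the more balanced pivot.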
+ const size_t excess_if_median = rank_next - kRankMid;
+ const size_t excess_if_prev = kRankMid - rank_prev;
+ return excess_if_median < excess_if_prev ? kRankMid : rank_prev;
+}
+
+// Returns pivot chosen from `samples`. It will never be the largest key
+// (thus the right partition will never be empty).
+template <class D, class Traits, typename T>
+HWY_INLINE Vec<D> ChoosePivotByRank(D d, Traits st,
+ const T* HWY_RESTRICT samples) {
+ const size_t pivot_rank = PivotRank(st, samples);
+ const Vec<D> pivot = st.SetKey(d, samples + pivot_rank);
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, " Pivot rank %zu = %f\n", pivot_rank,
+ static_cast<double>(GetLane(pivot)));
+ }
+ // Verify pivot is not equal to the last sample.
+ constexpr size_t kSampleLanes = Constants::SampleLanes<T>();
+ constexpr size_t N1 = st.LanesPerKey();
+ const Vec<D> last = st.SetKey(d, samples + kSampleLanes - N1);
+ const bool all_neq = AllTrue(d, st.NotEqualKeys(d, pivot, last));
+ (void)all_neq;
+ HWY_DASSERT(all_neq);
+ return pivot;
+}
+
+// Returns true if all keys equal `pivot`, otherwise returns false and sets
+// `*first_mismatch` to the index of the first differing key.
+template <class D, class Traits, typename T>
+HWY_INLINE bool AllEqual(D d, Traits st, const Vec<D> pivot,
+ const T* HWY_RESTRICT keys, size_t num,
+ size_t* HWY_RESTRICT first_mismatch) {
+ const size_t N = Lanes(d);
+ // Ensures we can use overlapping loads for the tail; see HandleSpecialCases.
+ HWY_DASSERT(num >= N);
+ const Vec<D> zero = Zero(d);
+
+ // Vector-align keys + i.
+ const size_t misalign =
+ (reinterpret_cast<uintptr_t>(keys) / sizeof(T)) & (N - 1);
+ HWY_DASSERT(misalign % st.LanesPerKey() == 0);
+ const size_t consume = N - misalign;
+ {
+ const Vec<D> v = LoadU(d, keys);
+ // Only check masked lanes; consider others to be equal.
+ const Mask<D> diff = And(FirstN(d, consume), st.NotEqualKeys(d, v, pivot));
+ if (HWY_UNLIKELY(!AllFalse(d, diff))) {
+ const size_t lane = FindKnownFirstTrue(d, diff);
+ *first_mismatch = lane;
+ return false;
+ }
+ }
+ size_t i = consume;
+ HWY_DASSERT(((reinterpret_cast<uintptr_t>(keys + i) / sizeof(T)) & (N - 1)) ==
+ 0);
+
+  // Sticky bits registering any difference between `keys` and `pivot`.
+ // We use vector XOR because it may be cheaper than comparisons, especially
+ // for 128-bit. 2x unrolled for more ILP.
+ Vec<D> diff0 = zero;
+ Vec<D> diff1 = zero;
+
+ // We want to stop once a difference has been found, but without slowing
+ // down the loop by comparing during each iteration. The compromise is to
+ // compare after a 'group', which consists of kLoops times two vectors.
+ constexpr size_t kLoops = 8;
+ const size_t lanes_per_group = kLoops * 2 * N;
+
+ for (; i + lanes_per_group <= num; i += lanes_per_group) {
+ HWY_DEFAULT_UNROLL
+ for (size_t loop = 0; loop < kLoops; ++loop) {
+ const Vec<D> v0 = Load(d, keys + i + loop * 2 * N);
+ const Vec<D> v1 = Load(d, keys + i + loop * 2 * N + N);
+ diff0 = OrXor(diff0, v0, pivot);
+ diff1 = OrXor(diff1, v1, pivot);
+ }
+
+ // If there was a difference in the entire group:
+ if (HWY_UNLIKELY(!st.NoKeyDifference(d, Or(diff0, diff1)))) {
+      // ... then loop until the first difference; termination is guaranteed
+      // because this group is known to contain one.
+ for (;; i += N) {
+ const Vec<D> v = Load(d, keys + i);
+ const Mask<D> diff = st.NotEqualKeys(d, v, pivot);
+ if (HWY_UNLIKELY(!AllFalse(d, diff))) {
+ const size_t lane = FindKnownFirstTrue(d, diff);
+ *first_mismatch = i + lane;
+ return false;
+ }
+ }
+ }
+ }
+
+ // Whole vectors, no unrolling, compare directly
+ for (; i + N <= num; i += N) {
+ const Vec<D> v = Load(d, keys + i);
+ const Mask<D> diff = st.NotEqualKeys(d, v, pivot);
+ if (HWY_UNLIKELY(!AllFalse(d, diff))) {
+ const size_t lane = FindKnownFirstTrue(d, diff);
+ *first_mismatch = i + lane;
+ return false;
+ }
+ }
+ // Always re-check the last (unaligned) vector to reduce branching.
+ i = num - N;
+ const Vec<D> v = LoadU(d, keys + i);
+ const Mask<D> diff = st.NotEqualKeys(d, v, pivot);
+ if (HWY_UNLIKELY(!AllFalse(d, diff))) {
+ const size_t lane = FindKnownFirstTrue(d, diff);
+ *first_mismatch = i + lane;
+ return false;
+ }
+
+ if (VQSORT_PRINT >= 1) {
+ fprintf(stderr, "All keys equal\n");
+ }
+ return true; // all equal
+}
+
+// Called from two locations, but only one is active per Traits because IsKV
+// is constexpr.
+template <class D, class Traits, typename T>
+HWY_INLINE bool ExistsAnyBefore(D d, Traits st, const T* HWY_RESTRICT keys,
+ size_t num, const Vec<D> pivot) {
+ const size_t N = Lanes(d);
+ HWY_DASSERT(num >= N); // See HandleSpecialCases
+
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Scanning for before\n");
+ }
+
+ size_t i = 0;
+
+ constexpr size_t kLoops = 16;
+ const size_t lanes_per_group = kLoops * N;
+
+ Vec<D> first = pivot;
+
+ // Whole group, unrolled
+ for (; i + lanes_per_group <= num; i += lanes_per_group) {
+ HWY_DEFAULT_UNROLL
+ for (size_t loop = 0; loop < kLoops; ++loop) {
+ const Vec<D> curr = LoadU(d, keys + i + loop * N);
+ first = st.First(d, first, curr);
+ }
+
+ if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, first, pivot)))) {
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Stopped scanning at end of group %zu\n",
+ i + lanes_per_group);
+ }
+ return true;
+ }
+ }
+ // Whole vectors, no unrolling
+ for (; i + N <= num; i += N) {
+ const Vec<D> curr = LoadU(d, keys + i);
+ if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, curr, pivot)))) {
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Stopped scanning at %zu\n", i);
+ }
+ return true;
+ }
+ }
+ // If there are remainders, re-check the last whole vector.
+ if (HWY_LIKELY(i != num)) {
+ const Vec<D> curr = LoadU(d, keys + num - N);
+ if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, curr, pivot)))) {
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Stopped scanning at last %zu\n", num - N);
+ }
+ return true;
+ }
+ }
+
+ return false; // pivot is the first
+}
+
+// Called from two locations, but only one is active per Traits because IsKV
+// is constexpr.
+template <class D, class Traits, typename T>
+HWY_INLINE bool ExistsAnyAfter(D d, Traits st, const T* HWY_RESTRICT keys,
+ size_t num, const Vec<D> pivot) {
+ const size_t N = Lanes(d);
+ HWY_DASSERT(num >= N); // See HandleSpecialCases
+
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Scanning for after\n");
+ }
+
+ size_t i = 0;
+
+ constexpr size_t kLoops = 16;
+ const size_t lanes_per_group = kLoops * N;
+
+ Vec<D> last = pivot;
+
+ // Whole group, unrolled
+ for (; i + lanes_per_group <= num; i += lanes_per_group) {
+ HWY_DEFAULT_UNROLL
+ for (size_t loop = 0; loop < kLoops; ++loop) {
+ const Vec<D> curr = LoadU(d, keys + i + loop * N);
+ last = st.Last(d, last, curr);
+ }
+
+ if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, pivot, last)))) {
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Stopped scanning at end of group %zu\n",
+ i + lanes_per_group);
+ }
+ return true;
+ }
+ }
+ // Whole vectors, no unrolling
+ for (; i + N <= num; i += N) {
+ const Vec<D> curr = LoadU(d, keys + i);
+ if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, pivot, curr)))) {
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Stopped scanning at %zu\n", i);
+ }
+ return true;
+ }
+ }
+ // If there are remainders, re-check the last whole vector.
+ if (HWY_LIKELY(i != num)) {
+ const Vec<D> curr = LoadU(d, keys + num - N);
+ if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, pivot, curr)))) {
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Stopped scanning at last %zu\n", num - N);
+ }
+ return true;
+ }
+ }
+
+ return false; // pivot is the last
+}
+
+// Returns pivot chosen from `keys[0, num)`. It will never be the largest key
+// (thus the right partition will never be empty).
+template <class D, class Traits, typename T>
+HWY_INLINE Vec<D> ChoosePivotForEqualSamples(D d, Traits st,
+ T* HWY_RESTRICT keys, size_t num,
+ T* HWY_RESTRICT samples,
+ Vec<D> second, Vec<D> third,
+ PivotResult& result) {
+ const Vec<D> pivot = st.SetKey(d, samples); // the single unique sample
+
+ // Early out for mostly-0 arrays, where pivot is often FirstValue.
+ if (HWY_UNLIKELY(AllTrue(d, st.EqualKeys(d, pivot, st.FirstValue(d))))) {
+ result = PivotResult::kIsFirst;
+ return pivot;
+ }
+ if (HWY_UNLIKELY(AllTrue(d, st.EqualKeys(d, pivot, st.LastValue(d))))) {
+ result = PivotResult::kWasLast;
+ return st.PrevValue(d, pivot);
+ }
+
+ // If key-value, we didn't run PartitionIfTwo* and thus `third` is unknown and
+ // cannot be used.
+ if (st.IsKV()) {
+ // If true, pivot is either middle or last.
+ const bool before = !AllFalse(d, st.Compare(d, second, pivot));
+ if (HWY_UNLIKELY(before)) {
+ // Not last, so middle.
+ if (HWY_UNLIKELY(ExistsAnyAfter(d, st, keys, num, pivot))) {
+ result = PivotResult::kNormal;
+ return pivot;
+ }
+
+ // We didn't find anything after pivot, so it is the last. Because keys
+ // equal to the pivot go to the left partition, the right partition would
+ // be empty and Partition will not have changed anything. Instead use the
+ // previous value in sort order, which is not necessarily an actual key.
+ result = PivotResult::kWasLast;
+ return st.PrevValue(d, pivot);
+ }
+
+ // Otherwise, pivot is first or middle. Rule out it being first:
+ if (HWY_UNLIKELY(ExistsAnyBefore(d, st, keys, num, pivot))) {
+ result = PivotResult::kNormal;
+ return pivot;
+ }
+ // It is first: fall through to shared code below.
+ } else {
+ // Check if pivot is between two known values. If so, it is not the first
+ // nor the last and we can avoid scanning.
+ st.Sort2(d, second, third);
+ HWY_DASSERT(AllTrue(d, st.Compare(d, second, third)));
+ const bool before = !AllFalse(d, st.Compare(d, second, pivot));
+ const bool after = !AllFalse(d, st.Compare(d, pivot, third));
+    // Only reached if there are at least three distinct keys, which means the
+    // pivot is either first, last, or in between. Thus another key comes
+    // before or after it.
+ HWY_DASSERT(before || after);
+ if (HWY_UNLIKELY(before)) {
+ // Neither first nor last.
+ if (HWY_UNLIKELY(after || ExistsAnyAfter(d, st, keys, num, pivot))) {
+ result = PivotResult::kNormal;
+ return pivot;
+ }
+
+ // We didn't find anything after pivot, so it is the last. Because keys
+ // equal to the pivot go to the left partition, the right partition would
+ // be empty and Partition will not have changed anything. Instead use the
+ // previous value in sort order, which is not necessarily an actual key.
+ result = PivotResult::kWasLast;
+ return st.PrevValue(d, pivot);
+ }
+
+ // Has after, and we found one before: in the middle.
+ if (HWY_UNLIKELY(ExistsAnyBefore(d, st, keys, num, pivot))) {
+ result = PivotResult::kNormal;
+ return pivot;
+ }
+ }
+
+ // Pivot is first. We could consider a special partition mode that only
+ // reads from and writes to the right side, and later fills in the left
+ // side, which we know is equal to the pivot. However, that leads to more
+ // cache misses if the array is large, and doesn't save much, hence is a
+ // net loss.
+ result = PivotResult::kIsFirst;
+ return pivot;
+}
+
+// ------------------------------ Quicksort recursion
+
+template <class D, class Traits, typename T>
+HWY_NOINLINE void PrintMinMax(D d, Traits st, const T* HWY_RESTRICT keys,
+ size_t num, T* HWY_RESTRICT buf) {
+ if (VQSORT_PRINT >= 2) {
+ const size_t N = Lanes(d);
+ if (num < N) return;
+
+ Vec<D> first = st.LastValue(d);
+ Vec<D> last = st.FirstValue(d);
+
+ size_t i = 0;
+ for (; i + N <= num; i += N) {
+ const Vec<D> v = LoadU(d, keys + i);
+ first = st.First(d, v, first);
+ last = st.Last(d, v, last);
+ }
+ if (HWY_LIKELY(i != num)) {
+ HWY_DASSERT(num >= N); // See HandleSpecialCases
+ const Vec<D> v = LoadU(d, keys + num - N);
+ first = st.First(d, v, first);
+ last = st.Last(d, v, last);
+ }
+
+ first = st.FirstOfLanes(d, first, buf);
+ last = st.LastOfLanes(d, last, buf);
+ MaybePrintVector(d, "first", first, 0, st.LanesPerKey());
+ MaybePrintVector(d, "last", last, 0, st.LanesPerKey());
+ }
+}
+
+template <class D, class Traits, typename T>
+HWY_NOINLINE void Recurse(D d, Traits st, T* HWY_RESTRICT keys,
+ const size_t num, T* HWY_RESTRICT buf,
+ uint64_t* HWY_RESTRICT state,
+ const size_t remaining_levels) {
+ HWY_DASSERT(num != 0);
+
+ const size_t N = Lanes(d);
+ constexpr size_t kLPK = st.LanesPerKey();
+ if (HWY_UNLIKELY(num <= Constants::BaseCaseNumLanes<kLPK>(N))) {
+ BaseCase(d, st, keys, num, buf);
+ return;
+ }
+
+  // Placed after BaseCase so we skip printing for small subarrays.
+ if (VQSORT_PRINT >= 1) {
+ fprintf(stderr, "\n\n=== Recurse depth=%zu len=%zu\n", remaining_levels,
+ num);
+ PrintMinMax(d, st, keys, num, buf);
+ }
+
+ DrawSamples(d, st, keys, num, buf, state);
+
+ Vec<D> pivot;
+ PivotResult result = PivotResult::kNormal;
+ if (HWY_UNLIKELY(UnsortedSampleEqual(d, st, buf))) {
+ pivot = st.SetKey(d, buf);
+ size_t idx_second = 0;
+ if (HWY_UNLIKELY(AllEqual(d, st, pivot, keys, num, &idx_second))) {
+ return;
+ }
+ HWY_DASSERT(idx_second % st.LanesPerKey() == 0);
+ // Must capture the value before PartitionIfTwoKeys may overwrite it.
+ const Vec<D> second = st.SetKey(d, keys + idx_second);
+ MaybePrintVector(d, "pivot", pivot, 0, st.LanesPerKey());
+ MaybePrintVector(d, "second", second, 0, st.LanesPerKey());
+
+ Vec<D> third;
+ // Not supported for key-value types because two 'keys' may be equivalent
+ // but not interchangeable (their values may differ).
+ if (HWY_UNLIKELY(!st.IsKV() &&
+ PartitionIfTwoKeys(d, st, pivot, keys, num, idx_second,
+ second, third, buf))) {
+ return; // Done, skip recursion because each side has all-equal keys.
+ }
+
+ // We can no longer start scanning from idx_second because
+ // PartitionIfTwoKeys may have reordered keys.
+ pivot = ChoosePivotForEqualSamples(d, st, keys, num, buf, second, third,
+ result);
+ // If kNormal, `pivot` is very common but not the first/last. It is
+ // tempting to do a 3-way partition (to avoid moving the =pivot keys a
+ // second time), but that is a net loss due to the extra comparisons.
+ } else {
+ SortSamples(d, st, buf);
+
+ // Not supported for key-value types because two 'keys' may be equivalent
+ // but not interchangeable (their values may differ).
+ if (HWY_UNLIKELY(!st.IsKV() &&
+ PartitionIfTwoSamples(d, st, keys, num, buf))) {
+ return;
+ }
+
+ pivot = ChoosePivotByRank(d, st, buf);
+ }
+
+ // Too many recursions. This is unlikely to happen because we select pivots
+ // from large (though still O(1)) samples.
+ if (HWY_UNLIKELY(remaining_levels == 0)) {
+ if (VQSORT_PRINT >= 1) {
+ fprintf(stderr, "HeapSort reached, size=%zu\n", num);
+ }
+ HeapSort(st, keys, num); // Slow but N*logN.
+ return;
+ }
+
+ const size_t bound = Partition(d, st, keys, num, pivot, buf);
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "bound %zu num %zu result %s\n", bound, num,
+ PivotResultString(result));
+ }
+ // The left partition is not empty because the pivot is one of the keys
+ // (unless kWasLast, in which case the pivot is PrevValue, but we still
+ // have at least one value <= pivot because AllEqual ruled out the case of
+ // only one unique value, and there is exactly one value after pivot).
+ HWY_DASSERT(bound != 0);
+  // ChoosePivot* ensure pivot != last, so the right partition is never empty
+  // except in the rare case of the pivot matching the last-in-sort-order
+  // value, in which case we skip the right partition anyway due to kWasLast.
+ HWY_DASSERT(bound != num || result == PivotResult::kWasLast);
+
+ if (HWY_LIKELY(result != PivotResult::kIsFirst)) {
+ Recurse(d, st, keys, bound, buf, state, remaining_levels - 1);
+ }
+ if (HWY_LIKELY(result != PivotResult::kWasLast)) {
+ Recurse(d, st, keys + bound, num - bound, buf, state, remaining_levels - 1);
+ }
+}
+
+// Returns true if sorting is finished.
+template <class D, class Traits, typename T>
+HWY_INLINE bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys,
+ size_t num, T* HWY_RESTRICT buf) {
+ const size_t N = Lanes(d);
+ constexpr size_t kLPK = st.LanesPerKey();
+ const size_t base_case_num = Constants::BaseCaseNumLanes<kLPK>(N);
+
+ // Recurse will also check this, but doing so here first avoids setting up
+ // the random generator state.
+ if (HWY_UNLIKELY(num <= base_case_num)) {
+ BaseCase(d, st, keys, num, buf);
+ return true;
+ }
+
+ // 128-bit keys require vectors with at least two u64 lanes, which is always
+ // the case unless `d` requests partial vectors (e.g. fraction = 1/2) AND the
+  // hardware vector width is less than 128 bits / fraction.
+ const bool partial_128 = !IsFull(d) && N < 2 && st.Is128();
+  // Partition assumes its input is at least two vectors. If vectors are huge,
+  // base_case_num may actually be smaller. If so (only possible on RVV),
+  // pass a capped or partial d (LMUL < 1). Use HWY_MAX_BYTES instead of
+  // HWY_LANES to account for the largest possible LMUL.
+ constexpr bool kPotentiallyHuge =
+ HWY_MAX_BYTES / sizeof(T) > Constants::kMaxRows * Constants::kMaxCols;
+ const bool huge_vec = kPotentiallyHuge && (2 * N > base_case_num);
+ if (partial_128 || huge_vec) {
+ if (VQSORT_PRINT >= 1) {
+ fprintf(stderr, "WARNING: using slow HeapSort: partial %d huge %d\n",
+ partial_128, huge_vec);
+ }
+ HeapSort(st, keys, num);
+ return true;
+ }
+
+ // We could also check for already sorted/reverse/equal, but that's probably
+ // counterproductive if vqsort is used as a base case.
+
+ return false; // not finished sorting
+}
+
+#endif // VQSORT_ENABLED
+} // namespace detail
+
+// Old interface with user-specified buffer, retained for compatibility.
+// `buf` must be vector-aligned and hold at least
+// SortConstants::BufBytes(HWY_MAX_BYTES, st.LanesPerKey()).
+template <class D, class Traits, typename T>
+void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
+ T* HWY_RESTRICT buf) {
+ if (VQSORT_PRINT >= 1) {
+ fprintf(stderr, "=============== Sort num %zu\n", num);
+ }
+
+#if VQSORT_ENABLED || HWY_IDE
+ if (detail::HandleSpecialCases(d, st, keys, num, buf)) return;
+
+#if HWY_MAX_BYTES > 64
+  // sorting_networks-inl and traits assume no more than 512-bit vectors.
+ if (HWY_UNLIKELY(Lanes(d) > 64 / sizeof(T))) {
+ return Sort(CappedTag<T, 64 / sizeof(T)>(), st, keys, num, buf);
+ }
+#endif // HWY_MAX_BYTES > 64
+
+ uint64_t* HWY_RESTRICT state = GetGeneratorState();
+  // Introspection: switch to worst-case N*logN heapsort after this many
+  // recursion levels. Should never be reached, so computing log2 exactly
+  // does not help.
+ const size_t max_levels = 50;
+ detail::Recurse(d, st, keys, num, buf, state, max_levels);
+#else // !VQSORT_ENABLED
+ (void)d;
+ (void)buf;
+ if (VQSORT_PRINT >= 1) {
+ fprintf(stderr, "WARNING: using slow HeapSort because vqsort disabled\n");
+ }
+ return detail::HeapSort(st, keys, num);
+#endif // VQSORT_ENABLED
+}
+
+// Sorts `keys[0..num-1]` according to the order defined by `st.Compare`.
+// In-place, i.e. O(1) additional storage. Worst-case N*logN comparisons.
+// Non-stable (order of equal keys may change), except for the common case where
+// the upper bits of T are the key, and the lower bits are a sequential or at
+// least unique ID.
+// There is no upper limit on `num`, but note that pivots may be chosen by
+// sampling only from the first 256 GiB.
+//
+// `d` is typically SortTag<T> (chooses between full and partial vectors).
+// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
+// differences in sort order and single-lane vs 128-bit keys.
+template <class D, class Traits, typename T>
+HWY_API void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num) {
+ constexpr size_t kLPK = st.LanesPerKey();
+ HWY_ALIGN T buf[SortConstants::BufBytes<T, kLPK>(HWY_MAX_BYTES) / sizeof(T)];
+ return Sort(d, st, keys, num, buf);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort.cc b/third_party/highway/hwy/contrib/sort/vqsort.cc
new file mode 100644
index 0000000000..e4ec91d9ce
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort.cc
@@ -0,0 +1,124 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#include <time.h>
+
+#include <cstdint>
+
+#include "hwy/base.h"
+#include "hwy/contrib/sort/shared-inl.h"
+
+// Check if we have sys/random.h. First skip some systems on which the check
+// itself (features.h) might be problematic.
+#if defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV
+#define VQSORT_GETRANDOM 0
+#endif
+
+#if !defined(VQSORT_GETRANDOM) && HWY_OS_LINUX
+#include <features.h>
+
+// ---- which libc
+#if defined(__UCLIBC__)
+#define VQSORT_GETRANDOM 1 // added Mar 2015, before uclibc-ng 1.0
+
+#elif defined(__GLIBC__) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 25)
+#define VQSORT_GETRANDOM 1
+#else
+#define VQSORT_GETRANDOM 0
+#endif
+
+#else
+// Assume MUSL, which has getrandom since 2018. There is no macro to test, see
+// https://www.openwall.com/lists/musl/2013/03/29/13.
+#define VQSORT_GETRANDOM 1
+
+#endif // ---- which libc
+#endif // linux
+
+#if !defined(VQSORT_GETRANDOM)
+#define VQSORT_GETRANDOM 0
+#endif
+
+// Choose a seed source for SFC generator: 1=getrandom, 2=CryptGenRandom.
+// Allow user override - not all Android systems support the getrandom wrapper.
+#ifndef VQSORT_SECURE_SEED
+
+#if VQSORT_GETRANDOM
+#define VQSORT_SECURE_SEED 1
+#elif defined(_WIN32) || defined(_WIN64)
+#define VQSORT_SECURE_SEED 2
+#else
+#define VQSORT_SECURE_SEED 0
+#endif
+
+#endif // VQSORT_SECURE_SEED
+
+// Pull in dependencies of the chosen seed source.
+#if VQSORT_SECURE_SEED == 1
+#include <sys/random.h>
+#elif VQSORT_SECURE_SEED == 2
+#include <windows.h>
+#pragma comment(lib, "advapi32.lib")
+// Must come after windows.h.
+#include <wincrypt.h>
+#endif // VQSORT_SECURE_SEED
+
+namespace hwy {
+namespace {
+
+void Fill16Bytes(void* bytes) {
+#if VQSORT_SECURE_SEED == 1
+ // May block if urandom is not yet initialized.
+ const ssize_t ret = getrandom(bytes, 16, /*flags=*/0);
+ if (ret == 16) return;
+#elif VQSORT_SECURE_SEED == 2
+ HCRYPTPROV hProvider{};
+ if (CryptAcquireContextA(&hProvider, nullptr, nullptr, PROV_RSA_FULL,
+ CRYPT_VERIFYCONTEXT)) {
+ const BOOL ok =
+ CryptGenRandom(hProvider, 16, reinterpret_cast<BYTE*>(bytes));
+ CryptReleaseContext(hProvider, 0);
+ if (ok) return;
+ }
+#endif
+
+  // VQSORT_SECURE_SEED == 0, or one of the above failed. Get some entropy from
+  // stack and code addresses and the clock() timer.
+ uint64_t* words = reinterpret_cast<uint64_t*>(bytes);
+ uint64_t** seed_stack = &words;
+ void (*seed_code)(void*) = &Fill16Bytes;
+ const uintptr_t bits_stack = reinterpret_cast<uintptr_t>(seed_stack);
+ const uintptr_t bits_code = reinterpret_cast<uintptr_t>(seed_code);
+ const uint64_t bits_time = static_cast<uint64_t>(clock());
+ words[0] = bits_stack ^ bits_time ^ 0xFEDCBA98; // "Nothing up my sleeve"
+ words[1] = bits_code ^ bits_time ^ 0x01234567; // constants.
+}
+
+} // namespace
+
+uint64_t* GetGeneratorState() {
+ thread_local uint64_t state[3] = {0};
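+  // state[0..1] hold the generator's random words (see RandomBits in
+  // vqsort-inl.h).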
+  // state[2] is a counter; zero indicates not yet initialized.
+ if (HWY_UNLIKELY(state[2] == 0)) {
+ Fill16Bytes(state);
+ state[2] = 1;
+ }
+ return state;
+}
+
+} // namespace hwy
diff --git a/third_party/highway/hwy/contrib/sort/vqsort.h b/third_party/highway/hwy/contrib/sort/vqsort.h
new file mode 100644
index 0000000000..95c140d140
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort.h
@@ -0,0 +1,221 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Interface to vectorized quicksort with dynamic dispatch.
+// Blog post: https://tinyurl.com/vqsort-blog
+// Paper with measurements: https://arxiv.org/abs/2205.05982
+//
+// To ensure the overhead of using wide vectors (e.g. AVX2 or AVX-512) is
+// worthwhile, we recommend using this code for sorting arrays whose size is at
+// least 512 KiB.
+
+#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
+#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
+
+#include "hwy/base.h"
+
+namespace hwy {
+
+// Tag arguments that determine the sort order.
+struct SortAscending {
+ constexpr bool IsAscending() const { return true; }
+};
+struct SortDescending {
+ constexpr bool IsAscending() const { return false; }
+};
+
+// Vectorized Quicksort: sorts keys[0, n). Dispatches to the best available
+// instruction set and does not allocate memory.
+// Uses about 1.2 KiB stack plus an internal 3-word TLS cache for random state.
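+//
+// Minimal usage sketch (array contents hypothetical):
+//   uint64_t keys[1000] = {/* ... */};
+//   hwy::VQSort(keys, 1000, hwy::SortAscending());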
+HWY_CONTRIB_DLLEXPORT void VQSort(uint16_t* HWY_RESTRICT keys, size_t n,
+ SortAscending);
+HWY_CONTRIB_DLLEXPORT void VQSort(uint16_t* HWY_RESTRICT keys, size_t n,
+ SortDescending);
+HWY_CONTRIB_DLLEXPORT void VQSort(uint32_t* HWY_RESTRICT keys, size_t n,
+ SortAscending);
+HWY_CONTRIB_DLLEXPORT void VQSort(uint32_t* HWY_RESTRICT keys, size_t n,
+ SortDescending);
+HWY_CONTRIB_DLLEXPORT void VQSort(uint64_t* HWY_RESTRICT keys, size_t n,
+ SortAscending);
+HWY_CONTRIB_DLLEXPORT void VQSort(uint64_t* HWY_RESTRICT keys, size_t n,
+ SortDescending);
+HWY_CONTRIB_DLLEXPORT void VQSort(int16_t* HWY_RESTRICT keys, size_t n,
+ SortAscending);
+HWY_CONTRIB_DLLEXPORT void VQSort(int16_t* HWY_RESTRICT keys, size_t n,
+ SortDescending);
+HWY_CONTRIB_DLLEXPORT void VQSort(int32_t* HWY_RESTRICT keys, size_t n,
+ SortAscending);
+HWY_CONTRIB_DLLEXPORT void VQSort(int32_t* HWY_RESTRICT keys, size_t n,
+ SortDescending);
+HWY_CONTRIB_DLLEXPORT void VQSort(int64_t* HWY_RESTRICT keys, size_t n,
+ SortAscending);
+HWY_CONTRIB_DLLEXPORT void VQSort(int64_t* HWY_RESTRICT keys, size_t n,
+ SortDescending);
+HWY_CONTRIB_DLLEXPORT void VQSort(float* HWY_RESTRICT keys, size_t n,
+ SortAscending);
+HWY_CONTRIB_DLLEXPORT void VQSort(float* HWY_RESTRICT keys, size_t n,
+ SortDescending);
+HWY_CONTRIB_DLLEXPORT void VQSort(double* HWY_RESTRICT keys, size_t n,
+ SortAscending);
+HWY_CONTRIB_DLLEXPORT void VQSort(double* HWY_RESTRICT keys, size_t n,
+ SortDescending);
+HWY_CONTRIB_DLLEXPORT void VQSort(uint128_t* HWY_RESTRICT keys, size_t n,
+ SortAscending);
+HWY_CONTRIB_DLLEXPORT void VQSort(uint128_t* HWY_RESTRICT keys, size_t n,
+ SortDescending);
+HWY_CONTRIB_DLLEXPORT void VQSort(K64V64* HWY_RESTRICT keys, size_t n,
+ SortAscending);
+HWY_CONTRIB_DLLEXPORT void VQSort(K64V64* HWY_RESTRICT keys, size_t n,
+ SortDescending);
+HWY_CONTRIB_DLLEXPORT void VQSort(K32V32* HWY_RESTRICT keys, size_t n,
+ SortAscending);
+HWY_CONTRIB_DLLEXPORT void VQSort(K32V32* HWY_RESTRICT keys, size_t n,
+ SortDescending);
+
+// User-level caching is no longer required, so this class is no longer
+// beneficial. We recommend using the simpler VQSort() interface instead, and
+// retain this class only for compatibility. It now just calls VQSort.
+class HWY_CONTRIB_DLLEXPORT Sorter {
+ public:
+ Sorter() {}
+ ~Sorter() {}
+
+ // Move-only
+ Sorter(const Sorter&) = delete;
+ Sorter& operator=(const Sorter&) = delete;
+ Sorter(Sorter&& /*other*/) {}
+ Sorter& operator=(Sorter&& /*other*/) { return *this; }
+
+ void operator()(uint16_t* HWY_RESTRICT keys, size_t n,
+ SortAscending tag) const {
+ VQSort(keys, n, tag);
+ }
+ void operator()(uint16_t* HWY_RESTRICT keys, size_t n,
+ SortDescending tag) const {
+ VQSort(keys, n, tag);
+ }
+ void operator()(uint32_t* HWY_RESTRICT keys, size_t n,
+ SortAscending tag) const {
+ VQSort(keys, n, tag);
+ }
+ void operator()(uint32_t* HWY_RESTRICT keys, size_t n,
+ SortDescending tag) const {
+ VQSort(keys, n, tag);
+ }
+ void operator()(uint64_t* HWY_RESTRICT keys, size_t n,
+ SortAscending tag) const {
+ VQSort(keys, n, tag);
+ }
+ void operator()(uint64_t* HWY_RESTRICT keys, size_t n,
+ SortDescending tag) const {
+ VQSort(keys, n, tag);
+ }
+
+ void operator()(int16_t* HWY_RESTRICT keys, size_t n,
+ SortAscending tag) const {
+ VQSort(keys, n, tag);
+ }
+ void operator()(int16_t* HWY_RESTRICT keys, size_t n,
+ SortDescending tag) const {
+ VQSort(keys, n, tag);
+ }
+ void operator()(int32_t* HWY_RESTRICT keys, size_t n,
+ SortAscending tag) const {
+ VQSort(keys, n, tag);
+ }
+ void operator()(int32_t* HWY_RESTRICT keys, size_t n,
+ SortDescending tag) const {
+ VQSort(keys, n, tag);
+ }
+ void operator()(int64_t* HWY_RESTRICT keys, size_t n,
+ SortAscending tag) const {
+ VQSort(keys, n, tag);
+ }
+ void operator()(int64_t* HWY_RESTRICT keys, size_t n,
+ SortDescending tag) const {
+ VQSort(keys, n, tag);
+ }
+
+ void operator()(float* HWY_RESTRICT keys, size_t n, SortAscending tag) const {
+ VQSort(keys, n, tag);
+ }
+ void operator()(float* HWY_RESTRICT keys, size_t n,
+ SortDescending tag) const {
+ VQSort(keys, n, tag);
+ }
+ void operator()(double* HWY_RESTRICT keys, size_t n,
+ SortAscending tag) const {
+ VQSort(keys, n, tag);
+ }
+ void operator()(double* HWY_RESTRICT keys, size_t n,
+ SortDescending tag) const {
+ VQSort(keys, n, tag);
+ }
+
+ void operator()(uint128_t* HWY_RESTRICT keys, size_t n,
+ SortAscending tag) const {
+ VQSort(keys, n, tag);
+ }
+ void operator()(uint128_t* HWY_RESTRICT keys, size_t n,
+ SortDescending tag) const {
+ VQSort(keys, n, tag);
+ }
+
+ void operator()(K64V64* HWY_RESTRICT keys, size_t n,
+ SortAscending tag) const {
+ VQSort(keys, n, tag);
+ }
+ void operator()(K64V64* HWY_RESTRICT keys, size_t n,
+ SortDescending tag) const {
+ VQSort(keys, n, tag);
+ }
+
+ void operator()(K32V32* HWY_RESTRICT keys, size_t n,
+ SortAscending tag) const {
+ VQSort(keys, n, tag);
+ }
+ void operator()(K32V32* HWY_RESTRICT keys, size_t n,
+ SortDescending tag) const {
+ VQSort(keys, n, tag);
+ }
+
+ // Unused
+ static void Fill24Bytes(const void*, size_t, void*) {}
+ static bool HaveFloat64() { return false; }
+
+ private:
+ void Delete() {}
+
+ template <typename T>
+ T* Get() const {
+ return nullptr;
+ }
+
+#if HWY_COMPILER_CLANG
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wunused-private-field")
+#endif
+ void* unused_ = nullptr;
+#if HWY_COMPILER_CLANG
+ HWY_DIAGNOSTICS(pop)
+#endif
+};
+
+// Internal use only
+HWY_CONTRIB_DLLEXPORT uint64_t* GetGeneratorState();
+
+} // namespace hwy
+
+#endif // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_128a.cc b/third_party/highway/hwy/contrib/sort/vqsort_128a.cc
new file mode 100644
index 0000000000..9acd33375d
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_128a.cc
@@ -0,0 +1,59 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128a.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void Sort128Asc(uint64_t* HWY_RESTRICT keys, size_t num) {
+#if VQSORT_ENABLED
+ SortTag<uint64_t> d;
+ detail::SharedTraits<detail::Traits128<detail::OrderAscending128>> st;
+ Sort(d, st, keys, num);
+#else
+ (void)keys;
+ (void)num;
+ HWY_ASSERT(0);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
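+// HWY_EXPORT defines a table of per-target implementations; the
+// HWY_DYNAMIC_DISPATCH call below selects the best available one at runtime.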
+HWY_EXPORT(Sort128Asc);
+} // namespace
+
+void VQSort(uint128_t* HWY_RESTRICT keys, size_t n, SortAscending) {
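+  // Each 128-bit key occupies two u64 lanes, hence n * 2 lanes in total;
+  // Traits128 (see Sort128Asc above) treats each pair of lanes as one key.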
+ HWY_DYNAMIC_DISPATCH(Sort128Asc)
+ (reinterpret_cast<uint64_t*>(keys), n * 2);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_128d.cc b/third_party/highway/hwy/contrib/sort/vqsort_128d.cc
new file mode 100644
index 0000000000..633a1ef452
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_128d.cc
@@ -0,0 +1,59 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128d.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void Sort128Desc(uint64_t* HWY_RESTRICT keys, size_t num) {
+#if VQSORT_ENABLED
+ SortTag<uint64_t> d;
+ detail::SharedTraits<detail::Traits128<detail::OrderDescending128>> st;
+ Sort(d, st, keys, num);
+#else
+ (void)keys;
+ (void)num;
+ HWY_ASSERT(0);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(Sort128Desc);
+} // namespace
+
+void VQSort(uint128_t* HWY_RESTRICT keys, size_t n, SortDescending) {
+ HWY_DYNAMIC_DISPATCH(Sort128Desc)
+ (reinterpret_cast<uint64_t*>(keys), n * 2);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_f32a.cc b/third_party/highway/hwy/contrib/sort/vqsort_f32a.cc
new file mode 100644
index 0000000000..0018bcc580
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_f32a.cc
@@ -0,0 +1,52 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortF32Asc(float* HWY_RESTRICT keys, size_t num) {
+ SortTag<float> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<float>>> st;
+ Sort(d, st, keys, num);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortF32Asc);
+} // namespace
+
+void VQSort(float* HWY_RESTRICT keys, size_t n, SortAscending) {
+ HWY_DYNAMIC_DISPATCH(SortF32Asc)(keys, n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_f32d.cc b/third_party/highway/hwy/contrib/sort/vqsort_f32d.cc
new file mode 100644
index 0000000000..fb974c592d
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_f32d.cc
@@ -0,0 +1,52 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32d.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortF32Desc(float* HWY_RESTRICT keys, size_t num) {
+ SortTag<float> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<float>>> st;
+ Sort(d, st, keys, num);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortF32Desc);
+} // namespace
+
+void VQSort(float* HWY_RESTRICT keys, size_t n, SortDescending) {
+ HWY_DYNAMIC_DISPATCH(SortF32Desc)(keys, n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_f64a.cc b/third_party/highway/hwy/contrib/sort/vqsort_f64a.cc
new file mode 100644
index 0000000000..79c9712902
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_f64a.cc
@@ -0,0 +1,58 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64a.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortF64Asc(double* HWY_RESTRICT keys, size_t num) {
+#if HWY_HAVE_FLOAT64
+ SortTag<double> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<double>>> st;
+ Sort(d, st, keys, num);
+#else
+ (void)keys;
+ (void)num;
+ HWY_ASSERT(0);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortF64Asc);
+} // namespace
+
+void VQSort(double* HWY_RESTRICT keys, size_t n, SortAscending) {
+ HWY_DYNAMIC_DISPATCH(SortF64Asc)(keys, n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
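
Note: unlike the f32 path, the f64 body is compiled out where HWY_HAVE_FLOAT64 is 0 (for example, some 32-bit Arm targets), in which case the dispatched function asserts instead of sorting, as seen in the #else branch above. On targets with double support the call is the same one-liner; a sketch:

    #include <vector>
    #include "hwy/contrib/sort/vqsort.h"

    int main() {
      std::vector<double> keys = {2.5, -0.5, 1.0};
      hwy::VQSort(keys.data(), keys.size(), hwy::SortAscending());
      return 0;
    }
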
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_f64d.cc b/third_party/highway/hwy/contrib/sort/vqsort_f64d.cc
new file mode 100644
index 0000000000..922878c407
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_f64d.cc
@@ -0,0 +1,58 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64d.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortF64Desc(double* HWY_RESTRICT keys, size_t num) {
+#if HWY_HAVE_FLOAT64
+ SortTag<double> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<double>>> st;
+ Sort(d, st, keys, num);
+#else
+ (void)keys;
+ (void)num;
+ HWY_ASSERT(0);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortF64Desc);
+} // namespace
+
+void VQSort(double* HWY_RESTRICT keys, size_t n, SortDescending) {
+ HWY_DYNAMIC_DISPATCH(SortF64Desc)(keys, n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i16a.cc b/third_party/highway/hwy/contrib/sort/vqsort_i16a.cc
new file mode 100644
index 0000000000..809827fba9
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_i16a.cc
@@ -0,0 +1,52 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16a.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortI16Asc(int16_t* HWY_RESTRICT keys, size_t num) {
+ SortTag<int16_t> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int16_t>>> st;
+ Sort(d, st, keys, num);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortI16Asc);
+} // namespace
+
+void VQSort(int16_t* HWY_RESTRICT keys, size_t n, SortAscending) {
+ HWY_DYNAMIC_DISPATCH(SortI16Asc)(keys, n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
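
Note: the integer variants repeat this pattern verbatim per lane type. Conceptually, HWY_EXPORT plus HWY_DYNAMIC_DISPATCH amount to a per-function table of target-specific implementations with a one-time runtime choice; the sketch below is an illustrative analogy with hypothetical names, not Highway's actual macro expansion:

    // Illustrative analogy of the HWY_EXPORT / HWY_DYNAMIC_DISPATCH pair:
    // per-target implementations plus a one-time runtime selection.
    // All names below are hypothetical stand-ins.
    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <cstdlib>
    #include <vector>

    using SortFn = void (*)(int16_t*, size_t);

    // Stand-ins for the bodies foreach_target.h compiles per target.
    void SortI16AscScalar(int16_t* k, size_t n) { std::sort(k, k + n); }
    void SortI16AscSimd(int16_t* k, size_t n) { std::sort(k, k + n); }

    SortFn ChooseBest() {
      // Stand-in for a CPU-feature query.
      const bool have_simd = std::getenv("FAKE_SIMD") != nullptr;
      return have_simd ? SortI16AscSimd : SortI16AscScalar;
    }

    void DispatchSortI16Asc(int16_t* keys, size_t n) {
      static const SortFn best = ChooseBest();  // resolved once per process
      best(keys, n);
    }

    int main() {
      std::vector<int16_t> v = {3, 1, 2};
      DispatchSortI16Asc(v.data(), v.size());
      return 0;
    }
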
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i16d.cc b/third_party/highway/hwy/contrib/sort/vqsort_i16d.cc
new file mode 100644
index 0000000000..e168e3349c
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_i16d.cc
@@ -0,0 +1,52 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16d.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortI16Desc(int16_t* HWY_RESTRICT keys, size_t num) {
+ SortTag<int16_t> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int16_t>>> st;
+ Sort(d, st, keys, num);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortI16Desc);
+} // namespace
+
+void VQSort(int16_t* HWY_RESTRICT keys, size_t n, SortDescending) {
+ HWY_DYNAMIC_DISPATCH(SortI16Desc)(keys, n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i32a.cc b/third_party/highway/hwy/contrib/sort/vqsort_i32a.cc
new file mode 100644
index 0000000000..df8d7e622d
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_i32a.cc
@@ -0,0 +1,52 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32a.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortI32Asc(int32_t* HWY_RESTRICT keys, size_t num) {
+ SortTag<int32_t> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int32_t>>> st;
+ Sort(d, st, keys, num);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortI32Asc);
+} // namespace
+
+void VQSort(int32_t* HWY_RESTRICT keys, size_t n, SortAscending) {
+ HWY_DYNAMIC_DISPATCH(SortI32Asc)(keys, n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i32d.cc b/third_party/highway/hwy/contrib/sort/vqsort_i32d.cc
new file mode 100644
index 0000000000..5bf93e99c6
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_i32d.cc
@@ -0,0 +1,52 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32d.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortI32Desc(int32_t* HWY_RESTRICT keys, size_t num) {
+ SortTag<int32_t> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int32_t>>> st;
+ Sort(d, st, keys, num);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortI32Desc);
+} // namespace
+
+void VQSort(int32_t* HWY_RESTRICT keys, size_t n, SortDescending) {
+ HWY_DYNAMIC_DISPATCH(SortI32Desc)(keys, n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i64a.cc b/third_party/highway/hwy/contrib/sort/vqsort_i64a.cc
new file mode 100644
index 0000000000..fb8ae90a01
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_i64a.cc
@@ -0,0 +1,52 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64a.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortI64Asc(int64_t* HWY_RESTRICT keys, size_t num) {
+ SortTag<int64_t> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;
+ Sort(d, st, keys, num);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortI64Asc);
+} // namespace
+
+void VQSort(int64_t* HWY_RESTRICT keys, size_t n, SortAscending) {
+ HWY_DYNAMIC_DISPATCH(SortI64Asc)(keys, n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i64d.cc b/third_party/highway/hwy/contrib/sort/vqsort_i64d.cc
new file mode 100644
index 0000000000..8605f0e483
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_i64d.cc
@@ -0,0 +1,52 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64d.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortI64Desc(int64_t* HWY_RESTRICT keys, size_t num) {
+ SortTag<int64_t> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int64_t>>> st;
+ Sort(d, st, keys, num);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortI64Desc);
+} // namespace
+
+void VQSort(int64_t* HWY_RESTRICT keys, size_t n, SortDescending) {
+ HWY_DYNAMIC_DISPATCH(SortI64Desc)(keys, n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_kv128a.cc b/third_party/highway/hwy/contrib/sort/vqsort_kv128a.cc
new file mode 100644
index 0000000000..4c7f3f15e9
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_kv128a.cc
@@ -0,0 +1,62 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+// clang-format off
+// (avoid line break, which would prevent Copybara rules from matching)
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128a.cc" //NOLINT
+// clang-format on
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortKV128Asc(uint64_t* HWY_RESTRICT keys, size_t num) {
+#if VQSORT_ENABLED
+ SortTag<uint64_t> d;
+ detail::SharedTraits<detail::Traits128<detail::OrderAscendingKV128>> st;
+ Sort(d, st, keys, num);
+#else
+ (void)keys;
+ (void)num;
+ HWY_ASSERT(0);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortKV128Asc);
+} // namespace
+
+void VQSort(K64V64* HWY_RESTRICT keys, size_t n, SortAscending) {
+ HWY_DYNAMIC_DISPATCH(SortKV128Asc)
+ (reinterpret_cast<uint64_t*>(keys), n * 2);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
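
Note: key-value pairs reuse the 128-bit machinery: each K64V64 is reinterpreted as two uint64_t lanes (hence n * 2), and per hwy/base.h the value occupies the low half and the key the high half, so pairs are ordered by key with ties broken by value. A caller sketch under that layout assumption:

    #include <vector>
    #include "hwy/base.h"                  // hwy::K64V64
    #include "hwy/contrib/sort/vqsort.h"

    int main() {
      std::vector<hwy::K64V64> pairs(2);
      pairs[0].key = 7; pairs[0].value = 100;
      pairs[1].key = 3; pairs[1].value = 200;
      hwy::VQSort(pairs.data(), pairs.size(), hwy::SortAscending());
      // pairs[0] is now {key=3, value=200}.
      return 0;
    }
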
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_kv128d.cc b/third_party/highway/hwy/contrib/sort/vqsort_kv128d.cc
new file mode 100644
index 0000000000..7b91dd94d3
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_kv128d.cc
@@ -0,0 +1,62 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+// clang-format off
+// (avoid line break, which would prevent Copybara rules from matching)
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128d.cc" //NOLINT
+// clang-format on
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortKV128Desc(uint64_t* HWY_RESTRICT keys, size_t num) {
+#if VQSORT_ENABLED
+ SortTag<uint64_t> d;
+ detail::SharedTraits<detail::Traits128<detail::OrderDescendingKV128>> st;
+ Sort(d, st, keys, num);
+#else
+ (void)keys;
+ (void)num;
+ HWY_ASSERT(0);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortKV128Desc);
+} // namespace
+
+void VQSort(K64V64* HWY_RESTRICT keys, size_t n, SortDescending) {
+ HWY_DYNAMIC_DISPATCH(SortKV128Desc)
+ (reinterpret_cast<uint64_t*>(keys), n * 2);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_kv64a.cc b/third_party/highway/hwy/contrib/sort/vqsort_kv64a.cc
new file mode 100644
index 0000000000..dd6886aefa
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_kv64a.cc
@@ -0,0 +1,62 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+// clang-format off
+// (avoid line break, which would prevent Copybara rules from matching)
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64a.cc" //NOLINT
+// clang-format on
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortKV64Asc(uint64_t* HWY_RESTRICT keys, size_t num) {
+#if VQSORT_ENABLED
+ SortTag<uint64_t> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscendingKV64>> st;
+ Sort(d, st, keys, num);
+#else
+ (void)keys;
+ (void)num;
+ HWY_ASSERT(0);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortKV64Asc);
+} // namespace
+
+void VQSort(K32V32* HWY_RESTRICT keys, size_t n, SortAscending) {
+ HWY_DYNAMIC_DISPATCH(SortKV64Asc)
+ (reinterpret_cast<uint64_t*>(keys), n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
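
Note: in contrast to the 128-bit variants, one K32V32 pair packs into a single uint64_t lane (value in the low 32 bits, key in the high 32, per hwy/base.h), so the dispatch above passes n unchanged and plain 64-bit lane comparisons already order by key first. A sketch under that assumption:

    #include <vector>
    #include "hwy/base.h"                  // hwy::K32V32
    #include "hwy/contrib/sort/vqsort.h"

    int main() {
      std::vector<hwy::K32V32> pairs(2);
      pairs[0].key = 9u; pairs[0].value = 1u;
      pairs[1].key = 4u; pairs[1].value = 2u;
      hwy::VQSort(pairs.data(), pairs.size(), hwy::SortAscending());
      // pairs[0] is now {key=4, value=2}.
      return 0;
    }
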
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_kv64d.cc b/third_party/highway/hwy/contrib/sort/vqsort_kv64d.cc
new file mode 100644
index 0000000000..091492f065
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_kv64d.cc
@@ -0,0 +1,62 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+// clang-format off
+// (avoid line break, which would prevent Copybara rules from matching)
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64d.cc" //NOLINT
+// clang-format on
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortKV64Desc(uint64_t* HWY_RESTRICT keys, size_t num) {
+#if VQSORT_ENABLED
+ SortTag<uint64_t> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescendingKV64>> st;
+ Sort(d, st, keys, num);
+#else
+ (void)keys;
+ (void)num;
+ HWY_ASSERT(0);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortKV64Desc);
+} // namespace
+
+void VQSort(K32V32* HWY_RESTRICT keys, size_t n, SortDescending) {
+ HWY_DYNAMIC_DISPATCH(SortKV64Desc)
+ (reinterpret_cast<uint64_t*>(keys), n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u16a.cc b/third_party/highway/hwy/contrib/sort/vqsort_u16a.cc
new file mode 100644
index 0000000000..492cfd49b9
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_u16a.cc
@@ -0,0 +1,52 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16a.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU16Asc(uint16_t* HWY_RESTRICT keys, size_t num) {
+ SortTag<uint16_t> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint16_t>>> st;
+ Sort(d, st, keys, num);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU16Asc);
+} // namespace
+
+void VQSort(uint16_t* HWY_RESTRICT keys, size_t n, SortAscending) {
+ HWY_DYNAMIC_DISPATCH(SortU16Asc)(keys, n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u16d.cc b/third_party/highway/hwy/contrib/sort/vqsort_u16d.cc
new file mode 100644
index 0000000000..1e33220d3c
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_u16d.cc
@@ -0,0 +1,53 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16d.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU16Desc(uint16_t* HWY_RESTRICT keys, size_t num) {
+ SortTag<uint16_t> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint16_t>>>
+ st;
+ Sort(d, st, keys, num);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU16Desc);
+} // namespace
+
+void VQSort(uint16_t* HWY_RESTRICT keys, size_t n, SortDescending) {
+ HWY_DYNAMIC_DISPATCH(SortU16Desc)(keys, n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u32a.cc b/third_party/highway/hwy/contrib/sort/vqsort_u32a.cc
new file mode 100644
index 0000000000..f2be8753c6
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_u32a.cc
@@ -0,0 +1,52 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32a.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU32Asc(uint32_t* HWY_RESTRICT keys, size_t num) {
+ SortTag<uint32_t> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint32_t>>> st;
+ Sort(d, st, keys, num);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU32Asc);
+} // namespace
+
+void VQSort(uint32_t* HWY_RESTRICT keys, size_t n, SortAscending) {
+ HWY_DYNAMIC_DISPATCH(SortU32Asc)(keys, n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u32d.cc b/third_party/highway/hwy/contrib/sort/vqsort_u32d.cc
new file mode 100644
index 0000000000..0caf695689
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_u32d.cc
@@ -0,0 +1,53 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32d.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU32Desc(uint32_t* HWY_RESTRICT keys, size_t num) {
+ SortTag<uint32_t> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint32_t>>>
+ st;
+ Sort(d, st, keys, num);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU32Desc);
+} // namespace
+
+void VQSort(uint32_t* HWY_RESTRICT keys, size_t n, SortDescending) {
+ HWY_DYNAMIC_DISPATCH(SortU32Desc)(keys, n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u64a.cc b/third_party/highway/hwy/contrib/sort/vqsort_u64a.cc
new file mode 100644
index 0000000000..758f1f4c80
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_u64a.cc
@@ -0,0 +1,52 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64a.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU64Asc(uint64_t* HWY_RESTRICT keys, size_t num) {
+ SortTag<uint64_t> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint64_t>>> st;
+ Sort(d, st, keys, num);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU64Asc);
+} // namespace
+
+void VQSort(uint64_t* HWY_RESTRICT keys, size_t n, SortAscending) {
+ HWY_DYNAMIC_DISPATCH(SortU64Asc)(keys, n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u64d.cc b/third_party/highway/hwy/contrib/sort/vqsort_u64d.cc
new file mode 100644
index 0000000000..6c34fbed9a
--- /dev/null
+++ b/third_party/highway/hwy/contrib/sort/vqsort_u64d.cc
@@ -0,0 +1,53 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64d.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortU64Desc(uint64_t* HWY_RESTRICT keys, size_t num) {
+ SortTag<uint64_t> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint64_t>>>
+ st;
+ Sort(d, st, keys, num);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortU64Desc);
+} // namespace
+
+void VQSort(uint64_t* HWY_RESTRICT keys, size_t n, SortDescending) {
+ HWY_DYNAMIC_DISPATCH(SortU64Desc)(keys, n);
+}
+
+} // namespace hwy
+#endif // HWY_ONCE