diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 17:32:43 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 17:32:43 +0000 |
commit | 6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch) | |
tree | a68f146d7fa01f0134297619fbe7e33db084e0aa /third_party/highway/hwy/contrib/sort/shared-inl.h | |
parent | Initial commit. (diff) | |
download | thunderbird-upstream.tar.xz thunderbird-upstream.zip |
Adding upstream version 1:115.7.0.upstream/1%115.7.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/highway/hwy/contrib/sort/shared-inl.h')
-rw-r--r-- | third_party/highway/hwy/contrib/sort/shared-inl.h | 134 |
1 files changed, 134 insertions, 0 deletions
diff --git a/third_party/highway/hwy/contrib/sort/shared-inl.h b/third_party/highway/hwy/contrib/sort/shared-inl.h new file mode 100644 index 0000000000..735f95ee22 --- /dev/null +++ b/third_party/highway/hwy/contrib/sort/shared-inl.h @@ -0,0 +1,134 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Definitions shared between vqsort-inl and sorting_networks-inl. + +// Normal include guard for target-independent parts +#ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_ +#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_ + +#include "hwy/base.h" + +namespace hwy { + +// Internal constants - these are to avoid magic numbers/literals and cannot be +// changed without also changing the associated code. +struct SortConstants { +// SortingNetwork reshapes its input into a matrix. This is the maximum number +// of *keys* per vector. +#if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD + static constexpr size_t kMaxCols = 8; // avoid build timeout/stack overflow +#else + static constexpr size_t kMaxCols = 16; // enough for u32 in 512-bit vector +#endif + + // 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers, + // fitting within 16 AVX2 registers with only a few spills, keeping BaseCase + // code size reasonable (7 KiB for AVX-512 and 16 cols), and minimizing the + // extra logN factor for larger networks (for which only loose upper bounds + // on size are known). + static constexpr size_t kMaxRowsLog2 = 4; + static constexpr size_t kMaxRows = size_t{1} << kMaxRowsLog2; + + static constexpr HWY_INLINE size_t BaseCaseNum(size_t N) { + return kMaxRows * HWY_MIN(N, kMaxCols); + } + + // Unrolling is important (pipelining and amortizing branch mispredictions); + // 2x is sufficient to reach full memory bandwidth on SKX in Partition, but + // somewhat slower for sorting than 4x. + // + // To change, must also update left + 3 * N etc. in the loop. + static constexpr size_t kPartitionUnroll = 4; + + static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) { + // The main loop reads kPartitionUnroll vectors, and first loads from + // both left and right beforehand, so it requires min = 2 * + // kPartitionUnroll vectors. To handle smaller amounts (only guaranteed + // >= BaseCaseNum), we partition the right side into a buffer. We need + // another vector at the end so CompressStore does not overwrite anything. + return (2 * kPartitionUnroll + 1) * N; + } + + // Chunk := group of keys loaded for sampling a pivot. Matches the typical + // cache line size of 64 bytes to get maximum benefit per L2 miss. Sort() + // ensures vectors are no larger than that, so this can be independent of the + // vector size and thus constexpr. + static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t) { + return 64 / sizeof_t; + } + + static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) { + // 3 chunks of medians, 1 chunk of median medians plus two padding vectors. + return (3 + 1) * LanesPerChunk(sizeof_t) + 2 * N; + } + + template <typename T> + static constexpr HWY_INLINE size_t BufNum(size_t N) { + // One extra for padding plus another for full-vector loads. + return HWY_MAX(BaseCaseNum(N) + 2 * N, + HWY_MAX(PartitionBufNum(N), PivotBufNum(sizeof(T), N))); + } + + template <typename T> + static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) { + return sizeof(T) * BufNum<T>(vector_size / sizeof(T)); + } +}; + +} // namespace hwy + +#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_ + +// Per-target +#if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE +#undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE +#else +#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE +#endif + +#include "hwy/highway.h" + +// vqsort isn't available on HWY_SCALAR, and builds time out on MSVC opt and +// Arm v7 debug. +#undef VQSORT_ENABLED +#if (HWY_TARGET == HWY_SCALAR) || \ + (HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \ + (HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD) +#define VQSORT_ENABLED 0 +#else +#define VQSORT_ENABLED 1 +#endif + +namespace hwy { +namespace HWY_NAMESPACE { + +// Default tag / vector width selector. +#if HWY_TARGET == HWY_RVV +// Use LMUL = 1/2; for SEW=64 this ends up emulated via vsetvl. +template <typename T> +using SortTag = ScalableTag<T, -1>; +#else +template <typename T> +using SortTag = ScalableTag<T>; +#endif + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy + +#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE |