From 9e3c08db40b8916968b9f30096c7be3f00ce9647 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:44:51 +0200 Subject: Adding upstream version 1:115.7.0. Signed-off-by: Daniel Baumann --- third_party/highway/hwy/contrib/algo/copy-inl.h | 136 + third_party/highway/hwy/contrib/algo/copy_test.cc | 199 ++ third_party/highway/hwy/contrib/algo/find-inl.h | 109 + third_party/highway/hwy/contrib/algo/find_test.cc | 219 ++ .../highway/hwy/contrib/algo/transform-inl.h | 262 ++ .../highway/hwy/contrib/algo/transform_test.cc | 372 +++ .../highway/hwy/contrib/bit_pack/bit_pack-inl.h | 2599 ++++++++++++++++++++ .../highway/hwy/contrib/bit_pack/bit_pack_test.cc | 205 ++ third_party/highway/hwy/contrib/dot/dot-inl.h | 252 ++ third_party/highway/hwy/contrib/dot/dot_test.cc | 167 ++ third_party/highway/hwy/contrib/image/image.cc | 145 ++ third_party/highway/hwy/contrib/image/image.h | 470 ++++ .../highway/hwy/contrib/image/image_test.cc | 152 ++ third_party/highway/hwy/contrib/math/math-inl.h | 1242 ++++++++++ third_party/highway/hwy/contrib/math/math_test.cc | 228 ++ third_party/highway/hwy/contrib/sort/BUILD | 193 ++ third_party/highway/hwy/contrib/sort/README.md | 87 + third_party/highway/hwy/contrib/sort/algo-inl.h | 513 ++++ .../highway/hwy/contrib/sort/bench_parallel.cc | 238 ++ third_party/highway/hwy/contrib/sort/bench_sort.cc | 310 +++ .../highway/hwy/contrib/sort/print_network.cc | 191 ++ third_party/highway/hwy/contrib/sort/result-inl.h | 139 ++ third_party/highway/hwy/contrib/sort/shared-inl.h | 134 + third_party/highway/hwy/contrib/sort/sort_test.cc | 626 +++++ .../hwy/contrib/sort/sorting_networks-inl.h | 707 ++++++ third_party/highway/hwy/contrib/sort/traits-inl.h | 568 +++++ .../highway/hwy/contrib/sort/traits128-inl.h | 517 ++++ third_party/highway/hwy/contrib/sort/vqsort-inl.h | 1484 +++++++++++ third_party/highway/hwy/contrib/sort/vqsort.cc | 184 ++ third_party/highway/hwy/contrib/sort/vqsort.h | 108 + .../highway/hwy/contrib/sort/vqsort_128a.cc | 62 + .../highway/hwy/contrib/sort/vqsort_128d.cc | 62 + .../highway/hwy/contrib/sort/vqsort_f32a.cc | 53 + .../highway/hwy/contrib/sort/vqsort_f32d.cc | 54 + .../highway/hwy/contrib/sort/vqsort_f64a.cc | 61 + .../highway/hwy/contrib/sort/vqsort_f64d.cc | 61 + .../highway/hwy/contrib/sort/vqsort_i16a.cc | 54 + .../highway/hwy/contrib/sort/vqsort_i16d.cc | 54 + .../highway/hwy/contrib/sort/vqsort_i32a.cc | 54 + .../highway/hwy/contrib/sort/vqsort_i32d.cc | 54 + .../highway/hwy/contrib/sort/vqsort_i64a.cc | 54 + .../highway/hwy/contrib/sort/vqsort_i64d.cc | 54 + .../highway/hwy/contrib/sort/vqsort_kv128a.cc | 65 + .../highway/hwy/contrib/sort/vqsort_kv128d.cc | 65 + .../highway/hwy/contrib/sort/vqsort_kv64a.cc | 65 + .../highway/hwy/contrib/sort/vqsort_kv64d.cc | 65 + .../highway/hwy/contrib/sort/vqsort_u16a.cc | 54 + .../highway/hwy/contrib/sort/vqsort_u16d.cc | 55 + .../highway/hwy/contrib/sort/vqsort_u32a.cc | 54 + .../highway/hwy/contrib/sort/vqsort_u32d.cc | 55 + .../highway/hwy/contrib/sort/vqsort_u64a.cc | 54 + .../highway/hwy/contrib/sort/vqsort_u64d.cc | 55 + 52 files changed, 14020 insertions(+) create mode 100644 third_party/highway/hwy/contrib/algo/copy-inl.h create mode 100644 third_party/highway/hwy/contrib/algo/copy_test.cc create mode 100644 third_party/highway/hwy/contrib/algo/find-inl.h create mode 100644 third_party/highway/hwy/contrib/algo/find_test.cc create mode 100644 third_party/highway/hwy/contrib/algo/transform-inl.h create mode 100644 third_party/highway/hwy/contrib/algo/transform_test.cc create mode 100644 third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h create mode 100644 third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc create mode 100644 third_party/highway/hwy/contrib/dot/dot-inl.h create mode 100644 third_party/highway/hwy/contrib/dot/dot_test.cc create mode 100644 third_party/highway/hwy/contrib/image/image.cc create mode 100644 third_party/highway/hwy/contrib/image/image.h create mode 100644 third_party/highway/hwy/contrib/image/image_test.cc create mode 100644 third_party/highway/hwy/contrib/math/math-inl.h create mode 100644 third_party/highway/hwy/contrib/math/math_test.cc create mode 100644 third_party/highway/hwy/contrib/sort/BUILD create mode 100644 third_party/highway/hwy/contrib/sort/README.md create mode 100644 third_party/highway/hwy/contrib/sort/algo-inl.h create mode 100644 third_party/highway/hwy/contrib/sort/bench_parallel.cc create mode 100644 third_party/highway/hwy/contrib/sort/bench_sort.cc create mode 100644 third_party/highway/hwy/contrib/sort/print_network.cc create mode 100644 third_party/highway/hwy/contrib/sort/result-inl.h create mode 100644 third_party/highway/hwy/contrib/sort/shared-inl.h create mode 100644 third_party/highway/hwy/contrib/sort/sort_test.cc create mode 100644 third_party/highway/hwy/contrib/sort/sorting_networks-inl.h create mode 100644 third_party/highway/hwy/contrib/sort/traits-inl.h create mode 100644 third_party/highway/hwy/contrib/sort/traits128-inl.h create mode 100644 third_party/highway/hwy/contrib/sort/vqsort-inl.h create mode 100644 third_party/highway/hwy/contrib/sort/vqsort.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort.h create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_128a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_128d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_f32a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_f32d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_f64a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_f64d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_i16a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_i16d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_i32a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_i32d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_i64a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_i64d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_kv128a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_kv128d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_kv64a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_kv64d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_u16a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_u16d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_u32a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_u32d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_u64a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_u64d.cc (limited to 'third_party/highway/hwy/contrib') diff --git a/third_party/highway/hwy/contrib/algo/copy-inl.h b/third_party/highway/hwy/contrib/algo/copy-inl.h new file mode 100644 index 0000000000..033cf8a626 --- /dev/null +++ b/third_party/highway/hwy/contrib/algo/copy-inl.h @@ -0,0 +1,136 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target include guard +#if defined(HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ +#endif + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// These functions avoid having to write a loop plus remainder handling in the +// (unfortunately still common) case where arrays are not aligned/padded. If the +// inputs are known to be aligned/padded, it is more efficient to write a single +// loop using Load(). We do not provide a CopyAlignedPadded because it +// would be more verbose than such a loop. + +// Fills `to`[0, `count`) with `value`. +template > +void Fill(D d, T value, size_t count, T* HWY_RESTRICT to) { + const size_t N = Lanes(d); + const Vec v = Set(d, value); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + StoreU(v, d, to + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + SafeFillN(remaining, value, d, to + idx); +} + +// Copies `from`[0, `count`) to `to`, which must not overlap `from`. +template > +void Copy(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec v = LoadU(d, from + idx); + StoreU(v, d, to + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + SafeCopyN(remaining, d, from + idx, to + idx); +} + +// For idx in [0, count) in ascending order, appends `from[idx]` to `to` if the +// corresponding mask element of `func(d, v)` is true. Returns the STL-style end +// of the newly written elements in `to`. +// +// `func` is either a functor with a templated operator()(d, v) returning a +// mask, or a generic lambda if using C++14. Due to apparent limitations of +// Clang on Windows, it is currently necessary to add HWY_ATTR before the +// opening { of the lambda to avoid errors about "function .. requires target". +// +// NOTE: this is only supported for 16-, 32- or 64-bit types. +// NOTE: Func may be called a second time for elements it has already seen, but +// these elements will not be written to `to` again. +template > +T* CopyIf(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to, + const Func& func) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec v = LoadU(d, from + idx); + to += CompressBlendedStore(v, func(d, v), d, to); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return to; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + for (; idx < count; ++idx) { + using V1 = Vec; + // Workaround for -Waggressive-loop-optimizations on GCC 8 + // (iteration 2305843009213693951 invokes undefined behavior for T=i64) + const uintptr_t addr = reinterpret_cast(from); + const T* HWY_RESTRICT from_idx = + reinterpret_cast(addr + (idx * sizeof(T))); + const V1 v = LoadU(d1, from_idx); + // Avoid storing to `to` unless we know it should be kept - otherwise, we + // might overrun the end if it was allocated for the exact count. + if (CountTrue(d1, func(d1, v)) == 0) continue; + StoreU(v, d1, to); + to += 1; + } +#else + // Start index of the last unaligned whole vector, ending at the array end. + const size_t last = count - N; + // Number of elements before `from` or already written. + const size_t invalid = idx - last; + HWY_DASSERT(0 != invalid && invalid < N); + const Mask mask = Not(FirstN(d, invalid)); + const Vec v = MaskedLoad(mask, d, from + last); + to += CompressBlendedStore(v, And(mask, func(d, v)), d, to); +#endif + return to; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ diff --git a/third_party/highway/hwy/contrib/algo/copy_test.cc b/third_party/highway/hwy/contrib/algo/copy_test.cc new file mode 100644 index 0000000000..e2675a39d7 --- /dev/null +++ b/third_party/highway/hwy/contrib/algo/copy_test.cc @@ -0,0 +1,199 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/aligned_allocator.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/algo/copy_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +#include "hwy/contrib/algo/copy-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +// If your project requires C++14 or later, you can ignore this and pass lambdas +// directly to Transform, without requiring an lvalue as we do here for C++11. +#if __cplusplus < 201402L +#define HWY_GENERIC_LAMBDA 0 +#else +#define HWY_GENERIC_LAMBDA 1 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Returns random integer in [0, 128), which fits in any lane type. +template +T Random7Bit(RandomState& rng) { + return static_cast(Random32(&rng) & 127); +} + +// In C++14, we can instead define these as generic lambdas next to where they +// are invoked. +#if !HWY_GENERIC_LAMBDA + +struct IsOdd { + template + Mask operator()(D d, V v) const { + return TestBit(v, Set(d, TFromD{1})); + } +}; + +#endif // !HWY_GENERIC_LAMBDA + +// Invokes Test (e.g. TestCopyIf) with all arg combinations. T comes from +// ForFloatTypes. +template +struct ForeachCountAndMisalign { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) const { + RandomState rng; + const size_t N = Lanes(d); + const size_t misalignments[3] = {0, N / 4, 3 * N / 5}; + + for (size_t count = 0; count < 2 * N; ++count) { + for (size_t ma : misalignments) { + for (size_t mb : misalignments) { + Test()(d, count, ma, mb, rng); + } + } + } + } +}; + +struct TestFill { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD; + // HWY_MAX prevents error when misalign == count == 0. + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + T* expected = pa.get() + misalign_a; + const T value = Random7Bit(rng); + for (size_t i = 0; i < count; ++i) { + expected[i] = value; + } + AlignedFreeUniquePtr pb = AllocateAligned(misalign_b + count + 1); + T* actual = pb.get() + misalign_b; + + actual[count] = T{0}; // sentinel + Fill(d, value, count, actual); + HWY_ASSERT_EQ(T{0}, actual[count]); // did not write past end + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected, actual, count, target_name, + __FILE__, __LINE__); + } +}; + +void TestAllFill() { + ForAllTypes(ForPartialVectors>()); +} + +struct TestCopy { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD; + // Prevents error if size to allocate is zero. + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + T* a = pa.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random7Bit(rng); + } + AlignedFreeUniquePtr pb = + AllocateAligned(HWY_MAX(1, misalign_b + count)); + T* b = pb.get() + misalign_b; + + Copy(d, a, count, b); + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, a, b, count, target_name, __FILE__, + __LINE__); + } +}; + +void TestAllCopy() { + ForAllTypes(ForPartialVectors>()); +} + +struct TestCopyIf { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD; + // Prevents error if size to allocate is zero. + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + T* a = pa.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random7Bit(rng); + } + const size_t padding = Lanes(ScalableTag()); + AlignedFreeUniquePtr pb = + AllocateAligned(HWY_MAX(1, misalign_b + count + padding)); + T* b = pb.get() + misalign_b; + + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + size_t num_odd = 0; + for (size_t i = 0; i < count; ++i) { + if (a[i] & 1) { + expected[num_odd++] = a[i]; + } + } + +#if HWY_GENERIC_LAMBDA + const auto is_odd = [](const auto d, const auto v) HWY_ATTR { + return TestBit(v, Set(d, TFromD{1})); + }; +#else + const IsOdd is_odd; +#endif + T* end = CopyIf(d, a, count, b, is_odd); + const size_t num_written = static_cast(end - b); + HWY_ASSERT_EQ(num_odd, num_written); + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), b, num_odd, target_name, + __FILE__, __LINE__); + } +}; + +void TestAllCopyIf() { + ForUI163264(ForPartialVectors>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(CopyTest); +HWY_EXPORT_AND_TEST_P(CopyTest, TestAllFill); +HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopy); +HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopyIf); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/contrib/algo/find-inl.h b/third_party/highway/hwy/contrib/algo/find-inl.h new file mode 100644 index 0000000000..388842e988 --- /dev/null +++ b/third_party/highway/hwy/contrib/algo/find-inl.h @@ -0,0 +1,109 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target include guard +#if defined(HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_ +#endif + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Returns index of the first element equal to `value` in `in[0, count)`, or +// `count` if not found. +template > +size_t Find(D d, T value, const T* HWY_RESTRICT in, size_t count) { + const size_t N = Lanes(d); + const Vec broadcasted = Set(d, value); + + size_t i = 0; + for (; i + N <= count; i += N) { + const intptr_t pos = FindFirstTrue(d, Eq(broadcasted, LoadU(d, in + i))); + if (pos >= 0) return i + static_cast(pos); + } + + if (i != count) { +#if HWY_MEM_OPS_MIGHT_FAULT + // Scan single elements. + const CappedTag d1; + using V1 = Vec; + const V1 broadcasted1 = Set(d1, GetLane(broadcasted)); + for (; i < count; ++i) { + if (AllTrue(d1, Eq(broadcasted1, LoadU(d1, in + i)))) { + return i; + } + } +#else + const size_t remaining = count - i; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, in + i); + // Apply mask so that we don't 'find' the zero-padding from MaskedLoad. + const intptr_t pos = FindFirstTrue(d, And(Eq(broadcasted, v), mask)); + if (pos >= 0) return i + static_cast(pos); +#endif // HWY_MEM_OPS_MIGHT_FAULT + } + + return count; // not found +} + +// Returns index of the first element in `in[0, count)` for which `func(d, vec)` +// returns true, otherwise `count`. +template > +size_t FindIf(D d, const T* HWY_RESTRICT in, size_t count, const Func& func) { + const size_t N = Lanes(d); + + size_t i = 0; + for (; i + N <= count; i += N) { + const intptr_t pos = FindFirstTrue(d, func(d, LoadU(d, in + i))); + if (pos >= 0) return i + static_cast(pos); + } + + if (i != count) { +#if HWY_MEM_OPS_MIGHT_FAULT + // Scan single elements. + const CappedTag d1; + for (; i < count; ++i) { + if (AllTrue(d1, func(d1, LoadU(d1, in + i)))) { + return i; + } + } +#else + const size_t remaining = count - i; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, in + i); + // Apply mask so that we don't 'find' the zero-padding from MaskedLoad. + const intptr_t pos = FindFirstTrue(d, And(func(d, v), mask)); + if (pos >= 0) return i + static_cast(pos); +#endif // HWY_MEM_OPS_MIGHT_FAULT + } + + return count; // not found +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_ diff --git a/third_party/highway/hwy/contrib/algo/find_test.cc b/third_party/highway/hwy/contrib/algo/find_test.cc new file mode 100644 index 0000000000..f438a18ba0 --- /dev/null +++ b/third_party/highway/hwy/contrib/algo/find_test.cc @@ -0,0 +1,219 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include // std::find_if +#include + +#include "hwy/aligned_allocator.h" +#include "hwy/base.h" +#include "hwy/print.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/algo/find_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep + +#include "hwy/contrib/algo/find-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +// If your project requires C++14 or later, you can ignore this and pass lambdas +// directly to FindIf, without requiring an lvalue as we do here for C++11. +#if __cplusplus < 201402L +#define HWY_GENERIC_LAMBDA 0 +#else +#define HWY_GENERIC_LAMBDA 1 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Returns random number in [-8, 8) - we use knowledge of the range to Find() +// values we know are not present. +template +T Random(RandomState& rng) { + const int32_t bits = static_cast(Random32(&rng)) & 1023; + const double val = (bits - 512) / 64.0; + // Clamp negative to zero for unsigned types. + return static_cast(HWY_MAX(hwy::LowestValue(), val)); +} + +// In C++14, we can instead define these as generic lambdas next to where they +// are invoked. +#if !HWY_GENERIC_LAMBDA + +class GreaterThan { + public: + GreaterThan(int val) : val_(val) {} + template + Mask operator()(D d, V v) const { + return Gt(v, Set(d, static_cast>(val_))); + } + + private: + int val_; +}; + +#endif // !HWY_GENERIC_LAMBDA + +// Invokes Test (e.g. TestFind) with all arg combinations. +template +struct ForeachCountAndMisalign { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) const { + RandomState rng; + const size_t N = Lanes(d); + const size_t misalignments[3] = {0, N / 4, 3 * N / 5}; + + // Find() checks 8 vectors at a time, so we want to cover a fairly large + // range without oversampling (checking every possible count). + std::vector counts(AdjustedReps(512)); + for (size_t& count : counts) { + count = static_cast(rng()) % (16 * N + 1); + } + counts[0] = 0; // ensure we test count=0. + + for (size_t count : counts) { + for (size_t m : misalignments) { + Test()(d, count, m, rng); + } + } + } +}; + +struct TestFind { + template + void operator()(D d, size_t count, size_t misalign, RandomState& rng) { + using T = TFromD; + // Must allocate at least one even if count is zero. + AlignedFreeUniquePtr storage = + AllocateAligned(HWY_MAX(1, misalign + count)); + T* in = storage.get() + misalign; + for (size_t i = 0; i < count; ++i) { + in[i] = Random(rng); + } + + // For each position, search for that element (which we know is there) + for (size_t pos = 0; pos < count; ++pos) { + const size_t actual = Find(d, in[pos], in, count); + + // We may have found an earlier occurrence of the same value; ensure the + // value is the same, and that it is the first. + if (!IsEqual(in[pos], in[actual])) { + fprintf(stderr, "%s count %d, found %.15f at %d but wanted %.15f\n", + hwy::TypeName(T(), Lanes(d)).c_str(), static_cast(count), + static_cast(in[actual]), static_cast(actual), + static_cast(in[pos])); + HWY_ASSERT(false); + } + for (size_t i = 0; i < actual; ++i) { + if (IsEqual(in[i], in[pos])) { + fprintf(stderr, "%s count %d, found %f at %d but Find returned %d\n", + hwy::TypeName(T(), Lanes(d)).c_str(), static_cast(count), + static_cast(in[i]), static_cast(i), + static_cast(actual)); + HWY_ASSERT(false); + } + } + } + + // Also search for values we know not to be present (out of range) + HWY_ASSERT_EQ(count, Find(d, T{9}, in, count)); + HWY_ASSERT_EQ(count, Find(d, static_cast(-9), in, count)); + } +}; + +void TestAllFind() { + ForAllTypes(ForPartialVectors>()); +} + +struct TestFindIf { + template + void operator()(D d, size_t count, size_t misalign, RandomState& rng) { + using T = TFromD; + using TI = MakeSigned; + // Must allocate at least one even if count is zero. + AlignedFreeUniquePtr storage = + AllocateAligned(HWY_MAX(1, misalign + count)); + T* in = storage.get() + misalign; + for (size_t i = 0; i < count; ++i) { + in[i] = Random(rng); + HWY_ASSERT(in[i] < 8); + HWY_ASSERT(!hwy::IsSigned() || static_cast(in[i]) >= -8); + } + + bool found_any = false; + bool not_found_any = false; + + // unsigned T would be promoted to signed and compare greater than any + // negative val, whereas Set() would just cast to an unsigned value and the + // comparison remains unsigned, so avoid negative numbers there. + const int min_val = IsSigned() ? -9 : 0; + // Includes out-of-range value 9 to test the not-found path. + for (int val = min_val; val <= 9; ++val) { +#if HWY_GENERIC_LAMBDA + const auto greater = [val](const auto d, const auto v) HWY_ATTR { + return Gt(v, Set(d, static_cast(val))); + }; +#else + const GreaterThan greater(val); +#endif + const size_t actual = FindIf(d, in, count, greater); + found_any |= actual < count; + not_found_any |= actual == count; + + const auto pos = std::find_if( + in, in + count, [val](T x) { return x > static_cast(val); }); + // Convert returned iterator to index. + const size_t expected = static_cast(pos - in); + if (expected != actual) { + fprintf(stderr, "%s count %d val %d, expected %d actual %d\n", + hwy::TypeName(T(), Lanes(d)).c_str(), static_cast(count), + val, static_cast(expected), static_cast(actual)); + hwy::detail::PrintArray(hwy::detail::MakeTypeInfo(), "in", in, count, + 0, count); + HWY_ASSERT(false); + } + } + + // We will always not-find something due to val=9. + HWY_ASSERT(not_found_any); + // We'll find something unless the input is empty or {0} - because 0 > i + // is false for all i=[0,9]. + if (count != 0 && in[0] != 0) { + HWY_ASSERT(found_any); + } + } +}; + +void TestAllFindIf() { + ForAllTypes(ForPartialVectors>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(FindTest); +HWY_EXPORT_AND_TEST_P(FindTest, TestAllFind); +HWY_EXPORT_AND_TEST_P(FindTest, TestAllFindIf); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/contrib/algo/transform-inl.h b/third_party/highway/hwy/contrib/algo/transform-inl.h new file mode 100644 index 0000000000..3e830acb47 --- /dev/null +++ b/third_party/highway/hwy/contrib/algo/transform-inl.h @@ -0,0 +1,262 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target include guard +#if defined(HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_ +#endif + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// These functions avoid having to write a loop plus remainder handling in the +// (unfortunately still common) case where arrays are not aligned/padded. If the +// inputs are known to be aligned/padded, it is more efficient to write a single +// loop using Load(). We do not provide a TransformAlignedPadded because it +// would be more verbose than such a loop. +// +// Func is either a functor with a templated operator()(d, v[, v1[, v2]]), or a +// generic lambda if using C++14. Due to apparent limitations of Clang on +// Windows, it is currently necessary to add HWY_ATTR before the opening { of +// the lambda to avoid errors about "always_inline function .. requires target". +// +// If HWY_MEM_OPS_MIGHT_FAULT, we use scalar code instead of masking. Otherwise, +// we used `MaskedLoad` and `BlendedStore` to read/write the final partial +// vector. + +// Fills `out[0, count)` with the vectors returned by `func(d, index_vec)`, +// where `index_vec` is `Vec>`. On the first call to `func`, +// the value of its lane i is i, and increases by `Lanes(d)` after every call. +// Note that some of these indices may be `>= count`, but the elements that +// `func` returns in those lanes will not be written to `out`. +template > +void Generate(D d, T* HWY_RESTRICT out, size_t count, const Func& func) { + const RebindToUnsigned du; + using TU = TFromD; + const size_t N = Lanes(d); + + size_t idx = 0; + Vec vidx = Iota(du, 0); + for (; idx + N <= count; idx += N) { + StoreU(func(d, vidx), d, out + idx); + vidx = Add(vidx, Set(du, static_cast(N))); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + const RebindToUnsigned du1; + for (; idx < count; ++idx) { + StoreU(func(d1, Set(du1, static_cast(idx))), d1, out + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + BlendedStore(func(d, vidx), mask, d, out + idx); +#endif +} + +// Replaces `inout[idx]` with `func(d, inout[idx])`. Example usage: multiplying +// array elements by a constant. +template > +void Transform(D d, T* HWY_RESTRICT inout, size_t count, const Func& func) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec v = LoadU(d, inout + idx); + StoreU(func(d, v), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + for (; idx < count; ++idx) { + using V1 = Vec; + const V1 v = LoadU(d1, inout + idx); + StoreU(func(d1, v), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, inout + idx); + BlendedStore(func(d, v), mask, d, inout + idx); +#endif +} + +// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx])`. Example usage: +// multiplying array elements by those of another array. +template > +void Transform1(D d, T* HWY_RESTRICT inout, size_t count, + const T* HWY_RESTRICT in1, const Func& func) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec v = LoadU(d, inout + idx); + const Vec v1 = LoadU(d, in1 + idx); + StoreU(func(d, v, v1), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + for (; idx < count; ++idx) { + using V1 = Vec; + const V1 v = LoadU(d1, inout + idx); + const V1 v1 = LoadU(d1, in1 + idx); + StoreU(func(d1, v, v1), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, inout + idx); + const Vec v1 = MaskedLoad(mask, d, in1 + idx); + BlendedStore(func(d, v, v1), mask, d, inout + idx); +#endif +} + +// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx], in2[idx])`. Example +// usage: FMA of elements from three arrays, stored into the first array. +template > +void Transform2(D d, T* HWY_RESTRICT inout, size_t count, + const T* HWY_RESTRICT in1, const T* HWY_RESTRICT in2, + const Func& func) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec v = LoadU(d, inout + idx); + const Vec v1 = LoadU(d, in1 + idx); + const Vec v2 = LoadU(d, in2 + idx); + StoreU(func(d, v, v1, v2), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + for (; idx < count; ++idx) { + using V1 = Vec; + const V1 v = LoadU(d1, inout + idx); + const V1 v1 = LoadU(d1, in1 + idx); + const V1 v2 = LoadU(d1, in2 + idx); + StoreU(func(d1, v, v1, v2), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, inout + idx); + const Vec v1 = MaskedLoad(mask, d, in1 + idx); + const Vec v2 = MaskedLoad(mask, d, in2 + idx); + BlendedStore(func(d, v, v1, v2), mask, d, inout + idx); +#endif +} + +template > +void Replace(D d, T* HWY_RESTRICT inout, size_t count, T new_t, T old_t) { + const size_t N = Lanes(d); + const Vec old_v = Set(d, old_t); + const Vec new_v = Set(d, new_t); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + Vec v = LoadU(d, inout + idx); + StoreU(IfThenElse(Eq(v, old_v), new_v, v), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + const Vec old_v1 = Set(d1, old_t); + const Vec new_v1 = Set(d1, new_t); + for (; idx < count; ++idx) { + using V1 = Vec; + const V1 v1 = LoadU(d1, inout + idx); + StoreU(IfThenElse(Eq(v1, old_v1), new_v1, v1), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, inout + idx); + BlendedStore(IfThenElse(Eq(v, old_v), new_v, v), mask, d, inout + idx); +#endif +} + +template > +void ReplaceIf(D d, T* HWY_RESTRICT inout, size_t count, T new_t, + const Func& func) { + const size_t N = Lanes(d); + const Vec new_v = Set(d, new_t); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + Vec v = LoadU(d, inout + idx); + StoreU(IfThenElse(func(d, v), new_v, v), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + const Vec new_v1 = Set(d1, new_t); + for (; idx < count; ++idx) { + using V1 = Vec; + const V1 v = LoadU(d1, inout + idx); + StoreU(IfThenElse(func(d1, v), new_v1, v), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, inout + idx); + BlendedStore(IfThenElse(func(d, v), new_v, v), mask, d, inout + idx); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_ diff --git a/third_party/highway/hwy/contrib/algo/transform_test.cc b/third_party/highway/hwy/contrib/algo/transform_test.cc new file mode 100644 index 0000000000..335607ccfb --- /dev/null +++ b/third_party/highway/hwy/contrib/algo/transform_test.cc @@ -0,0 +1,372 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include // memcpy + +#include "hwy/aligned_allocator.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/algo/transform_test.cc" //NOLINT +#include "hwy/foreach_target.h" // IWYU pragma: keep + +#include "hwy/contrib/algo/transform-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +// If your project requires C++14 or later, you can ignore this and pass lambdas +// directly to Transform, without requiring an lvalue as we do here for C++11. +#if __cplusplus < 201402L +#define HWY_GENERIC_LAMBDA 0 +#else +#define HWY_GENERIC_LAMBDA 1 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template +T Alpha() { + return static_cast(1.5); // arbitrary scalar +} + +// Returns random floating-point number in [-8, 8) to ensure computations do +// not exceed float32 precision. +template +T Random(RandomState& rng) { + const int32_t bits = static_cast(Random32(&rng)) & 1023; + const double val = (bits - 512) / 64.0; + // Clamp negative to zero for unsigned types. + return static_cast(HWY_MAX(hwy::LowestValue(), val)); +} + +// SCAL, AXPY names are from BLAS. +template +HWY_NOINLINE void SimpleSCAL(const T* x, T* out, size_t count) { + for (size_t i = 0; i < count; ++i) { + out[i] = Alpha() * x[i]; + } +} + +template +HWY_NOINLINE void SimpleAXPY(const T* x, const T* y, T* out, size_t count) { + for (size_t i = 0; i < count; ++i) { + out[i] = Alpha() * x[i] + y[i]; + } +} + +template +HWY_NOINLINE void SimpleFMA4(const T* x, const T* y, const T* z, T* out, + size_t count) { + for (size_t i = 0; i < count; ++i) { + out[i] = x[i] * y[i] + z[i]; + } +} + +// In C++14, we can instead define these as generic lambdas next to where they +// are invoked. +#if !HWY_GENERIC_LAMBDA + +// Generator that returns even numbers by doubling the output indices. +struct Gen2 { + template + Vec operator()(D d, VU vidx) const { + return BitCast(d, Add(vidx, vidx)); + } +}; + +struct SCAL { + template + Vec operator()(D d, V v) const { + using T = TFromD; + return Mul(Set(d, Alpha()), v); + } +}; + +struct AXPY { + template + Vec operator()(D d, V v, V v1) const { + using T = TFromD; + return MulAdd(Set(d, Alpha()), v, v1); + } +}; + +struct FMA4 { + template + Vec operator()(D /*d*/, V v, V v1, V v2) const { + return MulAdd(v, v1, v2); + } +}; + +#endif // !HWY_GENERIC_LAMBDA + +// Invokes Test (e.g. TestTransform1) with all arg combinations. T comes from +// ForFloatTypes. +template +struct ForeachCountAndMisalign { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) const { + RandomState rng; + const size_t N = Lanes(d); + const size_t misalignments[3] = {0, N / 4, 3 * N / 5}; + + for (size_t count = 0; count < 2 * N; ++count) { + for (size_t ma : misalignments) { + for (size_t mb : misalignments) { + Test()(d, count, ma, mb, rng); + } + } + } + } +}; + +// Output-only, no loads +struct TestGenerate { + template + void operator()(D d, size_t count, size_t misalign_a, size_t /*misalign_b*/, + RandomState& /*rng*/) { + using T = TFromD; + AlignedFreeUniquePtr pa = AllocateAligned(misalign_a + count + 1); + T* actual = pa.get() + misalign_a; + + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + for (size_t i = 0; i < count; ++i) { + expected[i] = static_cast(2 * i); + } + + // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that + // the attribute also applies to lambdas? If so, remove HWY_ATTR. +#if HWY_GENERIC_LAMBDA + const auto gen2 = [](const auto d, const auto vidx) + HWY_ATTR { return BitCast(d, Add(vidx, vidx)); }; +#else + const Gen2 gen2; +#endif + actual[count] = T{0}; // sentinel + Generate(d, actual, count, gen2); + HWY_ASSERT_EQ(T{0}, actual[count]); // did not write past end + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), actual, count, + target_name, __FILE__, __LINE__); + } +}; + +// Zero extra input arrays +struct TestTransform { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + if (misalign_b != 0) return; + using T = TFromD; + // Prevents error if size to allocate is zero. + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + T* a = pa.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random(rng); + } + + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + SimpleSCAL(a, expected.get(), count); + + // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that + // the attribute also applies to lambdas? If so, remove HWY_ATTR. +#if HWY_GENERIC_LAMBDA + const auto scal = [](const auto d, const auto v) + HWY_ATTR { return Mul(Set(d, Alpha()), v); }; +#else + const SCAL scal; +#endif + Transform(d, a, count, scal); + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name, + __FILE__, __LINE__); + } +}; + +// One extra input array +struct TestTransform1 { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD; + // Prevents error if size to allocate is zero. + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr pb = + AllocateAligned(HWY_MAX(1, misalign_b + count)); + T* a = pa.get() + misalign_a; + T* b = pb.get() + misalign_b; + for (size_t i = 0; i < count; ++i) { + a[i] = Random(rng); + b[i] = Random(rng); + } + + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + SimpleAXPY(a, b, expected.get(), count); + +#if HWY_GENERIC_LAMBDA + const auto axpy = [](const auto d, const auto v, const auto v1) HWY_ATTR { + return MulAdd(Set(d, Alpha()), v, v1); + }; +#else + const AXPY axpy; +#endif + Transform1(d, a, count, b, axpy); + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name, + __FILE__, __LINE__); + } +}; + +// Two extra input arrays +struct TestTransform2 { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD; + // Prevents error if size to allocate is zero. + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr pb = + AllocateAligned(HWY_MAX(1, misalign_b + count)); + AlignedFreeUniquePtr pc = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + T* a = pa.get() + misalign_a; + T* b = pb.get() + misalign_b; + T* c = pc.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random(rng); + b[i] = Random(rng); + c[i] = Random(rng); + } + + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + SimpleFMA4(a, b, c, expected.get(), count); + +#if HWY_GENERIC_LAMBDA + const auto fma4 = [](auto /*d*/, auto v, auto v1, auto v2) + HWY_ATTR { return MulAdd(v, v1, v2); }; +#else + const FMA4 fma4; +#endif + Transform2(d, a, count, b, c, fma4); + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name, + __FILE__, __LINE__); + } +}; + +template +class IfEq { + public: + IfEq(T val) : val_(val) {} + + template + Mask operator()(D d, V v) const { + return Eq(v, Set(d, val_)); + } + + private: + T val_; +}; + +struct TestReplace { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + if (misalign_b != 0) return; + if (count == 0) return; + using T = TFromD; + AlignedFreeUniquePtr pa = AllocateAligned(misalign_a + count); + T* a = pa.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random(rng); + } + AlignedFreeUniquePtr pb = AllocateAligned(count); + + AlignedFreeUniquePtr expected = AllocateAligned(count); + + std::vector positions(AdjustedReps(count)); + for (size_t& pos : positions) { + pos = static_cast(rng()) % count; + } + + for (size_t pos = 0; pos < count; ++pos) { + const T old_t = a[pos]; + const T new_t = Random(rng); + for (size_t i = 0; i < count; ++i) { + expected[i] = IsEqual(a[i], old_t) ? new_t : a[i]; + } + + // Copy so ReplaceIf gets the same input (and thus also outputs expected) + memcpy(pb.get(), a, count * sizeof(T)); + + Replace(d, a, count, new_t, old_t); + HWY_ASSERT_ARRAY_EQ(expected.get(), a, count); + + ReplaceIf(d, pb.get(), count, new_t, IfEq(old_t)); + HWY_ASSERT_ARRAY_EQ(expected.get(), pb.get(), count); + } + } +}; + +void TestAllGenerate() { + // The test BitCast-s the indices, which does not work for floats. + ForIntegerTypes(ForPartialVectors>()); +} + +void TestAllTransform() { + ForFloatTypes(ForPartialVectors>()); +} + +void TestAllTransform1() { + ForFloatTypes(ForPartialVectors>()); +} + +void TestAllTransform2() { + ForFloatTypes(ForPartialVectors>()); +} + +void TestAllReplace() { + ForFloatTypes(ForPartialVectors>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(TransformTest); +HWY_EXPORT_AND_TEST_P(TransformTest, TestAllGenerate); +HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform); +HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform1); +HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform2); +HWY_EXPORT_AND_TEST_P(TransformTest, TestAllReplace); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h b/third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h new file mode 100644 index 0000000000..04d015453b --- /dev/null +++ b/third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h @@ -0,0 +1,2599 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target include guard +#if defined(HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_ +#endif + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// The entry points are class templates specialized below for each number of +// bits. Each provides Pack and Unpack member functions which load (Pack) or +// store (Unpack) B raw vectors, and store (Pack) or load (Unpack) a number of +// packed vectors equal to kBits. B denotes the bits per lane: 8 for Pack8, 16 +// for Pack16, which is also the upper bound for kBits. +template // <= 8 +struct Pack8 {}; +template // <= 16 +struct Pack16 {}; + +template <> +struct Pack8<1> { + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + // 16-bit shifts avoid masking (bits will not cross 8-bit lanes). + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + const VU16 packed = + Xor3(Or(ShiftLeft<7>(raw7), ShiftLeft<6>(raw6)), + Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3)), + Xor3(ShiftLeft<2>(raw2), ShiftLeft<1>(raw1), raw0)); + StoreU(BitCast(d8, packed), d8, packed_out); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 mask = Set(d16, 0x0101u); // LSB in each byte + + const VU16 packed = BitCast(d16, LoadU(d8, packed_in)); + + const VU16 raw0 = And(packed, mask); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(ShiftRight<1>(packed), mask); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(ShiftRight<2>(packed), mask); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw3 = And(ShiftRight<3>(packed), mask); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + + const VU16 raw4 = And(ShiftRight<4>(packed), mask); + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + + const VU16 raw5 = And(ShiftRight<5>(packed), mask); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + + const VU16 raw6 = And(ShiftRight<6>(packed), mask); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + + const VU16 raw7 = And(ShiftRight<7>(packed), mask); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<1> + +template <> +struct Pack8<2> { + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + // 16-bit shifts avoid masking (bits will not cross 8-bit lanes). + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + const VU16 packed0 = Xor3(ShiftLeft<6>(raw6), ShiftLeft<4>(raw4), + Or(ShiftLeft<2>(raw2), raw0)); + const VU16 packed1 = Xor3(ShiftLeft<6>(raw7), ShiftLeft<4>(raw5), + Or(ShiftLeft<2>(raw3), raw1)); + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 mask = Set(d16, 0x0303u); // Lowest 2 bits per byte + + const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + + const VU16 raw0 = And(packed0, mask); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(packed1, mask); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(ShiftRight<2>(packed0), mask); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw3 = And(ShiftRight<2>(packed1), mask); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + + const VU16 raw4 = And(ShiftRight<4>(packed0), mask); + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + + const VU16 raw5 = And(ShiftRight<4>(packed1), mask); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + + const VU16 raw6 = And(ShiftRight<6>(packed0), mask); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + + const VU16 raw7 = And(ShiftRight<6>(packed1), mask); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<2> + +template <> +struct Pack8<3> { + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + // The upper two bits of these three will be filled with packed3 (6 bits). + VU16 packed0 = Or(ShiftLeft<3>(raw4), raw0); + VU16 packed1 = Or(ShiftLeft<3>(raw5), raw1); + VU16 packed2 = Or(ShiftLeft<3>(raw6), raw2); + const VU16 packed3 = Or(ShiftLeft<3>(raw7), raw3); + + const VU16 hi2 = Set(d16, 0xC0C0u); + packed0 = OrAnd(packed0, ShiftLeft<2>(packed3), hi2); + packed1 = OrAnd(packed1, ShiftLeft<4>(packed3), hi2); + packed2 = OrAnd(packed2, ShiftLeft<6>(packed3), hi2); + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 mask = Set(d16, 0x0707u); // Lowest 3 bits per byte + + const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8)); + + const VU16 raw0 = And(packed0, mask); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(packed1, mask); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(packed2, mask); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw4 = And(ShiftRight<3>(packed0), mask); + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + + const VU16 raw5 = And(ShiftRight<3>(packed1), mask); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + + const VU16 raw6 = And(ShiftRight<3>(packed2), mask); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + + // raw73 is the concatenation of the upper two bits in packed0..2. + const VU16 hi2 = Set(d16, 0xC0C0u); + const VU16 raw73 = Xor3(ShiftRight<6>(And(packed2, hi2)), // + ShiftRight<4>(And(packed1, hi2)), + ShiftRight<2>(And(packed0, hi2))); + + const VU16 raw3 = And(mask, raw73); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + + const VU16 raw7 = And(mask, ShiftRight<3>(raw73)); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<3> + +template <> +struct Pack8<4> { + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + // 16-bit shifts avoid masking (bits will not cross 8-bit lanes). + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + const VU16 packed0 = Or(ShiftLeft<4>(raw2), raw0); + const VU16 packed1 = Or(ShiftLeft<4>(raw3), raw1); + const VU16 packed2 = Or(ShiftLeft<4>(raw6), raw4); + const VU16 packed3 = Or(ShiftLeft<4>(raw7), raw5); + + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8); + StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 mask = Set(d16, 0x0F0Fu); // Lowest 4 bits per byte + + const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8)); + const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8)); + + const VU16 raw0 = And(packed0, mask); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(packed1, mask); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(ShiftRight<4>(packed0), mask); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw3 = And(ShiftRight<4>(packed1), mask); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + + const VU16 raw4 = And(packed2, mask); + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + + const VU16 raw5 = And(packed3, mask); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + + const VU16 raw6 = And(ShiftRight<4>(packed2), mask); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + + const VU16 raw7 = And(ShiftRight<4>(packed3), mask); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<4> + +template <> +struct Pack8<5> { + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + // Fill upper three bits with upper bits from raw4..7. + const VU16 hi3 = Set(d16, 0xE0E0u); + const VU16 packed0 = OrAnd(raw0, ShiftLeft<3>(raw4), hi3); + const VU16 packed1 = OrAnd(raw1, ShiftLeft<3>(raw5), hi3); + const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw6), hi3); + const VU16 packed3 = OrAnd(raw3, ShiftLeft<3>(raw7), hi3); + + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8); + StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8); + + // Combine lower two bits of raw4..7 into packed4. + const VU16 lo2 = Set(d16, 0x0303u); + const VU16 packed4 = Or(And(raw4, lo2), Xor3(ShiftLeft<2>(And(raw5, lo2)), + ShiftLeft<4>(And(raw6, lo2)), + ShiftLeft<6>(And(raw7, lo2)))); + StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + + const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8)); + const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8)); + const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8)); + + const VU16 mask = Set(d16, 0x1F1Fu); // Lowest 5 bits per byte + + const VU16 raw0 = And(packed0, mask); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(packed1, mask); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(packed2, mask); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw3 = And(packed3, mask); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + + // The upper bits are the top 3 bits shifted right by three. + const VU16 top4 = ShiftRight<3>(AndNot(mask, packed0)); + const VU16 top5 = ShiftRight<3>(AndNot(mask, packed1)); + const VU16 top6 = ShiftRight<3>(AndNot(mask, packed2)); + const VU16 top7 = ShiftRight<3>(AndNot(mask, packed3)); + + // Insert the lower 2 bits, which were concatenated into a byte. + const VU16 lo2 = Set(d16, 0x0303u); + const VU16 raw4 = OrAnd(top4, lo2, packed4); + const VU16 raw5 = OrAnd(top5, lo2, ShiftRight<2>(packed4)); + const VU16 raw6 = OrAnd(top6, lo2, ShiftRight<4>(packed4)); + const VU16 raw7 = OrAnd(top7, lo2, ShiftRight<6>(packed4)); + + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<5> + +template <> +struct Pack8<6> { + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + const VU16 hi2 = Set(d16, 0xC0C0u); + // Each triplet of these stores raw3/raw7 (6 bits) in the upper 2 bits. + const VU16 packed0 = OrAnd(raw0, ShiftLeft<2>(raw3), hi2); + const VU16 packed1 = OrAnd(raw1, ShiftLeft<4>(raw3), hi2); + const VU16 packed2 = OrAnd(raw2, ShiftLeft<6>(raw3), hi2); + const VU16 packed3 = OrAnd(raw4, ShiftLeft<2>(raw7), hi2); + const VU16 packed4 = OrAnd(raw5, ShiftLeft<4>(raw7), hi2); + const VU16 packed5 = OrAnd(raw6, ShiftLeft<6>(raw7), hi2); + + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8); + StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8); + StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8); + StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 mask = Set(d16, 0x3F3Fu); // Lowest 6 bits per byte + + const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8)); + const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8)); + const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8)); + const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8)); + + const VU16 raw0 = And(packed0, mask); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(packed1, mask); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(packed2, mask); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw4 = And(packed3, mask); + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + + const VU16 raw5 = And(packed4, mask); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + + const VU16 raw6 = And(packed5, mask); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + + // raw3/7 are the concatenation of the upper two bits in packed0..2. + const VU16 raw3 = Xor3(ShiftRight<6>(AndNot(mask, packed2)), + ShiftRight<4>(AndNot(mask, packed1)), + ShiftRight<2>(AndNot(mask, packed0))); + const VU16 raw7 = Xor3(ShiftRight<6>(AndNot(mask, packed5)), + ShiftRight<4>(AndNot(mask, packed4)), + ShiftRight<2>(AndNot(mask, packed3))); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<6> + +template <> +struct Pack8<7> { + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + // Inserted into top bit of packed0..6. + const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + const VU16 hi1 = Set(d16, 0x8080u); + const VU16 packed0 = OrAnd(raw0, Add(raw7, raw7), hi1); + const VU16 packed1 = OrAnd(raw1, ShiftLeft<2>(raw7), hi1); + const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw7), hi1); + const VU16 packed3 = OrAnd(raw3, ShiftLeft<4>(raw7), hi1); + const VU16 packed4 = OrAnd(raw4, ShiftLeft<5>(raw7), hi1); + const VU16 packed5 = OrAnd(raw5, ShiftLeft<6>(raw7), hi1); + const VU16 packed6 = OrAnd(raw6, ShiftLeft<7>(raw7), hi1); + + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8); + StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8); + StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8); + StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8); + StoreU(BitCast(d8, packed6), d8, packed_out + 6 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + + const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8)); + const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8)); + const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8)); + const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8)); + const VU16 packed6 = BitCast(d16, LoadU(d8, packed_in + 6 * N8)); + + const VU16 mask = Set(d16, 0x7F7Fu); // Lowest 7 bits per byte + + const VU16 raw0 = And(packed0, mask); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(packed1, mask); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(packed2, mask); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw3 = And(packed3, mask); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + + const VU16 raw4 = And(packed4, mask); + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + + const VU16 raw5 = And(packed5, mask); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + + const VU16 raw6 = And(packed6, mask); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + + const VU16 p0 = Xor3(ShiftRight<7>(AndNot(mask, packed6)), + ShiftRight<6>(AndNot(mask, packed5)), + ShiftRight<5>(AndNot(mask, packed4))); + const VU16 p1 = Xor3(ShiftRight<4>(AndNot(mask, packed3)), + ShiftRight<3>(AndNot(mask, packed2)), + ShiftRight<2>(AndNot(mask, packed1))); + const VU16 raw7 = Xor3(ShiftRight<1>(AndNot(mask, packed0)), p0, p1); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<7> + +template <> +struct Pack8<8> { + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) const { + using VU8 = Vec; + const size_t N8 = Lanes(d8); + const VU8 raw0 = LoadU(d8, raw + 0 * N8); + const VU8 raw1 = LoadU(d8, raw + 1 * N8); + const VU8 raw2 = LoadU(d8, raw + 2 * N8); + const VU8 raw3 = LoadU(d8, raw + 3 * N8); + const VU8 raw4 = LoadU(d8, raw + 4 * N8); + const VU8 raw5 = LoadU(d8, raw + 5 * N8); + const VU8 raw6 = LoadU(d8, raw + 6 * N8); + const VU8 raw7 = LoadU(d8, raw + 7 * N8); + + StoreU(raw0, d8, packed_out + 0 * N8); + StoreU(raw1, d8, packed_out + 1 * N8); + StoreU(raw2, d8, packed_out + 2 * N8); + StoreU(raw3, d8, packed_out + 3 * N8); + StoreU(raw4, d8, packed_out + 4 * N8); + StoreU(raw5, d8, packed_out + 5 * N8); + StoreU(raw6, d8, packed_out + 6 * N8); + StoreU(raw7, d8, packed_out + 7 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) const { + using VU8 = Vec; + const size_t N8 = Lanes(d8); + const VU8 raw0 = LoadU(d8, packed_in + 0 * N8); + const VU8 raw1 = LoadU(d8, packed_in + 1 * N8); + const VU8 raw2 = LoadU(d8, packed_in + 2 * N8); + const VU8 raw3 = LoadU(d8, packed_in + 3 * N8); + const VU8 raw4 = LoadU(d8, packed_in + 4 * N8); + const VU8 raw5 = LoadU(d8, packed_in + 5 * N8); + const VU8 raw6 = LoadU(d8, packed_in + 6 * N8); + const VU8 raw7 = LoadU(d8, packed_in + 7 * N8); + + StoreU(raw0, d8, raw + 0 * N8); + StoreU(raw1, d8, raw + 1 * N8); + StoreU(raw2, d8, raw + 2 * N8); + StoreU(raw3, d8, raw + 3 * N8); + StoreU(raw4, d8, raw + 4 * N8); + StoreU(raw5, d8, raw + 5 * N8); + StoreU(raw6, d8, raw + 6 * N8); + StoreU(raw7, d8, raw + 7 * N8); + } +}; // Pack8<8> + +template <> +struct Pack16<1> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + const VU16 p0 = Xor3(ShiftLeft<2>(raw2), Add(raw1, raw1), raw0); + const VU16 p1 = + Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3)); + const VU16 p2 = + Xor3(ShiftLeft<8>(raw8), ShiftLeft<7>(raw7), ShiftLeft<6>(raw6)); + const VU16 p3 = + Xor3(ShiftLeft<0xB>(rawB), ShiftLeft<0xA>(rawA), ShiftLeft<9>(raw9)); + const VU16 p4 = + Xor3(ShiftLeft<0xE>(rawE), ShiftLeft<0xD>(rawD), ShiftLeft<0xC>(rawC)); + const VU16 packed = + Or(Xor3(ShiftLeft<0xF>(rawF), p0, p1), Xor3(p2, p3, p4)); + StoreU(packed, d, packed_out); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 mask = Set(d, 1u); // Lowest bit + + const VU16 packed = LoadU(d, packed_in); + + const VU16 raw0 = And(packed, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(ShiftRight<1>(packed), mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(ShiftRight<2>(packed), mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(ShiftRight<3>(packed), mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(ShiftRight<4>(packed), mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(ShiftRight<5>(packed), mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(ShiftRight<6>(packed), mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(ShiftRight<7>(packed), mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(ShiftRight<8>(packed), mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(ShiftRight<9>(packed), mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(ShiftRight<0xA>(packed), mask); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(ShiftRight<0xB>(packed), mask); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(ShiftRight<0xC>(packed), mask); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(ShiftRight<0xD>(packed), mask); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = And(ShiftRight<0xE>(packed), mask); + StoreU(rawE, d, raw + 0xE * N); + + const VU16 rawF = ShiftRight<0xF>(packed); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<1> + +template <> +struct Pack16<2> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + VU16 packed0 = Xor3(ShiftLeft<4>(raw4), ShiftLeft<2>(raw2), raw0); + VU16 packed1 = Xor3(ShiftLeft<4>(raw5), ShiftLeft<2>(raw3), raw1); + packed0 = Xor3(packed0, ShiftLeft<8>(raw8), ShiftLeft<6>(raw6)); + packed1 = Xor3(packed1, ShiftLeft<8>(raw9), ShiftLeft<6>(raw7)); + + packed0 = Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<10>(rawA)); + packed1 = Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<10>(rawB)); + + packed0 = Or(packed0, ShiftLeft<14>(rawE)); + packed1 = Or(packed1, ShiftLeft<14>(rawF)); + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 mask = Set(d, 0x3u); // Lowest 2 bits + + const VU16 packed0 = LoadU(d, packed_in + 0 * N); + const VU16 packed1 = LoadU(d, packed_in + 1 * N); + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(ShiftRight<2>(packed0), mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(ShiftRight<2>(packed1), mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(ShiftRight<4>(packed0), mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(ShiftRight<4>(packed1), mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(ShiftRight<6>(packed0), mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(ShiftRight<6>(packed1), mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(ShiftRight<8>(packed0), mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(ShiftRight<8>(packed1), mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(ShiftRight<0xA>(packed0), mask); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(ShiftRight<0xA>(packed1), mask); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(ShiftRight<0xC>(packed0), mask); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(ShiftRight<0xC>(packed1), mask); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = ShiftRight<0xE>(packed0); + StoreU(rawE, d, raw + 0xE * N); + + const VU16 rawF = ShiftRight<0xE>(packed1); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<2> + +template <> +struct Pack16<3> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // We can fit 15 raw vectors in three packed vectors (five each). + VU16 packed0 = Xor3(ShiftLeft<6>(raw6), ShiftLeft<3>(raw3), raw0); + VU16 packed1 = Xor3(ShiftLeft<6>(raw7), ShiftLeft<3>(raw4), raw1); + VU16 packed2 = Xor3(ShiftLeft<6>(raw8), ShiftLeft<3>(raw5), raw2); + + // rawF will be scattered into the upper bit of these three. + packed0 = Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<9>(raw9)); + packed1 = Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<9>(rawA)); + packed2 = Xor3(packed2, ShiftLeft<12>(rawE), ShiftLeft<9>(rawB)); + + const VU16 hi1 = Set(d, 0x8000u); + packed0 = Or(packed0, ShiftLeft<15>(rawF)); // MSB only, no mask + packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1); + packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1); + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 mask = Set(d, 0x7u); // Lowest 3 bits + + const VU16 packed0 = LoadU(d, packed_in + 0 * N); + const VU16 packed1 = LoadU(d, packed_in + 1 * N); + const VU16 packed2 = LoadU(d, packed_in + 2 * N); + + const VU16 raw0 = And(mask, packed0); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(mask, packed1); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(mask, packed2); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(mask, ShiftRight<3>(packed0)); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(mask, ShiftRight<3>(packed1)); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(mask, ShiftRight<3>(packed2)); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(mask, ShiftRight<6>(packed0)); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(mask, ShiftRight<6>(packed1)); + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(mask, ShiftRight<6>(packed2)); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(mask, ShiftRight<9>(packed0)); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(mask, ShiftRight<9>(packed1)); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(mask, ShiftRight<9>(packed2)); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(mask, ShiftRight<12>(packed0)); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(mask, ShiftRight<12>(packed1)); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = And(mask, ShiftRight<12>(packed2)); + StoreU(rawE, d, raw + 0xE * N); + + // rawF is the concatenation of the upper bit of packed0..2. + const VU16 down0 = ShiftRight<15>(packed0); + const VU16 down1 = ShiftRight<15>(packed1); + const VU16 down2 = ShiftRight<15>(packed2); + const VU16 rawF = Xor3(ShiftLeft<2>(down2), Add(down1, down1), down0); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<3> + +template <> +struct Pack16<4> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + VU16 packed0 = Xor3(ShiftLeft<8>(raw4), ShiftLeft<4>(raw2), raw0); + VU16 packed1 = Xor3(ShiftLeft<8>(raw5), ShiftLeft<4>(raw3), raw1); + packed0 = Or(packed0, ShiftLeft<12>(raw6)); + packed1 = Or(packed1, ShiftLeft<12>(raw7)); + VU16 packed2 = Xor3(ShiftLeft<8>(rawC), ShiftLeft<4>(rawA), raw8); + VU16 packed3 = Xor3(ShiftLeft<8>(rawD), ShiftLeft<4>(rawB), raw9); + packed2 = Or(packed2, ShiftLeft<12>(rawE)); + packed3 = Or(packed3, ShiftLeft<12>(rawF)); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 mask = Set(d, 0xFu); // Lowest 4 bits + + const VU16 packed0 = LoadU(d, packed_in + 0 * N); + const VU16 packed1 = LoadU(d, packed_in + 1 * N); + const VU16 packed2 = LoadU(d, packed_in + 2 * N); + const VU16 packed3 = LoadU(d, packed_in + 3 * N); + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(ShiftRight<4>(packed0), mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(ShiftRight<4>(packed1), mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(ShiftRight<8>(packed0), mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(ShiftRight<8>(packed1), mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = ShiftRight<12>(packed0); // no mask required + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = ShiftRight<12>(packed1); // no mask required + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(packed2, mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(packed3, mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(ShiftRight<4>(packed2), mask); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(ShiftRight<4>(packed3), mask); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(ShiftRight<8>(packed2), mask); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(ShiftRight<8>(packed3), mask); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = ShiftRight<12>(packed2); // no mask required + StoreU(rawE, d, raw + 0xE * N); + + const VU16 rawF = ShiftRight<12>(packed3); // no mask required + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<4> + +template <> +struct Pack16<5> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // We can fit 15 raw vectors in five packed vectors (three each). + VU16 packed0 = Xor3(ShiftLeft<10>(rawA), ShiftLeft<5>(raw5), raw0); + VU16 packed1 = Xor3(ShiftLeft<10>(rawB), ShiftLeft<5>(raw6), raw1); + VU16 packed2 = Xor3(ShiftLeft<10>(rawC), ShiftLeft<5>(raw7), raw2); + VU16 packed3 = Xor3(ShiftLeft<10>(rawD), ShiftLeft<5>(raw8), raw3); + VU16 packed4 = Xor3(ShiftLeft<10>(rawE), ShiftLeft<5>(raw9), raw4); + + // rawF will be scattered into the upper bits of these five. + const VU16 hi1 = Set(d, 0x8000u); + packed0 = Or(packed0, ShiftLeft<15>(rawF)); // MSB only, no mask + packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1); + packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1); + packed3 = OrAnd(packed3, ShiftLeft<12>(rawF), hi1); + packed4 = OrAnd(packed4, ShiftLeft<11>(rawF), hi1); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = LoadU(d, packed_in + 0 * N); + const VU16 packed1 = LoadU(d, packed_in + 1 * N); + const VU16 packed2 = LoadU(d, packed_in + 2 * N); + const VU16 packed3 = LoadU(d, packed_in + 3 * N); + const VU16 packed4 = LoadU(d, packed_in + 4 * N); + + const VU16 mask = Set(d, 0x1Fu); // Lowest 5 bits + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed3, mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed4, mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(ShiftRight<5>(packed0), mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(ShiftRight<5>(packed1), mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(ShiftRight<5>(packed2), mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(ShiftRight<5>(packed3), mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(ShiftRight<5>(packed4), mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(ShiftRight<10>(packed0), mask); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(ShiftRight<10>(packed1), mask); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(ShiftRight<10>(packed2), mask); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(ShiftRight<10>(packed3), mask); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = And(ShiftRight<10>(packed4), mask); + StoreU(rawE, d, raw + 0xE * N); + + // rawF is the concatenation of the lower bit of packed0..4. + const VU16 down0 = ShiftRight<15>(packed0); + const VU16 down1 = ShiftRight<15>(packed1); + const VU16 hi1 = Set(d, 0x8000u); + const VU16 p0 = + Xor3(ShiftRight<13>(And(packed2, hi1)), Add(down1, down1), down0); + const VU16 rawF = Xor3(ShiftRight<11>(And(packed4, hi1)), + ShiftRight<12>(And(packed3, hi1)), p0); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<5> + +template <> +struct Pack16<6> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + const VU16 packed3 = Or(ShiftLeft<6>(raw7), raw3); + const VU16 packed7 = Or(ShiftLeft<6>(rawF), rawB); + // Three vectors, two 6-bit raw each; packed3 (12 bits) is spread over the + // four remainder bits at the top of each vector. + const VU16 packed0 = Xor3(ShiftLeft<12>(packed3), ShiftLeft<6>(raw4), raw0); + VU16 packed1 = Or(ShiftLeft<6>(raw5), raw1); + VU16 packed2 = Or(ShiftLeft<6>(raw6), raw2); + const VU16 packed4 = Xor3(ShiftLeft<12>(packed7), ShiftLeft<6>(rawC), raw8); + VU16 packed5 = Or(ShiftLeft<6>(rawD), raw9); + VU16 packed6 = Or(ShiftLeft<6>(rawE), rawA); + + const VU16 hi4 = Set(d, 0xF000u); + packed1 = OrAnd(packed1, ShiftLeft<8>(packed3), hi4); + packed2 = OrAnd(packed2, ShiftLeft<4>(packed3), hi4); + packed5 = OrAnd(packed5, ShiftLeft<8>(packed7), hi4); + packed6 = OrAnd(packed6, ShiftLeft<4>(packed7), hi4); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed4, d, packed_out + 3 * N); + StoreU(packed5, d, packed_out + 4 * N); + StoreU(packed6, d, packed_out + 5 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 mask = Set(d, 0x3Fu); // Lowest 6 bits + + const VU16 packed0 = LoadU(d, packed_in + 0 * N); + const VU16 packed1 = LoadU(d, packed_in + 1 * N); + const VU16 packed2 = LoadU(d, packed_in + 2 * N); + const VU16 packed4 = LoadU(d, packed_in + 3 * N); + const VU16 packed5 = LoadU(d, packed_in + 4 * N); + const VU16 packed6 = LoadU(d, packed_in + 5 * N); + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw4 = And(ShiftRight<6>(packed0), mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(ShiftRight<6>(packed1), mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(ShiftRight<6>(packed2), mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw8 = And(packed4, mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(packed5, mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(packed6, mask); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawC = And(ShiftRight<6>(packed4), mask); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(ShiftRight<6>(packed5), mask); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = And(ShiftRight<6>(packed6), mask); + StoreU(rawE, d, raw + 0xE * N); + + // packed3 is the concatenation of the four upper bits in packed0..2. + const VU16 down0 = ShiftRight<12>(packed0); + const VU16 down4 = ShiftRight<12>(packed4); + const VU16 hi4 = Set(d, 0xF000u); + const VU16 packed3 = Xor3(ShiftRight<4>(And(packed2, hi4)), + ShiftRight<8>(And(packed1, hi4)), down0); + const VU16 packed7 = Xor3(ShiftRight<4>(And(packed6, hi4)), + ShiftRight<8>(And(packed5, hi4)), down4); + const VU16 raw3 = And(packed3, mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 rawB = And(packed7, mask); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 raw7 = ShiftRight<6>(packed3); // upper bits already zero + StoreU(raw7, d, raw + 7 * N); + + const VU16 rawF = ShiftRight<6>(packed7); // upper bits already zero + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<6> + +template <> +struct Pack16<7> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + const VU16 packed7 = Or(ShiftLeft<7>(rawF), raw7); + // Seven vectors, two 7-bit raw each; packed7 (14 bits) is spread over the + // two remainder bits at the top of each vector. + const VU16 packed0 = Xor3(ShiftLeft<14>(packed7), ShiftLeft<7>(raw8), raw0); + VU16 packed1 = Or(ShiftLeft<7>(raw9), raw1); + VU16 packed2 = Or(ShiftLeft<7>(rawA), raw2); + VU16 packed3 = Or(ShiftLeft<7>(rawB), raw3); + VU16 packed4 = Or(ShiftLeft<7>(rawC), raw4); + VU16 packed5 = Or(ShiftLeft<7>(rawD), raw5); + VU16 packed6 = Or(ShiftLeft<7>(rawE), raw6); + + const VU16 hi2 = Set(d, 0xC000u); + packed1 = OrAnd(packed1, ShiftLeft<12>(packed7), hi2); + packed2 = OrAnd(packed2, ShiftLeft<10>(packed7), hi2); + packed3 = OrAnd(packed3, ShiftLeft<8>(packed7), hi2); + packed4 = OrAnd(packed4, ShiftLeft<6>(packed7), hi2); + packed5 = OrAnd(packed5, ShiftLeft<4>(packed7), hi2); + packed6 = OrAnd(packed6, ShiftLeft<2>(packed7), hi2); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + + const VU16 mask = Set(d, 0x7Fu); // Lowest 7 bits + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed3, mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed4, mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed5, mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed6, mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw8 = And(ShiftRight<7>(packed0), mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(ShiftRight<7>(packed1), mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(ShiftRight<7>(packed2), mask); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(ShiftRight<7>(packed3), mask); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(ShiftRight<7>(packed4), mask); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(ShiftRight<7>(packed5), mask); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = And(ShiftRight<7>(packed6), mask); + StoreU(rawE, d, raw + 0xE * N); + + // packed7 is the concatenation of the two upper bits in packed0..6. + const VU16 down0 = ShiftRight<14>(packed0); + const VU16 hi2 = Set(d, 0xC000u); + const VU16 p0 = Xor3(ShiftRight<12>(And(packed1, hi2)), + ShiftRight<10>(And(packed2, hi2)), down0); + const VU16 p1 = Xor3(ShiftRight<8>(And(packed3, hi2)), // + ShiftRight<6>(And(packed4, hi2)), + ShiftRight<4>(And(packed5, hi2))); + const VU16 packed7 = Xor3(ShiftRight<2>(And(packed6, hi2)), p1, p0); + + const VU16 raw7 = And(packed7, mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 rawF = ShiftRight<7>(packed7); // upper bits already zero + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<7> + +template <> +struct Pack16<8> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // This is equivalent to ConcatEven with 8-bit lanes, but much more + // efficient on RVV and slightly less efficient on SVE2. + const VU16 packed0 = Or(ShiftLeft<8>(raw2), raw0); + const VU16 packed1 = Or(ShiftLeft<8>(raw3), raw1); + const VU16 packed2 = Or(ShiftLeft<8>(raw6), raw4); + const VU16 packed3 = Or(ShiftLeft<8>(raw7), raw5); + const VU16 packed4 = Or(ShiftLeft<8>(rawA), raw8); + const VU16 packed5 = Or(ShiftLeft<8>(rawB), raw9); + const VU16 packed6 = Or(ShiftLeft<8>(rawE), rawC); + const VU16 packed7 = Or(ShiftLeft<8>(rawF), rawD); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + StoreU(packed7, d, packed_out + 7 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 mask = Set(d, 0xFFu); // Lowest 8 bits + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = ShiftRight<8>(packed0); // upper bits already zero + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = ShiftRight<8>(packed1); // upper bits already zero + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed2, mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed3, mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = ShiftRight<8>(packed2); // upper bits already zero + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = ShiftRight<8>(packed3); // upper bits already zero + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(packed4, mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(packed5, mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = ShiftRight<8>(packed4); // upper bits already zero + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = ShiftRight<8>(packed5); // upper bits already zero + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(packed6, mask); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(packed7, mask); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = ShiftRight<8>(packed6); // upper bits already zero + StoreU(rawE, d, raw + 0xE * N); + + const VU16 rawF = ShiftRight<8>(packed7); // upper bits already zero + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<8> + +template <> +struct Pack16<9> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + // 8 vectors, each with 9+7 bits; top 2 bits are concatenated into packed8. + const VU16 packed0 = Or(ShiftLeft<9>(raw8), raw0); + const VU16 packed1 = Or(ShiftLeft<9>(raw9), raw1); + const VU16 packed2 = Or(ShiftLeft<9>(rawA), raw2); + const VU16 packed3 = Or(ShiftLeft<9>(rawB), raw3); + const VU16 packed4 = Or(ShiftLeft<9>(rawC), raw4); + const VU16 packed5 = Or(ShiftLeft<9>(rawD), raw5); + const VU16 packed6 = Or(ShiftLeft<9>(rawE), raw6); + const VU16 packed7 = Or(ShiftLeft<9>(rawF), raw7); + + // We could shift down, OR and shift up, but two shifts are typically more + // expensive than AND, shift into position, and OR (which can be further + // reduced via Xor3). + const VU16 mid2 = Set(d, 0x180u); // top 2 in lower 9 + const VU16 part8 = ShiftRight<7>(And(raw8, mid2)); + const VU16 part9 = ShiftRight<5>(And(raw9, mid2)); + const VU16 partA = ShiftRight<3>(And(rawA, mid2)); + const VU16 partB = ShiftRight<1>(And(rawB, mid2)); + const VU16 partC = ShiftLeft<1>(And(rawC, mid2)); + const VU16 partD = ShiftLeft<3>(And(rawD, mid2)); + const VU16 partE = ShiftLeft<5>(And(rawE, mid2)); + const VU16 partF = ShiftLeft<7>(And(rawF, mid2)); + const VU16 packed8 = Xor3(Xor3(part8, part9, partA), + Xor3(partB, partC, partD), Or(partE, partF)); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + StoreU(packed7, d, packed_out + 7 * N); + StoreU(packed8, d, packed_out + 8 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N)); + + const VU16 mask = Set(d, 0x1FFu); // Lowest 9 bits + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed3, mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed4, mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed5, mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed6, mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(packed7, mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 mid2 = Set(d, 0x180u); // top 2 in lower 9 + const VU16 raw8 = + OrAnd(ShiftRight<9>(packed0), ShiftLeft<7>(packed8), mid2); + const VU16 raw9 = + OrAnd(ShiftRight<9>(packed1), ShiftLeft<5>(packed8), mid2); + const VU16 rawA = + OrAnd(ShiftRight<9>(packed2), ShiftLeft<3>(packed8), mid2); + const VU16 rawB = + OrAnd(ShiftRight<9>(packed3), ShiftLeft<1>(packed8), mid2); + const VU16 rawC = + OrAnd(ShiftRight<9>(packed4), ShiftRight<1>(packed8), mid2); + const VU16 rawD = + OrAnd(ShiftRight<9>(packed5), ShiftRight<3>(packed8), mid2); + const VU16 rawE = + OrAnd(ShiftRight<9>(packed6), ShiftRight<5>(packed8), mid2); + const VU16 rawF = + OrAnd(ShiftRight<9>(packed7), ShiftRight<7>(packed8), mid2); + + StoreU(raw8, d, raw + 8 * N); + StoreU(raw9, d, raw + 9 * N); + StoreU(rawA, d, raw + 0xA * N); + StoreU(rawB, d, raw + 0xB * N); + StoreU(rawC, d, raw + 0xC * N); + StoreU(rawD, d, raw + 0xD * N); + StoreU(rawE, d, raw + 0xE * N); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<9> + +template <> +struct Pack16<10> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // 8 vectors, each with 10+6 bits; top 4 bits are concatenated into + // packed8 and packed9. + const VU16 packed0 = Or(ShiftLeft<10>(raw8), raw0); + const VU16 packed1 = Or(ShiftLeft<10>(raw9), raw1); + const VU16 packed2 = Or(ShiftLeft<10>(rawA), raw2); + const VU16 packed3 = Or(ShiftLeft<10>(rawB), raw3); + const VU16 packed4 = Or(ShiftLeft<10>(rawC), raw4); + const VU16 packed5 = Or(ShiftLeft<10>(rawD), raw5); + const VU16 packed6 = Or(ShiftLeft<10>(rawE), raw6); + const VU16 packed7 = Or(ShiftLeft<10>(rawF), raw7); + + // We could shift down, OR and shift up, but two shifts are typically more + // expensive than AND, shift into position, and OR (which can be further + // reduced via Xor3). + const VU16 mid4 = Set(d, 0x3C0u); // top 4 in lower 10 + const VU16 part8 = ShiftRight<6>(And(raw8, mid4)); + const VU16 part9 = ShiftRight<2>(And(raw9, mid4)); + const VU16 partA = ShiftLeft<2>(And(rawA, mid4)); + const VU16 partB = ShiftLeft<6>(And(rawB, mid4)); + const VU16 partC = ShiftRight<6>(And(rawC, mid4)); + const VU16 partD = ShiftRight<2>(And(rawD, mid4)); + const VU16 partE = ShiftLeft<2>(And(rawE, mid4)); + const VU16 partF = ShiftLeft<6>(And(rawF, mid4)); + const VU16 packed8 = Or(Xor3(part8, part9, partA), partB); + const VU16 packed9 = Or(Xor3(partC, partD, partE), partF); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + StoreU(packed7, d, packed_out + 7 * N); + StoreU(packed8, d, packed_out + 8 * N); + StoreU(packed9, d, packed_out + 9 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N)); + const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N)); + + const VU16 mask = Set(d, 0x3FFu); // Lowest 10 bits + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed3, mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed4, mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed5, mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed6, mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(packed7, mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 mid4 = Set(d, 0x3C0u); // top 4 in lower 10 + const VU16 raw8 = + OrAnd(ShiftRight<10>(packed0), ShiftLeft<6>(packed8), mid4); + const VU16 raw9 = + OrAnd(ShiftRight<10>(packed1), ShiftLeft<2>(packed8), mid4); + const VU16 rawA = + OrAnd(ShiftRight<10>(packed2), ShiftRight<2>(packed8), mid4); + const VU16 rawB = + OrAnd(ShiftRight<10>(packed3), ShiftRight<6>(packed8), mid4); + const VU16 rawC = + OrAnd(ShiftRight<10>(packed4), ShiftLeft<6>(packed9), mid4); + const VU16 rawD = + OrAnd(ShiftRight<10>(packed5), ShiftLeft<2>(packed9), mid4); + const VU16 rawE = + OrAnd(ShiftRight<10>(packed6), ShiftRight<2>(packed9), mid4); + const VU16 rawF = + OrAnd(ShiftRight<10>(packed7), ShiftRight<6>(packed9), mid4); + + StoreU(raw8, d, raw + 8 * N); + StoreU(raw9, d, raw + 9 * N); + StoreU(rawA, d, raw + 0xA * N); + StoreU(rawB, d, raw + 0xB * N); + StoreU(rawC, d, raw + 0xC * N); + StoreU(rawD, d, raw + 0xD * N); + StoreU(rawE, d, raw + 0xE * N); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<10> + +template <> +struct Pack16<11> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // It is not obvious what the optimal partitioning looks like. To reduce the + // number of constants, we want to minimize the number of distinct bit + // lengths. 11+5 also requires 6-bit remnants with 4-bit leftovers. + // 8+3 seems better: it is easier to scatter 3 bits into the MSBs. + const VU16 lo8 = Set(d, 0xFFu); + + // Lower 8 bits of all raw + const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8); + const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8); + const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8); + const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8); + const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8); + const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8); + const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8); + const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + StoreU(packed7, d, packed_out + 7 * N); + + // Three vectors, five 3bit remnants each, plus one 3bit in their MSB. + const VU16 top0 = ShiftRight<8>(raw0); + const VU16 top1 = ShiftRight<8>(raw1); + const VU16 top2 = ShiftRight<8>(raw2); + // Insert top raw bits into 3-bit groups within packed8..A. Moving the + // mask along avoids masking each of raw0..E and enables OrAnd. + VU16 next = Set(d, 0x38u); // 0x7 << 3 + VU16 packed8 = OrAnd(top0, ShiftRight<5>(raw3), next); + VU16 packed9 = OrAnd(top1, ShiftRight<5>(raw4), next); + VU16 packedA = OrAnd(top2, ShiftRight<5>(raw5), next); + next = ShiftLeft<3>(next); + packed8 = OrAnd(packed8, ShiftRight<2>(raw6), next); + packed9 = OrAnd(packed9, ShiftRight<2>(raw7), next); + packedA = OrAnd(packedA, ShiftRight<2>(raw8), next); + next = ShiftLeft<3>(next); + packed8 = OrAnd(packed8, Add(raw9, raw9), next); + packed9 = OrAnd(packed9, Add(rawA, rawA), next); + packedA = OrAnd(packedA, Add(rawB, rawB), next); + next = ShiftLeft<3>(next); + packed8 = OrAnd(packed8, ShiftLeft<4>(rawC), next); + packed9 = OrAnd(packed9, ShiftLeft<4>(rawD), next); + packedA = OrAnd(packedA, ShiftLeft<4>(rawE), next); + + // Scatter upper 3 bits of rawF into the upper bits. + next = ShiftLeft<3>(next); // = 0x8000u + packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next); + packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next); + packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next); + + StoreU(packed8, d, packed_out + 8 * N); + StoreU(packed9, d, packed_out + 9 * N); + StoreU(packedA, d, packed_out + 0xA * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N)); + const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N)); + const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N)); + + const VU16 mask = Set(d, 0xFFu); // Lowest 8 bits + + const VU16 down0 = And(packed0, mask); + const VU16 down1 = ShiftRight<8>(packed0); + const VU16 down2 = And(packed1, mask); + const VU16 down3 = ShiftRight<8>(packed1); + const VU16 down4 = And(packed2, mask); + const VU16 down5 = ShiftRight<8>(packed2); + const VU16 down6 = And(packed3, mask); + const VU16 down7 = ShiftRight<8>(packed3); + const VU16 down8 = And(packed4, mask); + const VU16 down9 = ShiftRight<8>(packed4); + const VU16 downA = And(packed5, mask); + const VU16 downB = ShiftRight<8>(packed5); + const VU16 downC = And(packed6, mask); + const VU16 downD = ShiftRight<8>(packed6); + const VU16 downE = And(packed7, mask); + const VU16 downF = ShiftRight<8>(packed7); + + // Three bits from packed8..A, eight bits from down0..F. + const VU16 hi3 = Set(d, 0x700u); + const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi3); + const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi3); + const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi3); + + const VU16 raw3 = OrAnd(down3, ShiftLeft<5>(packed8), hi3); + const VU16 raw4 = OrAnd(down4, ShiftLeft<5>(packed9), hi3); + const VU16 raw5 = OrAnd(down5, ShiftLeft<5>(packedA), hi3); + + const VU16 raw6 = OrAnd(down6, ShiftLeft<2>(packed8), hi3); + const VU16 raw7 = OrAnd(down7, ShiftLeft<2>(packed9), hi3); + const VU16 raw8 = OrAnd(down8, ShiftLeft<2>(packedA), hi3); + + const VU16 raw9 = OrAnd(down9, ShiftRight<1>(packed8), hi3); + const VU16 rawA = OrAnd(downA, ShiftRight<1>(packed9), hi3); + const VU16 rawB = OrAnd(downB, ShiftRight<1>(packedA), hi3); + + const VU16 rawC = OrAnd(downC, ShiftRight<4>(packed8), hi3); + const VU16 rawD = OrAnd(downD, ShiftRight<4>(packed9), hi3); + const VU16 rawE = OrAnd(downE, ShiftRight<4>(packedA), hi3); + + // Shift MSB into the top 3-of-11 and mask. + const VU16 rawF = Or(downF, Xor3(And(ShiftRight<7>(packed8), hi3), + And(ShiftRight<6>(packed9), hi3), + And(ShiftRight<5>(packedA), hi3))); + + StoreU(raw0, d, raw + 0 * N); + StoreU(raw1, d, raw + 1 * N); + StoreU(raw2, d, raw + 2 * N); + StoreU(raw3, d, raw + 3 * N); + StoreU(raw4, d, raw + 4 * N); + StoreU(raw5, d, raw + 5 * N); + StoreU(raw6, d, raw + 6 * N); + StoreU(raw7, d, raw + 7 * N); + StoreU(raw8, d, raw + 8 * N); + StoreU(raw9, d, raw + 9 * N); + StoreU(rawA, d, raw + 0xA * N); + StoreU(rawB, d, raw + 0xB * N); + StoreU(rawC, d, raw + 0xC * N); + StoreU(rawD, d, raw + 0xD * N); + StoreU(rawE, d, raw + 0xE * N); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<11> + +template <> +struct Pack16<12> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // 8 vectors, each with 12+4 bits; top 8 bits are concatenated into + // packed8 to packedB. + const VU16 packed0 = Or(ShiftLeft<12>(raw8), raw0); + const VU16 packed1 = Or(ShiftLeft<12>(raw9), raw1); + const VU16 packed2 = Or(ShiftLeft<12>(rawA), raw2); + const VU16 packed3 = Or(ShiftLeft<12>(rawB), raw3); + const VU16 packed4 = Or(ShiftLeft<12>(rawC), raw4); + const VU16 packed5 = Or(ShiftLeft<12>(rawD), raw5); + const VU16 packed6 = Or(ShiftLeft<12>(rawE), raw6); + const VU16 packed7 = Or(ShiftLeft<12>(rawF), raw7); + + // Masking after shifting left enables OrAnd. + const VU16 hi8 = Set(d, 0xFF00u); + const VU16 packed8 = OrAnd(ShiftRight<4>(raw8), ShiftLeft<4>(raw9), hi8); + const VU16 packed9 = OrAnd(ShiftRight<4>(rawA), ShiftLeft<4>(rawB), hi8); + const VU16 packedA = OrAnd(ShiftRight<4>(rawC), ShiftLeft<4>(rawD), hi8); + const VU16 packedB = OrAnd(ShiftRight<4>(rawE), ShiftLeft<4>(rawF), hi8); + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + StoreU(packed7, d, packed_out + 7 * N); + StoreU(packed8, d, packed_out + 8 * N); + StoreU(packed9, d, packed_out + 9 * N); + StoreU(packedA, d, packed_out + 0xA * N); + StoreU(packedB, d, packed_out + 0xB * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N)); + const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N)); + const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N)); + const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N)); + + const VU16 mask = Set(d, 0xFFFu); // Lowest 12 bits + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed3, mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed4, mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed5, mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed6, mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(packed7, mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 mid8 = Set(d, 0xFF0u); // upper 8 in lower 12 + const VU16 raw8 = + OrAnd(ShiftRight<12>(packed0), ShiftLeft<4>(packed8), mid8); + const VU16 raw9 = + OrAnd(ShiftRight<12>(packed1), ShiftRight<4>(packed8), mid8); + const VU16 rawA = + OrAnd(ShiftRight<12>(packed2), ShiftLeft<4>(packed9), mid8); + const VU16 rawB = + OrAnd(ShiftRight<12>(packed3), ShiftRight<4>(packed9), mid8); + const VU16 rawC = + OrAnd(ShiftRight<12>(packed4), ShiftLeft<4>(packedA), mid8); + const VU16 rawD = + OrAnd(ShiftRight<12>(packed5), ShiftRight<4>(packedA), mid8); + const VU16 rawE = + OrAnd(ShiftRight<12>(packed6), ShiftLeft<4>(packedB), mid8); + const VU16 rawF = + OrAnd(ShiftRight<12>(packed7), ShiftRight<4>(packedB), mid8); + StoreU(raw8, d, raw + 8 * N); + StoreU(raw9, d, raw + 9 * N); + StoreU(rawA, d, raw + 0xA * N); + StoreU(rawB, d, raw + 0xB * N); + StoreU(rawC, d, raw + 0xC * N); + StoreU(rawD, d, raw + 0xD * N); + StoreU(rawE, d, raw + 0xE * N); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<12> + +template <> +struct Pack16<13> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // As with 11 bits, it is not obvious what the optimal partitioning looks + // like. We similarly go with an 8+5 split. + const VU16 lo8 = Set(d, 0xFFu); + + // Lower 8 bits of all raw + const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8); + const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8); + const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8); + const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8); + const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8); + const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8); + const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8); + const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + StoreU(packed7, d, packed_out + 7 * N); + + // Five vectors, three 5bit remnants each, plus one 5bit in their MSB. + const VU16 top0 = ShiftRight<8>(raw0); + const VU16 top1 = ShiftRight<8>(raw1); + const VU16 top2 = ShiftRight<8>(raw2); + const VU16 top3 = ShiftRight<8>(raw3); + const VU16 top4 = ShiftRight<8>(raw4); + + // Insert top raw bits into 5-bit groups within packed8..C. Moving the + // mask along avoids masking each of raw0..E and enables OrAnd. + VU16 next = Set(d, 0x3E0u); // 0x1F << 5 + VU16 packed8 = OrAnd(top0, ShiftRight<3>(raw5), next); + VU16 packed9 = OrAnd(top1, ShiftRight<3>(raw6), next); + VU16 packedA = OrAnd(top2, ShiftRight<3>(raw7), next); + VU16 packedB = OrAnd(top3, ShiftRight<3>(raw8), next); + VU16 packedC = OrAnd(top4, ShiftRight<3>(raw9), next); + next = ShiftLeft<5>(next); + packed8 = OrAnd(packed8, ShiftLeft<2>(rawA), next); + packed9 = OrAnd(packed9, ShiftLeft<2>(rawB), next); + packedA = OrAnd(packedA, ShiftLeft<2>(rawC), next); + packedB = OrAnd(packedB, ShiftLeft<2>(rawD), next); + packedC = OrAnd(packedC, ShiftLeft<2>(rawE), next); + + // Scatter upper 5 bits of rawF into the upper bits. + next = ShiftLeft<3>(next); // = 0x8000u + packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next); + packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next); + packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next); + packedB = OrAnd(packedB, ShiftLeft<4>(rawF), next); + packedC = OrAnd(packedC, ShiftLeft<3>(rawF), next); + + StoreU(packed8, d, packed_out + 8 * N); + StoreU(packed9, d, packed_out + 9 * N); + StoreU(packedA, d, packed_out + 0xA * N); + StoreU(packedB, d, packed_out + 0xB * N); + StoreU(packedC, d, packed_out + 0xC * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N)); + const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N)); + const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N)); + const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N)); + const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N)); + + const VU16 mask = Set(d, 0xFFu); // Lowest 8 bits + + const VU16 down0 = And(packed0, mask); + const VU16 down1 = ShiftRight<8>(packed0); + const VU16 down2 = And(packed1, mask); + const VU16 down3 = ShiftRight<8>(packed1); + const VU16 down4 = And(packed2, mask); + const VU16 down5 = ShiftRight<8>(packed2); + const VU16 down6 = And(packed3, mask); + const VU16 down7 = ShiftRight<8>(packed3); + const VU16 down8 = And(packed4, mask); + const VU16 down9 = ShiftRight<8>(packed4); + const VU16 downA = And(packed5, mask); + const VU16 downB = ShiftRight<8>(packed5); + const VU16 downC = And(packed6, mask); + const VU16 downD = ShiftRight<8>(packed6); + const VU16 downE = And(packed7, mask); + const VU16 downF = ShiftRight<8>(packed7); + + // Upper five bits from packed8..C, eight bits from down0..F. + const VU16 hi5 = Set(d, 0x1F00u); + const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi5); + const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi5); + const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi5); + const VU16 raw3 = OrAnd(down3, ShiftLeft<8>(packedB), hi5); + const VU16 raw4 = OrAnd(down4, ShiftLeft<8>(packedC), hi5); + + const VU16 raw5 = OrAnd(down5, ShiftLeft<3>(packed8), hi5); + const VU16 raw6 = OrAnd(down6, ShiftLeft<3>(packed9), hi5); + const VU16 raw7 = OrAnd(down7, ShiftLeft<3>(packedA), hi5); + const VU16 raw8 = OrAnd(down8, ShiftLeft<3>(packed9), hi5); + const VU16 raw9 = OrAnd(down9, ShiftLeft<3>(packedA), hi5); + + const VU16 rawA = OrAnd(downA, ShiftRight<2>(packed8), hi5); + const VU16 rawB = OrAnd(downB, ShiftRight<2>(packed9), hi5); + const VU16 rawC = OrAnd(downC, ShiftRight<2>(packedA), hi5); + const VU16 rawD = OrAnd(downD, ShiftRight<2>(packed9), hi5); + const VU16 rawE = OrAnd(downE, ShiftRight<2>(packedA), hi5); + + // Shift MSB into the top 5-of-11 and mask. + const VU16 p0 = Xor3(And(ShiftRight<7>(packed8), hi5), // + And(ShiftRight<6>(packed9), hi5), + And(ShiftRight<5>(packedA), hi5)); + const VU16 p1 = Xor3(And(ShiftRight<4>(packedB), hi5), + And(ShiftRight<3>(packedC), hi5), downF); + const VU16 rawF = Or(p0, p1); + + StoreU(raw0, d, raw + 0 * N); + StoreU(raw1, d, raw + 1 * N); + StoreU(raw2, d, raw + 2 * N); + StoreU(raw3, d, raw + 3 * N); + StoreU(raw4, d, raw + 4 * N); + StoreU(raw5, d, raw + 5 * N); + StoreU(raw6, d, raw + 6 * N); + StoreU(raw7, d, raw + 7 * N); + StoreU(raw8, d, raw + 8 * N); + StoreU(raw9, d, raw + 9 * N); + StoreU(rawA, d, raw + 0xA * N); + StoreU(rawB, d, raw + 0xB * N); + StoreU(rawC, d, raw + 0xC * N); + StoreU(rawD, d, raw + 0xD * N); + StoreU(rawE, d, raw + 0xE * N); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<13> + +template <> +struct Pack16<14> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // 14 vectors, each with 14+2 bits; two raw vectors are scattered + // across the upper 2 bits. + const VU16 hi2 = Set(d, 0xC000u); + const VU16 packed0 = Or(raw0, ShiftLeft<14>(rawE)); + const VU16 packed1 = OrAnd(raw1, ShiftLeft<12>(rawE), hi2); + const VU16 packed2 = OrAnd(raw2, ShiftLeft<10>(rawE), hi2); + const VU16 packed3 = OrAnd(raw3, ShiftLeft<8>(rawE), hi2); + const VU16 packed4 = OrAnd(raw4, ShiftLeft<6>(rawE), hi2); + const VU16 packed5 = OrAnd(raw5, ShiftLeft<4>(rawE), hi2); + const VU16 packed6 = OrAnd(raw6, ShiftLeft<2>(rawE), hi2); + const VU16 packed7 = Or(raw7, ShiftLeft<14>(rawF)); + const VU16 packed8 = OrAnd(raw8, ShiftLeft<12>(rawF), hi2); + const VU16 packed9 = OrAnd(raw9, ShiftLeft<10>(rawF), hi2); + const VU16 packedA = OrAnd(rawA, ShiftLeft<8>(rawF), hi2); + const VU16 packedB = OrAnd(rawB, ShiftLeft<6>(rawF), hi2); + const VU16 packedC = OrAnd(rawC, ShiftLeft<4>(rawF), hi2); + const VU16 packedD = OrAnd(rawD, ShiftLeft<2>(rawF), hi2); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + StoreU(packed7, d, packed_out + 7 * N); + StoreU(packed8, d, packed_out + 8 * N); + StoreU(packed9, d, packed_out + 9 * N); + StoreU(packedA, d, packed_out + 0xA * N); + StoreU(packedB, d, packed_out + 0xB * N); + StoreU(packedC, d, packed_out + 0xC * N); + StoreU(packedD, d, packed_out + 0xD * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N)); + const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N)); + const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N)); + const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N)); + const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N)); + const VU16 packedD = BitCast(d, LoadU(d, packed_in + 0xD * N)); + + const VU16 mask = Set(d, 0x3FFFu); // Lowest 14 bits + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed3, mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed4, mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed5, mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed6, mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(packed7, mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(packed8, mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(packed9, mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(packedA, mask); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(packedB, mask); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(packedC, mask); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(packedD, mask); + StoreU(rawD, d, raw + 0xD * N); + + // rawE is the concatenation of the top two bits in packed0..6. + const VU16 E0 = Xor3(ShiftRight<14>(packed0), // + ShiftRight<12>(AndNot(mask, packed1)), + ShiftRight<10>(AndNot(mask, packed2))); + const VU16 E1 = Xor3(ShiftRight<8>(AndNot(mask, packed3)), + ShiftRight<6>(AndNot(mask, packed4)), + ShiftRight<4>(AndNot(mask, packed5))); + const VU16 rawE = Xor3(ShiftRight<2>(AndNot(mask, packed6)), E0, E1); + const VU16 F0 = Xor3(ShiftRight<14>(AndNot(mask, packed7)), + ShiftRight<12>(AndNot(mask, packed8)), + ShiftRight<10>(AndNot(mask, packed9))); + const VU16 F1 = Xor3(ShiftRight<8>(AndNot(mask, packedA)), + ShiftRight<6>(AndNot(mask, packedB)), + ShiftRight<4>(AndNot(mask, packedC))); + const VU16 rawF = Xor3(ShiftRight<2>(AndNot(mask, packedD)), F0, F1); + StoreU(rawE, d, raw + 0xE * N); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<14> + +template <> +struct Pack16<15> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // 15 vectors, each with 15+1 bits; one packed vector is scattered + // across the upper bit. + const VU16 hi1 = Set(d, 0x8000u); + const VU16 packed0 = Or(raw0, ShiftLeft<15>(rawF)); + const VU16 packed1 = OrAnd(raw1, ShiftLeft<14>(rawF), hi1); + const VU16 packed2 = OrAnd(raw2, ShiftLeft<13>(rawF), hi1); + const VU16 packed3 = OrAnd(raw3, ShiftLeft<12>(rawF), hi1); + const VU16 packed4 = OrAnd(raw4, ShiftLeft<11>(rawF), hi1); + const VU16 packed5 = OrAnd(raw5, ShiftLeft<10>(rawF), hi1); + const VU16 packed6 = OrAnd(raw6, ShiftLeft<9>(rawF), hi1); + const VU16 packed7 = OrAnd(raw7, ShiftLeft<8>(rawF), hi1); + const VU16 packed8 = OrAnd(raw8, ShiftLeft<7>(rawF), hi1); + const VU16 packed9 = OrAnd(raw9, ShiftLeft<6>(rawF), hi1); + const VU16 packedA = OrAnd(rawA, ShiftLeft<5>(rawF), hi1); + const VU16 packedB = OrAnd(rawB, ShiftLeft<4>(rawF), hi1); + const VU16 packedC = OrAnd(rawC, ShiftLeft<3>(rawF), hi1); + const VU16 packedD = OrAnd(rawD, ShiftLeft<2>(rawF), hi1); + const VU16 packedE = OrAnd(rawE, ShiftLeft<1>(rawF), hi1); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + StoreU(packed7, d, packed_out + 7 * N); + StoreU(packed8, d, packed_out + 8 * N); + StoreU(packed9, d, packed_out + 9 * N); + StoreU(packedA, d, packed_out + 0xA * N); + StoreU(packedB, d, packed_out + 0xB * N); + StoreU(packedC, d, packed_out + 0xC * N); + StoreU(packedD, d, packed_out + 0xD * N); + StoreU(packedE, d, packed_out + 0xE * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N)); + const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N)); + const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N)); + const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N)); + const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N)); + const VU16 packedD = BitCast(d, LoadU(d, packed_in + 0xD * N)); + const VU16 packedE = BitCast(d, LoadU(d, packed_in + 0xE * N)); + + const VU16 mask = Set(d, 0x7FFFu); // Lowest 15 bits + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed3, mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed4, mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed5, mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed6, mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(packed7, mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(packed8, mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(packed9, mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(packedA, mask); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(packedB, mask); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(packedC, mask); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(packedD, mask); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = And(packedE, mask); + StoreU(rawE, d, raw + 0xE * N); + + // rawF is the concatenation of the top bit in packed0..E. + const VU16 F0 = Xor3(ShiftRight<15>(packed0), // + ShiftRight<14>(AndNot(mask, packed1)), + ShiftRight<13>(AndNot(mask, packed2))); + const VU16 F1 = Xor3(ShiftRight<12>(AndNot(mask, packed3)), + ShiftRight<11>(AndNot(mask, packed4)), + ShiftRight<10>(AndNot(mask, packed5))); + const VU16 F2 = Xor3(ShiftRight<9>(AndNot(mask, packed6)), + ShiftRight<8>(AndNot(mask, packed7)), + ShiftRight<7>(AndNot(mask, packed8))); + const VU16 F3 = Xor3(ShiftRight<6>(AndNot(mask, packed9)), + ShiftRight<5>(AndNot(mask, packedA)), + ShiftRight<4>(AndNot(mask, packedB))); + const VU16 F4 = Xor3(ShiftRight<3>(AndNot(mask, packedC)), + ShiftRight<2>(AndNot(mask, packedD)), + ShiftRight<1>(AndNot(mask, packedE))); + const VU16 rawF = Xor3(F0, F1, Xor3(F2, F3, F4)); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<15> + +template <> +struct Pack16<16> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + StoreU(raw0, d, packed_out + 0 * N); + StoreU(raw1, d, packed_out + 1 * N); + StoreU(raw2, d, packed_out + 2 * N); + StoreU(raw3, d, packed_out + 3 * N); + StoreU(raw4, d, packed_out + 4 * N); + StoreU(raw5, d, packed_out + 5 * N); + StoreU(raw6, d, packed_out + 6 * N); + StoreU(raw7, d, packed_out + 7 * N); + StoreU(raw8, d, packed_out + 8 * N); + StoreU(raw9, d, packed_out + 9 * N); + StoreU(rawA, d, packed_out + 0xA * N); + StoreU(rawB, d, packed_out + 0xB * N); + StoreU(rawC, d, packed_out + 0xC * N); + StoreU(rawD, d, packed_out + 0xD * N); + StoreU(rawE, d, packed_out + 0xE * N); + StoreU(rawF, d, packed_out + 0xF * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 raw0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 raw1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 raw2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 raw3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 raw4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 raw5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 raw6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 raw7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 raw8 = BitCast(d, LoadU(d, packed_in + 8 * N)); + const VU16 raw9 = BitCast(d, LoadU(d, packed_in + 9 * N)); + const VU16 rawA = BitCast(d, LoadU(d, packed_in + 0xA * N)); + const VU16 rawB = BitCast(d, LoadU(d, packed_in + 0xB * N)); + const VU16 rawC = BitCast(d, LoadU(d, packed_in + 0xC * N)); + const VU16 rawD = BitCast(d, LoadU(d, packed_in + 0xD * N)); + const VU16 rawE = BitCast(d, LoadU(d, packed_in + 0xE * N)); + const VU16 rawF = BitCast(d, LoadU(d, packed_in + 0xF * N)); + + StoreU(raw0, d, raw + 0 * N); + StoreU(raw1, d, raw + 1 * N); + StoreU(raw2, d, raw + 2 * N); + StoreU(raw3, d, raw + 3 * N); + StoreU(raw4, d, raw + 4 * N); + StoreU(raw5, d, raw + 5 * N); + StoreU(raw6, d, raw + 6 * N); + StoreU(raw7, d, raw + 7 * N); + StoreU(raw8, d, raw + 8 * N); + StoreU(raw9, d, raw + 9 * N); + StoreU(rawA, d, raw + 0xA * N); + StoreU(rawB, d, raw + 0xB * N); + StoreU(rawC, d, raw + 0xC * N); + StoreU(rawD, d, raw + 0xD * N); + StoreU(rawE, d, raw + 0xE * N); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<16> + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_ diff --git a/third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc b/third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc new file mode 100644 index 0000000000..a239da9cf6 --- /dev/null +++ b/third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc @@ -0,0 +1,205 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "hwy/aligned_allocator.h" +#include "hwy/base.h" +#include "hwy/nanobenchmark.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/bit_pack/bit_pack_test.cc" // NOLINT +#include "hwy/foreach_target.h" // IWYU pragma: keep + +#include "hwy/contrib/bit_pack/bit_pack-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +#ifndef HWY_BIT_PACK_BENCHMARK +#define HWY_BIT_PACK_BENCHMARK 0 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +// Used to prevent running benchmark (slow) for partial vectors and targets +// except the best available. Global, not per-target, hence must be outside +// HWY_NAMESPACE. Declare first because HWY_ONCE is only true after some code +// has been re-included. +extern size_t last_bits; +extern uint64_t best_target; +#if HWY_ONCE +size_t last_bits = 0; +uint64_t best_target = ~0ull; +#endif +namespace HWY_NAMESPACE { + +template +T Random(RandomState& rng) { + return static_cast(Random32(&rng) & kBits); +} + +template +class Checker { + public: + explicit Checker(size_t num) { raw_.reserve(num); } + void NotifyRaw(T raw) { raw_.push_back(raw); } + + void NotifyRawOutput(size_t bits, T raw) { + if (raw_[num_verified_] != raw) { + HWY_ABORT("%zu bits: pos %zu of %zu, expected %.0f actual %.0f\n", bits, + num_verified_, raw_.size(), + static_cast(raw_[num_verified_]), + static_cast(raw)); + } + ++num_verified_; + } + + private: + std::vector raw_; + size_t num_verified_ = 0; +}; + +template