From 43a97878ce14b72f0981164f87f2e35e14151312 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 11:22:09 +0200 Subject: Adding upstream version 110.0.1. Signed-off-by: Daniel Baumann --- .../highway/hwy/contrib/bit_pack/bit_pack-inl.h | 1438 ++++++++++++++++++++ .../highway/hwy/contrib/bit_pack/bit_pack_test.cc | 177 +++ 2 files changed, 1615 insertions(+) create mode 100644 third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h create mode 100644 third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc (limited to 'third_party/highway/hwy/contrib/bit_pack') diff --git a/third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h b/third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h new file mode 100644 index 0000000000..3d0e62eb26 --- /dev/null +++ b/third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h @@ -0,0 +1,1438 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target include guard +#if defined(HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_ +#endif + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +namespace detail { + +// Primary template, specialized below for each number of bits +template // <= 8 +struct Pack8 {}; + +template <> +struct Pack8<1> { + static constexpr size_t kBits = 1; + static constexpr size_t kRawVectors = 8; + static constexpr size_t kPackedVectors = 1; + + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + // 16-bit shifts avoid masking (bits will not cross 8-bit lanes). + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + const VU16 packed76 = Or(ShiftLeft<7>(raw7), ShiftLeft<6>(raw6)); + const VU16 packed54 = Or(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4)); + const VU16 packed32 = Or(ShiftLeft<3>(raw3), ShiftLeft<2>(raw2)); + const VU16 packed10 = Or(ShiftLeft<1>(raw1), raw0); + const VU16 packed = Or3(Or(packed76, packed54), packed32, packed10); + StoreU(BitCast(d8, packed), d8, packed_out); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + // We extract the lowest bit from each byte, then shift right. 
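+ // Bit i of every packed byte came from raw vector i, so e.g. a packed
+ // byte of 0xB5 (0b10110101) unpacks to raw0..raw7 = 1,0,1,0,1,1,0,1 at
+ // that byte position. The 16-bit constant 0x0101 masks bit 0 of both
+ // bytes within a u16 lane at once.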
+ const VU16 mask = Set(d16, 0x0101u); + + VU16 packed = BitCast(d16, LoadU(d8, packed_in)); + + const VU16 raw0 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw3 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + + const VU16 raw4 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + + const VU16 raw5 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + + const VU16 raw6 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + + const VU16 raw7 = And(packed, mask); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<1> + +template <> +struct Pack8<2> { + static constexpr size_t kBits = 2; + static constexpr size_t kRawVectors = 4; + static constexpr size_t kPackedVectors = 1; + + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + // 16-bit shifts avoid masking (bits will not cross 8-bit lanes). + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + + const VU16 packed32 = Or(ShiftLeft<6>(raw3), ShiftLeft<4>(raw2)); + const VU16 packed10 = Or(ShiftLeft<2>(raw1), raw0); + const VU16 packed = Or(packed32, packed10); + StoreU(BitCast(d8, packed), d8, packed_out); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + // We extract the lowest two bits from each byte, then shift right. 
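+ // Bits [2i+1:2i] of every packed byte came from raw vector i, so e.g. a
+ // packed byte of 0xE4 (0b11100100) unpacks to raw0..raw3 = 0,1,2,3 at
+ // that byte position; 0x0303 masks both bytes of a u16 lane at once.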
+ const VU16 mask = Set(d16, 0x0303u); + + VU16 packed = BitCast(d16, LoadU(d8, packed_in)); + + const VU16 raw0 = And(packed, mask); + packed = ShiftRight<2>(packed); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(packed, mask); + packed = ShiftRight<2>(packed); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(packed, mask); + packed = ShiftRight<2>(packed); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw3 = And(packed, mask); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + } +}; // Pack8<2> + +template <> +struct Pack8<3> { + static constexpr size_t kBits = 3; + static constexpr size_t kRawVectors = 8; + static constexpr size_t kPackedVectors = 3; + + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + // The upper two bits of these three will be filled with packed3 (6 bits). + VU16 packed0 = Or(ShiftLeft<3>(raw4), raw0); + VU16 packed1 = Or(ShiftLeft<3>(raw5), raw1); + VU16 packed2 = Or(ShiftLeft<3>(raw6), raw2); + const VU16 packed3 = Or(ShiftLeft<3>(raw7), raw3); + + const VU16 hi2 = Set(d16, 0xC0C0u); + packed0 = OrAnd(packed0, ShiftLeft<2>(packed3), hi2); + packed1 = OrAnd(packed1, ShiftLeft<4>(packed3), hi2); + packed2 = OrAnd(packed2, ShiftLeft<6>(packed3), hi2); + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + // We extract the lowest three bits from each byte. + const VU16 mask = Set(d16, 0x0707u); + + VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8)); + + const VU16 raw0 = And(mask, packed0); + packed0 = ShiftRight<3>(packed0); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(mask, packed1); + packed1 = ShiftRight<3>(packed1); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(mask, packed2); + packed2 = ShiftRight<3>(packed2); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw4 = And(mask, packed0); + packed0 = ShiftRight<3>(packed0); + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + + const VU16 raw5 = And(mask, packed1); + packed1 = ShiftRight<3>(packed1); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + + const VU16 raw6 = And(mask, packed2); + packed2 = ShiftRight<3>(packed2); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + + // packed3 is the concatenation of the low two bits in packed0..2. 
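+ // After two ShiftRight<3> rounds, the original top two bits of each of
+ // packed0..2 (which together carried the 6-bit packed3 = raw7:raw3) now
+ // sit in bits [1:0], so packed3 can be rebuilt with shifts and OrAnd.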
+ const VU16 mask2 = Set(d16, 0x0303u); + VU16 packed3 = And(mask2, packed0); // high bits in lower two bits + packed3 = ShiftLeft<2>(packed3); + packed3 = OrAnd(packed3, mask2, packed1); // insert mid 2 bits + packed3 = ShiftLeft<2>(packed3); + packed3 = OrAnd(packed3, mask2, packed2); // insert low 2 bits + + const VU16 raw3 = And(mask, packed3); + packed3 = ShiftRight<3>(packed3); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + + const VU16 raw7 = And(mask, packed3); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<3> + +template <> +struct Pack8<4> { + static constexpr size_t kBits = 4; + // 2x unrolled (matches size of 2/6 bit cases) for increased efficiency. + static constexpr size_t kRawVectors = 4; + static constexpr size_t kPackedVectors = 2; + + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + // 16-bit shifts avoid masking (bits will not cross 8-bit lanes). + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + + const VU16 packed0 = Or(ShiftLeft<4>(raw1), raw0); + const VU16 packed1 = Or(ShiftLeft<4>(raw3), raw2); + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + // We extract the lowest four bits from each byte, then shift right. + const VU16 mask = Set(d16, 0x0F0Fu); + + VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + + const VU16 raw0 = And(packed0, mask); + packed0 = ShiftRight<4>(packed0); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(packed0, mask); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(packed1, mask); + packed1 = ShiftRight<4>(packed1); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw3 = And(packed1, mask); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + } +}; // Pack8<4> + +template <> +struct Pack8<5> { + static constexpr size_t kBits = 5; + static constexpr size_t kRawVectors = 8; + static constexpr size_t kPackedVectors = 5; + + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + // Fill upper three bits with upper bits from raw4..7. 
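+ // Layout: packed0..3 keep raw0..3 in their low five bits and receive the
+ // upper three bits of raw4..7 in bits [7:5]; packed4 (further below)
+ // collects the remaining low two bits of raw4..7. Total: five packed
+ // bytes per eight raw bytes, i.e. 5 bits per value.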
+ const VU16 hi3 = Set(d16, 0xE0E0u); + const VU16 packed0 = OrAnd(raw0, ShiftLeft<3>(raw4), hi3); + const VU16 packed1 = OrAnd(raw1, ShiftLeft<3>(raw5), hi3); + const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw6), hi3); + const VU16 packed3 = OrAnd(raw3, ShiftLeft<3>(raw7), hi3); + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8); + StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8); + + // Combine lower two bits of raw4..7 into packed4. + const VU16 lo2 = Set(d16, 0x0303u); + VU16 packed4 = And(lo2, raw7); + packed4 = ShiftLeft<2>(packed4); + packed4 = OrAnd(packed4, lo2, raw6); + packed4 = ShiftLeft<2>(packed4); + packed4 = OrAnd(packed4, lo2, raw5); + packed4 = ShiftLeft<2>(packed4); + packed4 = OrAnd(packed4, lo2, raw4); + StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + + const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8)); + const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8)); + VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8)); + + // We extract the lowest five bits from each byte. + const VU16 hi3 = Set(d16, 0xE0E0u); + + const VU16 raw0 = AndNot(hi3, packed0); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = AndNot(hi3, packed1); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = AndNot(hi3, packed2); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw3 = AndNot(hi3, packed3); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + + // The upper bits are the top 3 bits shifted right by three. + const VU16 top4 = ShiftRight<3>(And(hi3, packed0)); + const VU16 top5 = ShiftRight<3>(And(hi3, packed1)); + const VU16 top6 = ShiftRight<3>(And(hi3, packed2)); + const VU16 top7 = ShiftRight<3>(And(hi3, packed3)); + + // Insert the lower 2 bits, which were concatenated into a byte. + const VU16 lo2 = Set(d16, 0x0303u); + const VU16 raw4 = OrAnd(top4, lo2, packed4); + packed4 = ShiftRight<2>(packed4); + const VU16 raw5 = OrAnd(top5, lo2, packed4); + packed4 = ShiftRight<2>(packed4); + const VU16 raw6 = OrAnd(top6, lo2, packed4); + packed4 = ShiftRight<2>(packed4); + const VU16 raw7 = OrAnd(top7, lo2, packed4); + + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<5> + +template <> +struct Pack8<6> { + static constexpr size_t kBits = 6; + static constexpr size_t kRawVectors = 4; + static constexpr size_t kPackedVectors = 3; + + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + + // The upper two bits of these three will be filled with raw3 (6 bits). 
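+ // raw3's six bits are split two at a time: bits [5:4] go into the top of
+ // packed0, bits [3:2] into packed1 and bits [1:0] into packed2, so four
+ // raw bytes pack into three bytes (6 bits per value).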
+ VU16 packed0 = raw0; + VU16 packed1 = raw1; + VU16 packed2 = raw2; + + const VU16 hi2 = Set(d16, 0xC0C0u); + packed0 = OrAnd(packed0, ShiftLeft<2>(raw3), hi2); + packed1 = OrAnd(packed1, ShiftLeft<4>(raw3), hi2); + packed2 = OrAnd(packed2, ShiftLeft<6>(raw3), hi2); + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + // We extract the lowest six bits from each byte. Negated mask so we can + // use OrAnd below. + const VU16 mask = Set(d16, 0xC0C0u); + + const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8)); + + const VU16 raw0 = AndNot(mask, packed0); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = AndNot(mask, packed1); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = AndNot(mask, packed2); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + // raw3 is the concatenation of the upper two bits in packed0..2. + VU16 raw3 = And(mask, packed2); // low 2 bits in top of byte + raw3 = ShiftRight<2>(raw3); + raw3 = OrAnd(raw3, mask, packed1); // insert mid 2 bits + raw3 = ShiftRight<2>(raw3); + raw3 = OrAnd(raw3, mask, packed0); // insert high 2 bits + raw3 = ShiftRight<2>(raw3); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + } +}; // Pack8<6> + +template <> +struct Pack8<7> { + static constexpr size_t kBits = 7; + static constexpr size_t kRawVectors = 8; + static constexpr size_t kPackedVectors = 7; + + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + // Top bit is inserted into packed0..6 and then shifted left. 
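+ // raw7 is repeatedly doubled (a shift left by one), and after each
+ // doubling its current MSB, which is bit (6 - i) of the original 7-bit
+ // value, is OR'd into the top bit of each byte of packed_i via the hi1
+ // mask below.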
+ VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + const VU16 hi1 = Set(d16, 0x8080u); + raw7 = Add(raw7, raw7); + const VU16 packed0 = OrAnd(raw0, hi1, raw7); + raw7 = Add(raw7, raw7); + const VU16 packed1 = OrAnd(raw1, hi1, raw7); + raw7 = Add(raw7, raw7); + const VU16 packed2 = OrAnd(raw2, hi1, raw7); + raw7 = Add(raw7, raw7); + const VU16 packed3 = OrAnd(raw3, hi1, raw7); + raw7 = Add(raw7, raw7); + const VU16 packed4 = OrAnd(raw4, hi1, raw7); + raw7 = Add(raw7, raw7); + const VU16 packed5 = OrAnd(raw5, hi1, raw7); + raw7 = Add(raw7, raw7); + const VU16 packed6 = OrAnd(raw6, hi1, raw7); + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8); + StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8); + StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8); + StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8); + StoreU(BitCast(d8, packed6), d8, packed_out + 6 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + + const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8)); + const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8)); + const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8)); + const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8)); + const VU16 packed6 = BitCast(d16, LoadU(d8, packed_in + 6 * N8)); + + // We extract the lowest seven bits from each byte. + const VU16 hi1 = Set(d16, 0x8080u); + + const VU16 raw0 = AndNot(hi1, packed0); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = AndNot(hi1, packed1); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = AndNot(hi1, packed2); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw3 = AndNot(hi1, packed3); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + + const VU16 raw4 = AndNot(hi1, packed4); + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + + const VU16 raw5 = AndNot(hi1, packed5); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + + const VU16 raw6 = AndNot(hi1, packed6); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + + VU16 raw7 = And(hi1, packed6); // will shift this down into LSB + raw7 = ShiftRight<1>(raw7); + raw7 = OrAnd(raw7, hi1, packed5); + raw7 = ShiftRight<1>(raw7); + raw7 = OrAnd(raw7, hi1, packed4); + raw7 = ShiftRight<1>(raw7); + raw7 = OrAnd(raw7, hi1, packed3); + raw7 = ShiftRight<1>(raw7); + raw7 = OrAnd(raw7, hi1, packed2); + raw7 = ShiftRight<1>(raw7); + raw7 = OrAnd(raw7, hi1, packed1); + raw7 = ShiftRight<1>(raw7); + raw7 = OrAnd(raw7, hi1, packed0); + raw7 = ShiftRight<1>(raw7); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<7> + +template <> +struct Pack8<8> { + static constexpr size_t kBits = 8; + // 4x unrolled (matches size of 2/6 bit cases) for increased efficiency. 
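+ // With 8 bits per value the packed form is identical to the raw form, so
+ // Pack and Unpack below reduce to plain vector copies.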
+ static constexpr size_t kRawVectors = 4; + static constexpr size_t kPackedVectors = 4; + + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) { + using VU8 = Vec; + const size_t N8 = Lanes(d8); + const VU8 raw0 = LoadU(d8, raw + 0 * N8); + const VU8 raw1 = LoadU(d8, raw + 1 * N8); + const VU8 raw2 = LoadU(d8, raw + 2 * N8); + const VU8 raw3 = LoadU(d8, raw + 3 * N8); + + StoreU(raw0, d8, packed_out + 0 * N8); + StoreU(raw1, d8, packed_out + 1 * N8); + StoreU(raw2, d8, packed_out + 2 * N8); + StoreU(raw3, d8, packed_out + 3 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) { + using VU8 = Vec; + const size_t N8 = Lanes(d8); + const VU8 raw0 = LoadU(d8, packed_in + 0 * N8); + const VU8 raw1 = LoadU(d8, packed_in + 1 * N8); + const VU8 raw2 = LoadU(d8, packed_in + 2 * N8); + const VU8 raw3 = LoadU(d8, packed_in + 3 * N8); + + StoreU(raw0, d8, raw + 0 * N8); + StoreU(raw1, d8, raw + 1 * N8); + StoreU(raw2, d8, raw + 2 * N8); + StoreU(raw3, d8, raw + 3 * N8); + } +}; // Pack8<8> + +// Primary template, specialized below for each number of bits +template // <= 16 +struct Pack16 {}; + +template <> +struct Pack16<1> { + static constexpr size_t kBits = 1; + static constexpr size_t kRawVectors = 16; + static constexpr size_t kPackedVectors = 1; + + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + const VU16 packed10 = Or(ShiftLeft<1>(raw1), raw0); + const VU16 packed32 = Or(ShiftLeft<3>(raw3), ShiftLeft<2>(raw2)); + const VU16 packed54 = Or(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4)); + const VU16 packed76 = Or(ShiftLeft<7>(raw7), ShiftLeft<6>(raw6)); + const VU16 packed98 = Or(ShiftLeft<9>(raw9), ShiftLeft<8>(raw8)); + const VU16 packedBA = Or(ShiftLeft<0xB>(rawB), ShiftLeft<0xA>(rawA)); + const VU16 packedDC = Or(ShiftLeft<0xD>(rawD), ShiftLeft<0xC>(rawC)); + const VU16 packedFE = Or(ShiftLeft<0xF>(rawF), ShiftLeft<0xE>(rawE)); + const VU16 packed50 = Or3(packed10, packed32, packed54); + const VU16 packedB7 = Or3(packed76, packed98, packedBA); + const VU16 packedFC = Or(packedFE, packedDC); + const VU16 packed = Or3(packed50, packedB7, packedFC); + StoreU(packed, d, packed_out); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) { + using VU16 = Vec; + const size_t N = Lanes(d); + // We extract the lowest bit from each u16, then shift right. 
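+ // Bit i of each packed u16 lane came from raw vector i (i = 0..0xF), so
+ // sixteen 1-bit vectors fit in a single packed vector.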
+ const VU16 mask = Set(d, 1u); + + VU16 packed = LoadU(d, packed_in); + + const VU16 raw0 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = And(packed, mask); + packed = ShiftRight<1>(packed); + StoreU(rawE, d, raw + 0xE * N); + + const VU16 rawF = And(packed, mask); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<1> + +template <> +struct Pack16<2> { + static constexpr size_t kBits = 2; + static constexpr size_t kRawVectors = 8; + static constexpr size_t kPackedVectors = 1; + + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + + const VU16 packed10 = Or(ShiftLeft<2>(raw1), raw0); + const VU16 packed32 = Or(ShiftLeft<6>(raw3), ShiftLeft<4>(raw2)); + const VU16 packed54 = Or(ShiftLeft<10>(raw5), ShiftLeft<8>(raw4)); + const VU16 packed76 = Or(ShiftLeft<14>(raw7), ShiftLeft<12>(raw6)); + const VU16 packed50 = Or3(packed10, packed32, packed54); + const VU16 packed = Or(packed50, packed76); + StoreU(packed, d, packed_out); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) { + using VU16 = Vec; + const size_t N = Lanes(d); + // We extract the lowest two bits, then shift right. 
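+ // Bits [2i+1:2i] of each packed lane came from raw vector i, e.g. a
+ // packed lane of 0xE4E4 unpacks to raw0..raw7 = 0,1,2,3,0,1,2,3.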
+ const VU16 mask = Set(d, 0x3u); + + VU16 packed = LoadU(d, packed_in); + + const VU16 raw0 = And(packed, mask); + packed = ShiftRight<2>(packed); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed, mask); + packed = ShiftRight<2>(packed); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed, mask); + packed = ShiftRight<2>(packed); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed, mask); + packed = ShiftRight<2>(packed); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed, mask); + packed = ShiftRight<2>(packed); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed, mask); + packed = ShiftRight<2>(packed); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed, mask); + packed = ShiftRight<2>(packed); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(packed, mask); + packed = ShiftRight<2>(packed); + StoreU(raw7, d, raw + 7 * N); + } +}; // Pack16<2> + +template <> +struct Pack16<3> { + static constexpr size_t kBits = 3; + static constexpr size_t kRawVectors = 16; + static constexpr size_t kPackedVectors = 3; + + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // We can fit 15 raw vectors in three packed vectors (five each). + const VU16 raw630 = Or3(ShiftLeft<6>(raw6), ShiftLeft<3>(raw3), raw0); + const VU16 raw741 = Or3(ShiftLeft<6>(raw7), ShiftLeft<3>(raw4), raw1); + const VU16 raw852 = Or3(ShiftLeft<6>(raw8), ShiftLeft<3>(raw5), raw2); + + // rawF will be scattered into the upper bits of these three. + VU16 packed0 = Or3(raw630, ShiftLeft<12>(rawC), ShiftLeft<9>(raw9)); + VU16 packed1 = Or3(raw741, ShiftLeft<12>(rawD), ShiftLeft<9>(rawA)); + VU16 packed2 = Or3(raw852, ShiftLeft<12>(rawE), ShiftLeft<9>(rawB)); + + const VU16 hi1 = Set(d, 0x8000u); + packed0 = Or(packed0, ShiftLeft<15>(rawF)); // MSB only, no mask + packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1); + packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1); + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) { + using VU16 = Vec; + const size_t N = Lanes(d); + // We extract the lowest three bits. 
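+ // Each packed lane carries five 3-bit values (bit offsets 0,3,6,9,12)
+ // plus one bit of rawF in its MSB, hence the round-robin over packed0..2
+ // below and the separate rawF reconstruction at the end.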
+ const VU16 mask = Set(d, 0x7u); + + VU16 packed0 = LoadU(d, packed_in + 0 * N); + VU16 packed1 = LoadU(d, packed_in + 1 * N); + VU16 packed2 = LoadU(d, packed_in + 2 * N); + + const VU16 raw0 = And(mask, packed0); + packed0 = ShiftRight<3>(packed0); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(mask, packed1); + packed1 = ShiftRight<3>(packed1); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(mask, packed2); + packed2 = ShiftRight<3>(packed2); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(mask, packed0); + packed0 = ShiftRight<3>(packed0); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(mask, packed1); + packed1 = ShiftRight<3>(packed1); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(mask, packed2); + packed2 = ShiftRight<3>(packed2); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(mask, packed0); + packed0 = ShiftRight<3>(packed0); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(mask, packed1); + packed1 = ShiftRight<3>(packed1); + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(mask, packed2); + packed2 = ShiftRight<3>(packed2); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(mask, packed0); + packed0 = ShiftRight<3>(packed0); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(mask, packed1); + packed1 = ShiftRight<3>(packed1); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(mask, packed2); + packed2 = ShiftRight<3>(packed2); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(mask, packed0); + packed0 = ShiftRight<3>(packed0); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(mask, packed1); + packed1 = ShiftRight<3>(packed1); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = And(mask, packed2); + packed2 = ShiftRight<3>(packed2); + StoreU(rawE, d, raw + 0xE * N); + + // rawF is the concatenation of the lower bit of packed0..2. No masking is + // required because we have shifted that bit downward from the MSB. + const VU16 rawF = + Or3(ShiftLeft<2>(packed2), Add(packed1, packed1), packed0); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<3> + +template <> +struct Pack16<4> { + static constexpr size_t kBits = 4; + // 2x unrolled (matches size of 2/6 bit cases) for increased efficiency. + static constexpr size_t kRawVectors = 8; + static constexpr size_t kPackedVectors = 2; + + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = BitCast(d, LoadU(d, raw + 0 * N)); + const VU16 raw1 = BitCast(d, LoadU(d, raw + 1 * N)); + const VU16 raw2 = BitCast(d, LoadU(d, raw + 2 * N)); + const VU16 raw3 = BitCast(d, LoadU(d, raw + 3 * N)); + const VU16 raw4 = BitCast(d, LoadU(d, raw + 4 * N)); + const VU16 raw5 = BitCast(d, LoadU(d, raw + 5 * N)); + const VU16 raw6 = BitCast(d, LoadU(d, raw + 6 * N)); + const VU16 raw7 = BitCast(d, LoadU(d, raw + 7 * N)); + + const VU16 raw20 = Or3(ShiftLeft<8>(raw2), ShiftLeft<4>(raw1), raw0); + const VU16 packed0 = Or(raw20, ShiftLeft<12>(raw3)); + const VU16 raw64 = Or3(ShiftLeft<8>(raw6), ShiftLeft<4>(raw5), raw4); + const VU16 packed1 = Or(raw64, ShiftLeft<12>(raw7)); + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) { + using VU16 = Vec; + const size_t N = Lanes(d); + // We extract the lowest four bits, then shift right. 
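+ // Each packed lane holds four nibbles: raw0..3 in packed0 and raw4..7 in
+ // packed1, with raw vector i at bit offset 4 * (i % 4).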
+ const VU16 mask = Set(d, 0xFu); + + VU16 packed0 = LoadU(d, packed_in + 0 * N); + VU16 packed1 = LoadU(d, packed_in + 1 * N); + + const VU16 raw0 = And(packed0, mask); + packed0 = ShiftRight<4>(packed0); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed0, mask); + packed0 = ShiftRight<4>(packed0); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed0, mask); + packed0 = ShiftRight<4>(packed0); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = packed0; // shifted down, no mask required + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed1, mask); + packed1 = ShiftRight<4>(packed1); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed1, mask); + packed1 = ShiftRight<4>(packed1); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed1, mask); + packed1 = ShiftRight<4>(packed1); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = packed1; // shifted down, no mask required + StoreU(raw7, d, raw + 7 * N); + } +}; // Pack16<4> + +template <> +struct Pack16<5> { + static constexpr size_t kBits = 5; + static constexpr size_t kRawVectors = 16; + static constexpr size_t kPackedVectors = 5; + + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // We can fit 15 raw vectors in five packed vectors (three each). + const VU16 rawA50 = Or3(ShiftLeft<10>(rawA), ShiftLeft<5>(raw5), raw0); + const VU16 rawB61 = Or3(ShiftLeft<10>(rawB), ShiftLeft<5>(raw6), raw1); + const VU16 rawC72 = Or3(ShiftLeft<10>(rawC), ShiftLeft<5>(raw7), raw2); + const VU16 rawD83 = Or3(ShiftLeft<10>(rawD), ShiftLeft<5>(raw8), raw3); + const VU16 rawE94 = Or3(ShiftLeft<10>(rawE), ShiftLeft<5>(raw9), raw4); + + // rawF will be scattered into the upper bits of these five. 
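+ // One bit of rawF goes into the MSB of each packed vector: bit 0 into
+ // packed0, bit 1 into packed1, ..., bit 4 into packed4. Only packed0
+ // needs no mask because ShiftLeft<15> already clears all other bits.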
+ const VU16 hi1 = Set(d, 0x8000u); + const VU16 packed0 = Or(rawA50, ShiftLeft<15>(rawF)); // MSB only, no mask + const VU16 packed1 = OrAnd(rawB61, ShiftLeft<14>(rawF), hi1); + const VU16 packed2 = OrAnd(rawC72, ShiftLeft<13>(rawF), hi1); + const VU16 packed3 = OrAnd(rawD83, ShiftLeft<12>(rawF), hi1); + const VU16 packed4 = OrAnd(rawE94, ShiftLeft<11>(rawF), hi1); + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) { + using VU16 = Vec; + const size_t N = Lanes(d); + + VU16 packed0 = LoadU(d, packed_in + 0 * N); + VU16 packed1 = LoadU(d, packed_in + 1 * N); + VU16 packed2 = LoadU(d, packed_in + 2 * N); + VU16 packed3 = LoadU(d, packed_in + 3 * N); + VU16 packed4 = LoadU(d, packed_in + 4 * N); + + // We extract the lowest five bits and shift right. + const VU16 mask = Set(d, 0x1Fu); + + const VU16 raw0 = And(packed0, mask); + packed0 = ShiftRight<5>(packed0); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + packed1 = ShiftRight<5>(packed1); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + packed2 = ShiftRight<5>(packed2); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed3, mask); + packed3 = ShiftRight<5>(packed3); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed4, mask); + packed4 = ShiftRight<5>(packed4); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed0, mask); + packed0 = ShiftRight<5>(packed0); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed1, mask); + packed1 = ShiftRight<5>(packed1); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(packed2, mask); + packed2 = ShiftRight<5>(packed2); + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(packed3, mask); + packed3 = ShiftRight<5>(packed3); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(packed4, mask); + packed4 = ShiftRight<5>(packed4); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(packed0, mask); + packed0 = ShiftRight<5>(packed0); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(packed1, mask); + packed1 = ShiftRight<5>(packed1); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(packed2, mask); + packed2 = ShiftRight<5>(packed2); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(packed3, mask); + packed3 = ShiftRight<5>(packed3); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = And(packed4, mask); + packed4 = ShiftRight<5>(packed4); + StoreU(rawE, d, raw + 0xE * N); + + // rawF is the concatenation of the lower bit of packed0..4. No masking is + // required because we have shifted that bit downward from the MSB. 
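+ // Three ShiftRight<5> rounds (15 bits total) have moved each packed
+ // vector's original MSB down to bit 0, so rawF bit i is simply the
+ // remaining bit of packed_i shifted back up to position i.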
+ const VU16 bits20 = + Or3(ShiftLeft<2>(packed2), Add(packed1, packed1), packed0); + const VU16 rawF = Or3(ShiftLeft<4>(packed4), ShiftLeft<3>(packed3), bits20); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<5> + +template <> +struct Pack16<6> { + static constexpr size_t kBits = 6; + static constexpr size_t kRawVectors = 8; + static constexpr size_t kPackedVectors = 3; + + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + + // Three vectors, two 6-bit raw each; packed3 (12 bits) is spread over the + // four remainder bits at the top of each vector. + VU16 packed0 = Or(ShiftLeft<6>(raw4), raw0); + VU16 packed1 = Or(ShiftLeft<6>(raw5), raw1); + VU16 packed2 = Or(ShiftLeft<6>(raw6), raw2); + const VU16 packed3 = Or(ShiftLeft<6>(raw7), raw3); + + const VU16 hi4 = Set(d, 0xF000u); + packed0 = Or(packed0, ShiftLeft<12>(packed3)); // shift zeroed LSBs + packed1 = OrAnd(packed1, ShiftLeft<8>(packed3), hi4); + packed2 = OrAnd(packed2, ShiftLeft<4>(packed3), hi4); + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) { + using VU16 = Vec; + const size_t N = Lanes(d); + // We extract the lowest six bits and shift right. + const VU16 mask = Set(d, 0x3Fu); + + VU16 packed0 = LoadU(d, packed_in + 0 * N); + VU16 packed1 = LoadU(d, packed_in + 1 * N); + VU16 packed2 = LoadU(d, packed_in + 2 * N); + + const VU16 raw0 = And(packed0, mask); + packed0 = ShiftRight<6>(packed0); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + packed1 = ShiftRight<6>(packed1); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + packed2 = ShiftRight<6>(packed2); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw4 = And(packed0, mask); + packed0 = ShiftRight<6>(packed0); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed1, mask); + packed1 = ShiftRight<6>(packed1); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed2, mask); + packed2 = ShiftRight<6>(packed2); + StoreU(raw6, d, raw + 6 * N); + + // packed3 is the concatenation of the four bits in packed0..2. 
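+ // After two ShiftRight<6> rounds, the original top four bits of
+ // packed0..2 sit in bits [3:0]: packed0 holds packed3's bits [3:0],
+ // packed1 its bits [7:4] and packed2 its bits [11:8].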
+ VU16 packed3 = Or3(ShiftLeft<8>(packed2), ShiftLeft<4>(packed1), packed0); + const VU16 raw3 = And(packed3, mask); + packed3 = ShiftRight<6>(packed3); + StoreU(raw3, d, raw + 3 * N); + const VU16 raw7 = packed3; // upper bits already zero + StoreU(raw7, d, raw + 7 * N); + } +}; // Pack16<6> + +template <> +struct Pack16<7> { + static constexpr size_t kBits = 7; + static constexpr size_t kRawVectors = 16; + static constexpr size_t kPackedVectors = 7; + + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + // Seven vectors, two 7-bit raw each; packed7 (14 bits) is spread over the + // two remainder bits at the top of each vector. + VU16 packed0 = Or(ShiftLeft<7>(raw8), raw0); + VU16 packed1 = Or(ShiftLeft<7>(raw9), raw1); + VU16 packed2 = Or(ShiftLeft<7>(rawA), raw2); + VU16 packed3 = Or(ShiftLeft<7>(rawB), raw3); + VU16 packed4 = Or(ShiftLeft<7>(rawC), raw4); + VU16 packed5 = Or(ShiftLeft<7>(rawD), raw5); + VU16 packed6 = Or(ShiftLeft<7>(rawE), raw6); + const VU16 packed7 = Or(ShiftLeft<7>(rawF), raw7); + + const VU16 hi2 = Set(d, 0xC000u); + packed0 = Or(packed0, ShiftLeft<14>(packed7)); // shift zeroed LSBs + packed1 = OrAnd(packed1, ShiftLeft<12>(packed7), hi2); + packed2 = OrAnd(packed2, ShiftLeft<10>(packed7), hi2); + packed3 = OrAnd(packed3, ShiftLeft<8>(packed7), hi2); + packed4 = OrAnd(packed4, ShiftLeft<6>(packed7), hi2); + packed5 = OrAnd(packed5, ShiftLeft<4>(packed7), hi2); + packed6 = OrAnd(packed6, ShiftLeft<2>(packed7), hi2); + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) { + using VU16 = Vec; + const size_t N = Lanes(d); + + VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + + // We extract the lowest seven bits and shift right. 
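+ // Each packed lane holds two 7-bit values (raw_i at bit 0, raw_{i+8} at
+ // bit 7) plus two bits of packed7 = rawF:raw7 in bits [15:14].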
+ const VU16 mask = Set(d, 0x7Fu); + + const VU16 raw0 = And(packed0, mask); + packed0 = ShiftRight<7>(packed0); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + packed1 = ShiftRight<7>(packed1); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + packed2 = ShiftRight<7>(packed2); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed3, mask); + packed3 = ShiftRight<7>(packed3); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed4, mask); + packed4 = ShiftRight<7>(packed4); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed5, mask); + packed5 = ShiftRight<7>(packed5); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed6, mask); + packed6 = ShiftRight<7>(packed6); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw8 = And(packed0, mask); + packed0 = ShiftRight<7>(packed0); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(packed1, mask); + packed1 = ShiftRight<7>(packed1); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(packed2, mask); + packed2 = ShiftRight<7>(packed2); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(packed3, mask); + packed3 = ShiftRight<7>(packed3); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(packed4, mask); + packed4 = ShiftRight<7>(packed4); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(packed5, mask); + packed5 = ShiftRight<7>(packed5); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = And(packed6, mask); + packed6 = ShiftRight<7>(packed6); + StoreU(rawE, d, raw + 0xE * N); + + // packed7 is the concatenation of the two bits in packed0..6. + const VU16 bits50 = + Or3(ShiftLeft<4>(packed2), ShiftLeft<2>(packed1), packed0); + const VU16 bitsB6 = Or3(ShiftLeft<10>(packed5), ShiftLeft<8>(packed4), + ShiftLeft<6>(packed3)); + VU16 packed7 = Or3(ShiftLeft<12>(packed6), bitsB6, bits50); + const VU16 raw7 = And(packed7, mask); + packed7 = ShiftRight<7>(packed7); + StoreU(raw7, d, raw + 7 * N); + const VU16 rawF = packed7; // upper bits already zero + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<7> + +template <> +struct Pack16<8> { + static constexpr size_t kBits = 8; + // 4x unrolled (matches size of 2/6 bit cases) for increased efficiency. + static constexpr size_t kRawVectors = 8; + static constexpr size_t kPackedVectors = 4; + + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + // This is equivalent to ConcatEven with 8-bit lanes, but much more + // efficient on RVV and slightly less efficient on SVE2. 
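+ // Each u16 lane of packed_i holds raw(2i) in its low byte and raw(2i+1)
+ // in its high byte; Unpack reverses this with one mask and one shift.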
+ const VU16 packed0 = Or(ShiftLeft<8>(raw1), raw0); + const VU16 packed1 = Or(ShiftLeft<8>(raw3), raw2); + const VU16 packed2 = Or(ShiftLeft<8>(raw5), raw4); + const VU16 packed3 = Or(ShiftLeft<8>(raw7), raw6); + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) { + using VU16 = Vec; + const size_t N = Lanes(d); + + VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + // We extract the lowest eight bits and shift right. + const VU16 mask = Set(d, 0xFFu); + + const VU16 raw0 = And(packed0, mask); + packed0 = ShiftRight<8>(packed0); + StoreU(raw0, d, raw + 0 * N); + const VU16 raw1 = packed0; // upper bits already zero + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed1, mask); + packed1 = ShiftRight<8>(packed1); + StoreU(raw2, d, raw + 2 * N); + const VU16 raw3 = packed1; // upper bits already zero + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed2, mask); + packed2 = ShiftRight<8>(packed2); + StoreU(raw4, d, raw + 4 * N); + const VU16 raw5 = packed2; // upper bits already zero + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed3, mask); + packed3 = ShiftRight<8>(packed3); + StoreU(raw6, d, raw + 6 * N); + const VU16 raw7 = packed3; // upper bits already zero + StoreU(raw7, d, raw + 7 * N); + } +}; // Pack16<8> + +} // namespace detail + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_ diff --git a/third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc b/third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc new file mode 100644 index 0000000000..eb4e75800f --- /dev/null +++ b/third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc @@ -0,0 +1,177 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "hwy/aligned_allocator.h" +#include "hwy/base.h" +#include "hwy/nanobenchmark.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/bit_pack/bit_pack_test.cc" // NOLINT +#include "hwy/foreach_target.h" // IWYU pragma: keep + +#include "hwy/contrib/bit_pack/bit_pack-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +#ifndef HWY_BIT_PACK_BENCHMARK +#define HWY_BIT_PACK_BENCHMARK 0 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +// Used to prevent running benchmark (slow) for partial vectors and targets +// except the best available. Global, not per-target, hence must be outside +// HWY_NAMESPACE. 
Declare first because HWY_ONCE is only true after some code +// has been re-included. +extern size_t last_bits; +extern uint64_t best_target; +#if HWY_ONCE +size_t last_bits = 0; +uint64_t best_target = ~0ull; +#endif +namespace HWY_NAMESPACE { + +template +T Random(RandomState& rng) { + return static_cast(Random32(&rng) & kBits); +} + +template +class Checker { + public: + explicit Checker(size_t num) { raw_.reserve(num); } + void NotifyRaw(T raw) { raw_.push_back(raw); } + + void NotifyRawOutput(size_t bits, T raw) { + if (raw_[num_verified_] != raw) { + HWY_ABORT("%zu bits: pos %zu of %zu, expected %.0f actual %.0f\n", bits, + num_verified_, raw_.size(), + static_cast(raw_[num_verified_]), + static_cast(raw)); + } + ++num_verified_; + } + + private: + std::vector raw_; + size_t num_verified_ = 0; +}; + +template +struct TestPack { + template + void operator()(T /* t */, D d) { + const size_t N = Lanes(d); + RandomState rng(N * 129); + const size_t num = N * PackT::kRawVectors; + const size_t packed_size = N * PackT::kPackedVectors; + Checker checker(num); + AlignedFreeUniquePtr raw = hwy::AllocateAligned(num); + AlignedFreeUniquePtr raw2 = hwy::AllocateAligned(num); + AlignedFreeUniquePtr packed = hwy::AllocateAligned(packed_size); + + for (size_t i = 0; i < num; ++i) { + raw[i] = Random(rng); + checker.NotifyRaw(raw[i]); + } + + best_target = HWY_MIN(best_target, HWY_TARGET); + const bool run_bench = HWY_BIT_PACK_BENCHMARK && + (PackT::kBits != last_bits) && + (HWY_TARGET == best_target); + last_bits = PackT::kBits; + + if (run_bench) { + const size_t kNumInputs = 1; + const size_t num_items = num * size_t(Unpredictable1()); + const FuncInput inputs[kNumInputs] = {num_items}; + Result results[kNumInputs]; + + Params p; + p.verbose = false; + p.max_evals = 7; + p.target_rel_mad = 0.002; + const size_t num_results = MeasureClosure( + [&](FuncInput) HWY_ATTR { + PackT().Pack(d, raw.get(), packed.get()); + PackT().Unpack(d, packed.get(), raw2.get()); + return raw2[Random32(&rng) % num]; + }, + inputs, kNumInputs, results, p); + if (num_results != kNumInputs) { + fprintf(stderr, "MeasureClosure failed.\n"); + return; + } + // Print cycles per element + for (size_t i = 0; i < num_results; ++i) { + const double cycles_per_item = + results[i].ticks / static_cast(results[i].input); + const double mad = results[i].variability * cycles_per_item; + printf("Bits:%2d elements:%3d cyc/elt:%6.3f (+/- %5.3f)\n", + static_cast(PackT::kBits), + static_cast(results[i].input), cycles_per_item, mad); + } + } else { + PackT().Pack(d, raw.get(), packed.get()); + PackT().Unpack(d, packed.get(), raw2.get()); + } + + for (size_t i = 0; i < num; ++i) { + checker.NotifyRawOutput(PackT::kBits, raw2[i]); + } + } +}; + +void TestAllPack8() { + ForShrinkableVectors>>()(uint8_t()); + ForShrinkableVectors>>()(uint8_t()); + ForShrinkableVectors>>()(uint8_t()); + ForShrinkableVectors>>()(uint8_t()); + ForShrinkableVectors>>()(uint8_t()); + ForShrinkableVectors>>()(uint8_t()); + ForShrinkableVectors>>()(uint8_t()); + ForShrinkableVectors>>()(uint8_t()); +} + +void TestAllPack16() { + ForShrinkableVectors>>()(uint16_t()); + ForShrinkableVectors>>()(uint16_t()); + ForShrinkableVectors>>()(uint16_t()); + ForShrinkableVectors>>()(uint16_t()); + ForShrinkableVectors>>()(uint16_t()); + ForShrinkableVectors>>()(uint16_t()); + ForShrinkableVectors>>()(uint16_t()); + ForShrinkableVectors>>()(uint16_t()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy 
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(BitPackTest);
+HWY_EXPORT_AND_TEST_P(BitPackTest, TestAllPack8);
+HWY_EXPORT_AND_TEST_P(BitPackTest, TestAllPack16);
+} // namespace hwy
+
+#endif