Diffstat (limited to 'third_party/highway/hwy/contrib/bit_pack')
-rw-r--r--  third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h    2599
-rw-r--r--  third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc   205
2 files changed, 2804 insertions, 0 deletions
diff --git a/third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h b/third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h
new file mode 100644
index 0000000000..04d015453b
--- /dev/null
+++ b/third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h
@@ -0,0 +1,2599 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Per-target include guard
+#if defined(HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_) == \
+ defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
+#undef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
+#else
+#define HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
+#endif
+
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// The entry points are class templates specialized below for each number of
+// bits. Each provides Pack and Unpack member functions. Pack loads B raw
+// vectors and stores kBits packed vectors; Unpack loads kBits packed vectors
+// and stores B raw vectors. B denotes the bits per lane: 8 for Pack8 and 16
+// for Pack16; it is also the upper bound for kBits.
+template <size_t kBits> // <= 8
+struct Pack8 {};
+template <size_t kBits> // <= 16
+struct Pack16 {};
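+
+// Example usage (an illustrative sketch, not part of this header; `raw` and
+// `packed` are caller-provided buffers). With kBits = 3, Pack reads
+// 8 * Lanes(d8) raw lanes, each value < (1 << 3), and writes 3 * Lanes(d8)
+// packed lanes; Unpack reverses the transform:
+//   const ScalableTag<uint8_t> d8;
+//   Pack8<3>().Pack(d8, raw, packed);
+//   Pack8<3>().Unpack(d8, packed, raw);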
+
+template <>
+struct Pack8<1> {
+ template <class D8>
+ HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
+ uint8_t* HWY_RESTRICT packed_out) const {
+ const RepartitionToWide<decltype(d8)> d16;
+ using VU16 = Vec<decltype(d16)>;
+ const size_t N8 = Lanes(d8);
+ // 16-bit shifts avoid masking (bits will not cross 8-bit lanes).
+ const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
+ const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
+ const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
+ const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
+ const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
+ const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
+ const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
+ const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
+
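+ // The shifted fields are disjoint (each raw value is a single bit), so
+ // Xor3 acts as a three-way Or here.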
+ const VU16 packed =
+ Xor3(Or(ShiftLeft<7>(raw7), ShiftLeft<6>(raw6)),
+ Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3)),
+ Xor3(ShiftLeft<2>(raw2), ShiftLeft<1>(raw1), raw0));
+ StoreU(BitCast(d8, packed), d8, packed_out);
+ }
+
+ template <class D8>
+ HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
+ uint8_t* HWY_RESTRICT raw) const {
+ const RepartitionToWide<decltype(d8)> d16;
+ using VU16 = Vec<decltype(d16)>;
+ const size_t N8 = Lanes(d8);
+ const VU16 mask = Set(d16, 0x0101u); // LSB in each byte
+
+ const VU16 packed = BitCast(d16, LoadU(d8, packed_in));
+
+ const VU16 raw0 = And(packed, mask);
+ StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
+
+ const VU16 raw1 = And(ShiftRight<1>(packed), mask);
+ StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
+
+ const VU16 raw2 = And(ShiftRight<2>(packed), mask);
+ StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
+
+ const VU16 raw3 = And(ShiftRight<3>(packed), mask);
+ StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
+
+ const VU16 raw4 = And(ShiftRight<4>(packed), mask);
+ StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
+
+ const VU16 raw5 = And(ShiftRight<5>(packed), mask);
+ StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
+
+ const VU16 raw6 = And(ShiftRight<6>(packed), mask);
+ StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
+
+ const VU16 raw7 = And(ShiftRight<7>(packed), mask);
+ StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
+ }
+}; // Pack8<1>
+
+template <>
+struct Pack8<2> {
+ template <class D8>
+ HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
+ uint8_t* HWY_RESTRICT packed_out) const {
+ const RepartitionToWide<decltype(d8)> d16;
+ using VU16 = Vec<decltype(d16)>;
+ const size_t N8 = Lanes(d8);
+ // 16-bit shifts avoid masking (bits will not cross 8-bit lanes).
+ const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
+ const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
+ const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
+ const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
+ const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
+ const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
+ const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
+ const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
+
+ const VU16 packed0 = Xor3(ShiftLeft<6>(raw6), ShiftLeft<4>(raw4),
+ Or(ShiftLeft<2>(raw2), raw0));
+ const VU16 packed1 = Xor3(ShiftLeft<6>(raw7), ShiftLeft<4>(raw5),
+ Or(ShiftLeft<2>(raw3), raw1));
+ StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
+ StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
+ }
+
+ template <class D8>
+ HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
+ uint8_t* HWY_RESTRICT raw) const {
+ const RepartitionToWide<decltype(d8)> d16;
+ using VU16 = Vec<decltype(d16)>;
+ const size_t N8 = Lanes(d8);
+ const VU16 mask = Set(d16, 0x0303u); // Lowest 2 bits per byte
+
+ const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
+ const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
+
+ const VU16 raw2 = And(ShiftRight<2>(packed0), mask);
+ StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
+
+ const VU16 raw3 = And(ShiftRight<2>(packed1), mask);
+ StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
+
+ const VU16 raw4 = And(ShiftRight<4>(packed0), mask);
+ StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
+
+ const VU16 raw5 = And(ShiftRight<4>(packed1), mask);
+ StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
+
+ const VU16 raw6 = And(ShiftRight<6>(packed0), mask);
+ StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
+
+ const VU16 raw7 = And(ShiftRight<6>(packed1), mask);
+ StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
+ }
+}; // Pack8<2>
+
+template <>
+struct Pack8<3> {
+ template <class D8>
+ HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
+ uint8_t* HWY_RESTRICT packed_out) const {
+ const RepartitionToWide<decltype(d8)> d16;
+ using VU16 = Vec<decltype(d16)>;
+ const size_t N8 = Lanes(d8);
+ const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
+ const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
+ const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
+ const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
+ const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
+ const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
+ const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
+ const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
+
+ // The upper two bits of these three will be filled with packed3 (6 bits).
+ VU16 packed0 = Or(ShiftLeft<3>(raw4), raw0);
+ VU16 packed1 = Or(ShiftLeft<3>(raw5), raw1);
+ VU16 packed2 = Or(ShiftLeft<3>(raw6), raw2);
+ const VU16 packed3 = Or(ShiftLeft<3>(raw7), raw3);
+
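+ // packed3's six bits land two at a time in the hi2 field: packed0 receives
+ // packed3 bits [5:4], packed1 bits [3:2], packed2 bits [1:0].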
+ const VU16 hi2 = Set(d16, 0xC0C0u);
+ packed0 = OrAnd(packed0, ShiftLeft<2>(packed3), hi2);
+ packed1 = OrAnd(packed1, ShiftLeft<4>(packed3), hi2);
+ packed2 = OrAnd(packed2, ShiftLeft<6>(packed3), hi2);
+ StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
+ StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
+ StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
+ }
+
+ template <class D8>
+ HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
+ uint8_t* HWY_RESTRICT raw) const {
+ const RepartitionToWide<decltype(d8)> d16;
+ using VU16 = Vec<decltype(d16)>;
+ const size_t N8 = Lanes(d8);
+ const VU16 mask = Set(d16, 0x0707u); // Lowest 3 bits per byte
+
+ const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
+ const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
+ const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
+
+ const VU16 raw2 = And(packed2, mask);
+ StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
+
+ const VU16 raw4 = And(ShiftRight<3>(packed0), mask);
+ StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
+
+ const VU16 raw5 = And(ShiftRight<3>(packed1), mask);
+ StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
+
+ const VU16 raw6 = And(ShiftRight<3>(packed2), mask);
+ StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
+
+ // raw73 is the concatenation of the upper two bits in packed0..2.
+ const VU16 hi2 = Set(d16, 0xC0C0u);
+ const VU16 raw73 = Xor3(ShiftRight<6>(And(packed2, hi2)), //
+ ShiftRight<4>(And(packed1, hi2)),
+ ShiftRight<2>(And(packed0, hi2)));
+
+ const VU16 raw3 = And(mask, raw73);
+ StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
+
+ const VU16 raw7 = And(mask, ShiftRight<3>(raw73));
+ StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
+ }
+}; // Pack8<3>
+
+template <>
+struct Pack8<4> {
+ template <class D8>
+ HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
+ uint8_t* HWY_RESTRICT packed_out) const {
+ const RepartitionToWide<decltype(d8)> d16;
+ using VU16 = Vec<decltype(d16)>;
+ const size_t N8 = Lanes(d8);
+ // 16-bit shifts avoid masking (bits will not cross 8-bit lanes).
+ const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
+ const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
+ const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
+ const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
+ const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
+ const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
+ const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
+ const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
+
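+ // Each byte holds one raw value in its low nibble (raw0/1/4/5) and another
+ // in its high nibble (raw2/3/6/7).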
+ const VU16 packed0 = Or(ShiftLeft<4>(raw2), raw0);
+ const VU16 packed1 = Or(ShiftLeft<4>(raw3), raw1);
+ const VU16 packed2 = Or(ShiftLeft<4>(raw6), raw4);
+ const VU16 packed3 = Or(ShiftLeft<4>(raw7), raw5);
+
+ StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
+ StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
+ StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
+ StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
+ }
+
+ template <class D8>
+ HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
+ uint8_t* HWY_RESTRICT raw) const {
+ const RepartitionToWide<decltype(d8)> d16;
+ using VU16 = Vec<decltype(d16)>;
+ const size_t N8 = Lanes(d8);
+ const VU16 mask = Set(d16, 0x0F0Fu); // Lowest 4 bits per byte
+
+ const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
+ const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
+ const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
+ const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
+
+ const VU16 raw2 = And(ShiftRight<4>(packed0), mask);
+ StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
+
+ const VU16 raw3 = And(ShiftRight<4>(packed1), mask);
+ StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
+
+ const VU16 raw4 = And(packed2, mask);
+ StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
+
+ const VU16 raw5 = And(packed3, mask);
+ StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
+
+ const VU16 raw6 = And(ShiftRight<4>(packed2), mask);
+ StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
+
+ const VU16 raw7 = And(ShiftRight<4>(packed3), mask);
+ StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
+ }
+}; // Pack8<4>
+
+template <>
+struct Pack8<5> {
+ template <class D8>
+ HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
+ uint8_t* HWY_RESTRICT packed_out) const {
+ const RepartitionToWide<decltype(d8)> d16;
+ using VU16 = Vec<decltype(d16)>;
+ const size_t N8 = Lanes(d8);
+ const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
+ const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
+ const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
+ const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
+ const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
+ const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
+ const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
+ const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
+
+ // Fill upper three bits with upper bits from raw4..7.
+ const VU16 hi3 = Set(d16, 0xE0E0u);
+ const VU16 packed0 = OrAnd(raw0, ShiftLeft<3>(raw4), hi3);
+ const VU16 packed1 = OrAnd(raw1, ShiftLeft<3>(raw5), hi3);
+ const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw6), hi3);
+ const VU16 packed3 = OrAnd(raw3, ShiftLeft<3>(raw7), hi3);
+
+ StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
+ StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
+ StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
+ StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
+
+ // Combine lower two bits of raw4..7 into packed4.
+ const VU16 lo2 = Set(d16, 0x0303u);
+ const VU16 packed4 = Or(And(raw4, lo2), Xor3(ShiftLeft<2>(And(raw5, lo2)),
+ ShiftLeft<4>(And(raw6, lo2)),
+ ShiftLeft<6>(And(raw7, lo2))));
+ StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
+ }
+
+ template <class D8>
+ HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
+ uint8_t* HWY_RESTRICT raw) const {
+ const RepartitionToWide<decltype(d8)> d16;
+ using VU16 = Vec<decltype(d16)>;
+ const size_t N8 = Lanes(d8);
+
+ const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
+ const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
+ const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
+ const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
+ const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
+
+ const VU16 mask = Set(d16, 0x1F1Fu); // Lowest 5 bits per byte
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
+
+ const VU16 raw2 = And(packed2, mask);
+ StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
+
+ const VU16 raw3 = And(packed3, mask);
+ StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
+
+ // The upper three bits of raw4..7 come from the top 3 bits of packed0..3.
+ const VU16 top4 = ShiftRight<3>(AndNot(mask, packed0));
+ const VU16 top5 = ShiftRight<3>(AndNot(mask, packed1));
+ const VU16 top6 = ShiftRight<3>(AndNot(mask, packed2));
+ const VU16 top7 = ShiftRight<3>(AndNot(mask, packed3));
+
+ // Insert the lower 2 bits, which were concatenated into a byte.
+ const VU16 lo2 = Set(d16, 0x0303u);
+ const VU16 raw4 = OrAnd(top4, lo2, packed4);
+ const VU16 raw5 = OrAnd(top5, lo2, ShiftRight<2>(packed4));
+ const VU16 raw6 = OrAnd(top6, lo2, ShiftRight<4>(packed4));
+ const VU16 raw7 = OrAnd(top7, lo2, ShiftRight<6>(packed4));
+
+ StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
+ StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
+ StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
+ StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
+ }
+}; // Pack8<5>
+
+template <>
+struct Pack8<6> {
+ template <class D8>
+ HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
+ uint8_t* HWY_RESTRICT packed_out) const {
+ const RepartitionToWide<decltype(d8)> d16;
+ using VU16 = Vec<decltype(d16)>;
+ const size_t N8 = Lanes(d8);
+ const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
+ const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
+ const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
+ const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
+ const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
+ const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
+ const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
+ const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
+
+ const VU16 hi2 = Set(d16, 0xC0C0u);
+ // Each triplet of these stores the 6 bits of raw3 (resp. raw7), two at a
+ // time, in the upper 2 bits of its three vectors.
+ const VU16 packed0 = OrAnd(raw0, ShiftLeft<2>(raw3), hi2);
+ const VU16 packed1 = OrAnd(raw1, ShiftLeft<4>(raw3), hi2);
+ const VU16 packed2 = OrAnd(raw2, ShiftLeft<6>(raw3), hi2);
+ const VU16 packed3 = OrAnd(raw4, ShiftLeft<2>(raw7), hi2);
+ const VU16 packed4 = OrAnd(raw5, ShiftLeft<4>(raw7), hi2);
+ const VU16 packed5 = OrAnd(raw6, ShiftLeft<6>(raw7), hi2);
+
+ StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
+ StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
+ StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
+ StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
+ StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
+ StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8);
+ }
+
+ template <class D8>
+ HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
+ uint8_t* HWY_RESTRICT raw) const {
+ const RepartitionToWide<decltype(d8)> d16;
+ using VU16 = Vec<decltype(d16)>;
+ const size_t N8 = Lanes(d8);
+ const VU16 mask = Set(d16, 0x3F3Fu); // Lowest 6 bits per byte
+
+ const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
+ const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
+ const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
+ const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
+ const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
+ const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8));
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
+
+ const VU16 raw2 = And(packed2, mask);
+ StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
+
+ const VU16 raw4 = And(packed3, mask);
+ StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
+
+ const VU16 raw5 = And(packed4, mask);
+ StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
+
+ const VU16 raw6 = And(packed5, mask);
+ StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
+
+ // raw3/raw7 are the concatenation of the upper two bits of packed0..2 and
+ // packed3..5, respectively.
+ const VU16 raw3 = Xor3(ShiftRight<6>(AndNot(mask, packed2)),
+ ShiftRight<4>(AndNot(mask, packed1)),
+ ShiftRight<2>(AndNot(mask, packed0)));
+ const VU16 raw7 = Xor3(ShiftRight<6>(AndNot(mask, packed5)),
+ ShiftRight<4>(AndNot(mask, packed4)),
+ ShiftRight<2>(AndNot(mask, packed3)));
+ StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
+ StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
+ }
+}; // Pack8<6>
+
+template <>
+struct Pack8<7> {
+ template <class D8>
+ HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
+ uint8_t* HWY_RESTRICT packed_out) const {
+ const RepartitionToWide<decltype(d8)> d16;
+ using VU16 = Vec<decltype(d16)>;
+ const size_t N8 = Lanes(d8);
+ const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
+ const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
+ const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
+ const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
+ const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
+ const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
+ const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
+ // Inserted into top bit of packed0..6.
+ const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
+
+ const VU16 hi1 = Set(d16, 0x8080u);
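+ // Add(raw7, raw7) is equivalent to ShiftLeft<1>(raw7).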
+ const VU16 packed0 = OrAnd(raw0, Add(raw7, raw7), hi1);
+ const VU16 packed1 = OrAnd(raw1, ShiftLeft<2>(raw7), hi1);
+ const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw7), hi1);
+ const VU16 packed3 = OrAnd(raw3, ShiftLeft<4>(raw7), hi1);
+ const VU16 packed4 = OrAnd(raw4, ShiftLeft<5>(raw7), hi1);
+ const VU16 packed5 = OrAnd(raw5, ShiftLeft<6>(raw7), hi1);
+ const VU16 packed6 = OrAnd(raw6, ShiftLeft<7>(raw7), hi1);
+
+ StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
+ StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
+ StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
+ StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
+ StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
+ StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8);
+ StoreU(BitCast(d8, packed6), d8, packed_out + 6 * N8);
+ }
+
+ template <class D8>
+ HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
+ uint8_t* HWY_RESTRICT raw) const {
+ const RepartitionToWide<decltype(d8)> d16;
+ using VU16 = Vec<decltype(d16)>;
+ const size_t N8 = Lanes(d8);
+
+ const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
+ const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
+ const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
+ const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
+ const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
+ const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8));
+ const VU16 packed6 = BitCast(d16, LoadU(d8, packed_in + 6 * N8));
+
+ const VU16 mask = Set(d16, 0x7F7Fu); // Lowest 7 bits per byte
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
+
+ const VU16 raw2 = And(packed2, mask);
+ StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
+
+ const VU16 raw3 = And(packed3, mask);
+ StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
+
+ const VU16 raw4 = And(packed4, mask);
+ StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
+
+ const VU16 raw5 = And(packed5, mask);
+ StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
+
+ const VU16 raw6 = And(packed6, mask);
+ StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
+
+ const VU16 p0 = Xor3(ShiftRight<7>(AndNot(mask, packed6)),
+ ShiftRight<6>(AndNot(mask, packed5)),
+ ShiftRight<5>(AndNot(mask, packed4)));
+ const VU16 p1 = Xor3(ShiftRight<4>(AndNot(mask, packed3)),
+ ShiftRight<3>(AndNot(mask, packed2)),
+ ShiftRight<2>(AndNot(mask, packed1)));
+ const VU16 raw7 = Xor3(ShiftRight<1>(AndNot(mask, packed0)), p0, p1);
+ StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
+ }
+}; // Pack8<7>
+
+template <>
+struct Pack8<8> {
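+ // kBits == 8 requires no bit manipulation: packing is a plain copy.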
+ template <class D8>
+ HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
+ uint8_t* HWY_RESTRICT packed_out) const {
+ using VU8 = Vec<decltype(d8)>;
+ const size_t N8 = Lanes(d8);
+ const VU8 raw0 = LoadU(d8, raw + 0 * N8);
+ const VU8 raw1 = LoadU(d8, raw + 1 * N8);
+ const VU8 raw2 = LoadU(d8, raw + 2 * N8);
+ const VU8 raw3 = LoadU(d8, raw + 3 * N8);
+ const VU8 raw4 = LoadU(d8, raw + 4 * N8);
+ const VU8 raw5 = LoadU(d8, raw + 5 * N8);
+ const VU8 raw6 = LoadU(d8, raw + 6 * N8);
+ const VU8 raw7 = LoadU(d8, raw + 7 * N8);
+
+ StoreU(raw0, d8, packed_out + 0 * N8);
+ StoreU(raw1, d8, packed_out + 1 * N8);
+ StoreU(raw2, d8, packed_out + 2 * N8);
+ StoreU(raw3, d8, packed_out + 3 * N8);
+ StoreU(raw4, d8, packed_out + 4 * N8);
+ StoreU(raw5, d8, packed_out + 5 * N8);
+ StoreU(raw6, d8, packed_out + 6 * N8);
+ StoreU(raw7, d8, packed_out + 7 * N8);
+ }
+
+ template <class D8>
+ HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
+ uint8_t* HWY_RESTRICT raw) const {
+ using VU8 = Vec<decltype(d8)>;
+ const size_t N8 = Lanes(d8);
+ const VU8 raw0 = LoadU(d8, packed_in + 0 * N8);
+ const VU8 raw1 = LoadU(d8, packed_in + 1 * N8);
+ const VU8 raw2 = LoadU(d8, packed_in + 2 * N8);
+ const VU8 raw3 = LoadU(d8, packed_in + 3 * N8);
+ const VU8 raw4 = LoadU(d8, packed_in + 4 * N8);
+ const VU8 raw5 = LoadU(d8, packed_in + 5 * N8);
+ const VU8 raw6 = LoadU(d8, packed_in + 6 * N8);
+ const VU8 raw7 = LoadU(d8, packed_in + 7 * N8);
+
+ StoreU(raw0, d8, raw + 0 * N8);
+ StoreU(raw1, d8, raw + 1 * N8);
+ StoreU(raw2, d8, raw + 2 * N8);
+ StoreU(raw3, d8, raw + 3 * N8);
+ StoreU(raw4, d8, raw + 4 * N8);
+ StoreU(raw5, d8, raw + 5 * N8);
+ StoreU(raw6, d8, raw + 6 * N8);
+ StoreU(raw7, d8, raw + 7 * N8);
+ }
+}; // Pack8<8>
+
+template <>
+struct Pack16<1> {
+ template <class D>
+ HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+ uint16_t* HWY_RESTRICT packed_out) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 raw0 = LoadU(d, raw + 0 * N);
+ const VU16 raw1 = LoadU(d, raw + 1 * N);
+ const VU16 raw2 = LoadU(d, raw + 2 * N);
+ const VU16 raw3 = LoadU(d, raw + 3 * N);
+ const VU16 raw4 = LoadU(d, raw + 4 * N);
+ const VU16 raw5 = LoadU(d, raw + 5 * N);
+ const VU16 raw6 = LoadU(d, raw + 6 * N);
+ const VU16 raw7 = LoadU(d, raw + 7 * N);
+ const VU16 raw8 = LoadU(d, raw + 8 * N);
+ const VU16 raw9 = LoadU(d, raw + 9 * N);
+ const VU16 rawA = LoadU(d, raw + 0xA * N);
+ const VU16 rawB = LoadU(d, raw + 0xB * N);
+ const VU16 rawC = LoadU(d, raw + 0xC * N);
+ const VU16 rawD = LoadU(d, raw + 0xD * N);
+ const VU16 rawE = LoadU(d, raw + 0xE * N);
+ const VU16 rawF = LoadU(d, raw + 0xF * N);
+
+ const VU16 p0 = Xor3(ShiftLeft<2>(raw2), Add(raw1, raw1), raw0);
+ const VU16 p1 =
+ Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3));
+ const VU16 p2 =
+ Xor3(ShiftLeft<8>(raw8), ShiftLeft<7>(raw7), ShiftLeft<6>(raw6));
+ const VU16 p3 =
+ Xor3(ShiftLeft<0xB>(rawB), ShiftLeft<0xA>(rawA), ShiftLeft<9>(raw9));
+ const VU16 p4 =
+ Xor3(ShiftLeft<0xE>(rawE), ShiftLeft<0xD>(rawD), ShiftLeft<0xC>(rawC));
+ const VU16 packed =
+ Or(Xor3(ShiftLeft<0xF>(rawF), p0, p1), Xor3(p2, p3, p4));
+ StoreU(packed, d, packed_out);
+ }
+
+ template <class D>
+ HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+ uint16_t* HWY_RESTRICT raw) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 mask = Set(d, 1u); // Lowest bit
+
+ const VU16 packed = LoadU(d, packed_in);
+
+ const VU16 raw0 = And(packed, mask);
+ StoreU(raw0, d, raw + 0 * N);
+
+ const VU16 raw1 = And(ShiftRight<1>(packed), mask);
+ StoreU(raw1, d, raw + 1 * N);
+
+ const VU16 raw2 = And(ShiftRight<2>(packed), mask);
+ StoreU(raw2, d, raw + 2 * N);
+
+ const VU16 raw3 = And(ShiftRight<3>(packed), mask);
+ StoreU(raw3, d, raw + 3 * N);
+
+ const VU16 raw4 = And(ShiftRight<4>(packed), mask);
+ StoreU(raw4, d, raw + 4 * N);
+
+ const VU16 raw5 = And(ShiftRight<5>(packed), mask);
+ StoreU(raw5, d, raw + 5 * N);
+
+ const VU16 raw6 = And(ShiftRight<6>(packed), mask);
+ StoreU(raw6, d, raw + 6 * N);
+
+ const VU16 raw7 = And(ShiftRight<7>(packed), mask);
+ StoreU(raw7, d, raw + 7 * N);
+
+ const VU16 raw8 = And(ShiftRight<8>(packed), mask);
+ StoreU(raw8, d, raw + 8 * N);
+
+ const VU16 raw9 = And(ShiftRight<9>(packed), mask);
+ StoreU(raw9, d, raw + 9 * N);
+
+ const VU16 rawA = And(ShiftRight<0xA>(packed), mask);
+ StoreU(rawA, d, raw + 0xA * N);
+
+ const VU16 rawB = And(ShiftRight<0xB>(packed), mask);
+ StoreU(rawB, d, raw + 0xB * N);
+
+ const VU16 rawC = And(ShiftRight<0xC>(packed), mask);
+ StoreU(rawC, d, raw + 0xC * N);
+
+ const VU16 rawD = And(ShiftRight<0xD>(packed), mask);
+ StoreU(rawD, d, raw + 0xD * N);
+
+ const VU16 rawE = And(ShiftRight<0xE>(packed), mask);
+ StoreU(rawE, d, raw + 0xE * N);
+
+ const VU16 rawF = ShiftRight<0xF>(packed);
+ StoreU(rawF, d, raw + 0xF * N);
+ }
+}; // Pack16<1>
+
+template <>
+struct Pack16<2> {
+ template <class D>
+ HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+ uint16_t* HWY_RESTRICT packed_out) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 raw0 = LoadU(d, raw + 0 * N);
+ const VU16 raw1 = LoadU(d, raw + 1 * N);
+ const VU16 raw2 = LoadU(d, raw + 2 * N);
+ const VU16 raw3 = LoadU(d, raw + 3 * N);
+ const VU16 raw4 = LoadU(d, raw + 4 * N);
+ const VU16 raw5 = LoadU(d, raw + 5 * N);
+ const VU16 raw6 = LoadU(d, raw + 6 * N);
+ const VU16 raw7 = LoadU(d, raw + 7 * N);
+ const VU16 raw8 = LoadU(d, raw + 8 * N);
+ const VU16 raw9 = LoadU(d, raw + 9 * N);
+ const VU16 rawA = LoadU(d, raw + 0xA * N);
+ const VU16 rawB = LoadU(d, raw + 0xB * N);
+ const VU16 rawC = LoadU(d, raw + 0xC * N);
+ const VU16 rawD = LoadU(d, raw + 0xD * N);
+ const VU16 rawE = LoadU(d, raw + 0xE * N);
+ const VU16 rawF = LoadU(d, raw + 0xF * N);
+
+ VU16 packed0 = Xor3(ShiftLeft<4>(raw4), ShiftLeft<2>(raw2), raw0);
+ VU16 packed1 = Xor3(ShiftLeft<4>(raw5), ShiftLeft<2>(raw3), raw1);
+ packed0 = Xor3(packed0, ShiftLeft<8>(raw8), ShiftLeft<6>(raw6));
+ packed1 = Xor3(packed1, ShiftLeft<8>(raw9), ShiftLeft<6>(raw7));
+
+ packed0 = Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<10>(rawA));
+ packed1 = Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<10>(rawB));
+
+ packed0 = Or(packed0, ShiftLeft<14>(rawE));
+ packed1 = Or(packed1, ShiftLeft<14>(rawF));
+ StoreU(packed0, d, packed_out + 0 * N);
+ StoreU(packed1, d, packed_out + 1 * N);
+ }
+
+ template <class D>
+ HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+ uint16_t* HWY_RESTRICT raw) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 mask = Set(d, 0x3u); // Lowest 2 bits
+
+ const VU16 packed0 = LoadU(d, packed_in + 0 * N);
+ const VU16 packed1 = LoadU(d, packed_in + 1 * N);
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(raw0, d, raw + 0 * N);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(raw1, d, raw + 1 * N);
+
+ const VU16 raw2 = And(ShiftRight<2>(packed0), mask);
+ StoreU(raw2, d, raw + 2 * N);
+
+ const VU16 raw3 = And(ShiftRight<2>(packed1), mask);
+ StoreU(raw3, d, raw + 3 * N);
+
+ const VU16 raw4 = And(ShiftRight<4>(packed0), mask);
+ StoreU(raw4, d, raw + 4 * N);
+
+ const VU16 raw5 = And(ShiftRight<4>(packed1), mask);
+ StoreU(raw5, d, raw + 5 * N);
+
+ const VU16 raw6 = And(ShiftRight<6>(packed0), mask);
+ StoreU(raw6, d, raw + 6 * N);
+
+ const VU16 raw7 = And(ShiftRight<6>(packed1), mask);
+ StoreU(raw7, d, raw + 7 * N);
+
+ const VU16 raw8 = And(ShiftRight<8>(packed0), mask);
+ StoreU(raw8, d, raw + 8 * N);
+
+ const VU16 raw9 = And(ShiftRight<8>(packed1), mask);
+ StoreU(raw9, d, raw + 9 * N);
+
+ const VU16 rawA = And(ShiftRight<0xA>(packed0), mask);
+ StoreU(rawA, d, raw + 0xA * N);
+
+ const VU16 rawB = And(ShiftRight<0xA>(packed1), mask);
+ StoreU(rawB, d, raw + 0xB * N);
+
+ const VU16 rawC = And(ShiftRight<0xC>(packed0), mask);
+ StoreU(rawC, d, raw + 0xC * N);
+
+ const VU16 rawD = And(ShiftRight<0xC>(packed1), mask);
+ StoreU(rawD, d, raw + 0xD * N);
+
+ const VU16 rawE = ShiftRight<0xE>(packed0);
+ StoreU(rawE, d, raw + 0xE * N);
+
+ const VU16 rawF = ShiftRight<0xE>(packed1);
+ StoreU(rawF, d, raw + 0xF * N);
+ }
+}; // Pack16<2>
+
+template <>
+struct Pack16<3> {
+ template <class D>
+ HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+ uint16_t* HWY_RESTRICT packed_out) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 raw0 = LoadU(d, raw + 0 * N);
+ const VU16 raw1 = LoadU(d, raw + 1 * N);
+ const VU16 raw2 = LoadU(d, raw + 2 * N);
+ const VU16 raw3 = LoadU(d, raw + 3 * N);
+ const VU16 raw4 = LoadU(d, raw + 4 * N);
+ const VU16 raw5 = LoadU(d, raw + 5 * N);
+ const VU16 raw6 = LoadU(d, raw + 6 * N);
+ const VU16 raw7 = LoadU(d, raw + 7 * N);
+ const VU16 raw8 = LoadU(d, raw + 8 * N);
+ const VU16 raw9 = LoadU(d, raw + 9 * N);
+ const VU16 rawA = LoadU(d, raw + 0xA * N);
+ const VU16 rawB = LoadU(d, raw + 0xB * N);
+ const VU16 rawC = LoadU(d, raw + 0xC * N);
+ const VU16 rawD = LoadU(d, raw + 0xD * N);
+ const VU16 rawE = LoadU(d, raw + 0xE * N);
+ const VU16 rawF = LoadU(d, raw + 0xF * N);
+
+ // We can fit 15 raw vectors in three packed vectors (five each).
+ VU16 packed0 = Xor3(ShiftLeft<6>(raw6), ShiftLeft<3>(raw3), raw0);
+ VU16 packed1 = Xor3(ShiftLeft<6>(raw7), ShiftLeft<3>(raw4), raw1);
+ VU16 packed2 = Xor3(ShiftLeft<6>(raw8), ShiftLeft<3>(raw5), raw2);
+
+ // rawF will be scattered into the upper bit of these three.
+ packed0 = Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<9>(raw9));
+ packed1 = Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<9>(rawA));
+ packed2 = Xor3(packed2, ShiftLeft<12>(rawE), ShiftLeft<9>(rawB));
+
+ const VU16 hi1 = Set(d, 0x8000u);
+ packed0 = Or(packed0, ShiftLeft<15>(rawF)); // MSB only, no mask
+ packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1);
+ packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1);
+ StoreU(packed0, d, packed_out + 0 * N);
+ StoreU(packed1, d, packed_out + 1 * N);
+ StoreU(packed2, d, packed_out + 2 * N);
+ }
+
+ template <class D>
+ HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+ uint16_t* HWY_RESTRICT raw) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 mask = Set(d, 0x7u); // Lowest 3 bits
+
+ const VU16 packed0 = LoadU(d, packed_in + 0 * N);
+ const VU16 packed1 = LoadU(d, packed_in + 1 * N);
+ const VU16 packed2 = LoadU(d, packed_in + 2 * N);
+
+ const VU16 raw0 = And(mask, packed0);
+ StoreU(raw0, d, raw + 0 * N);
+
+ const VU16 raw1 = And(mask, packed1);
+ StoreU(raw1, d, raw + 1 * N);
+
+ const VU16 raw2 = And(mask, packed2);
+ StoreU(raw2, d, raw + 2 * N);
+
+ const VU16 raw3 = And(mask, ShiftRight<3>(packed0));
+ StoreU(raw3, d, raw + 3 * N);
+
+ const VU16 raw4 = And(mask, ShiftRight<3>(packed1));
+ StoreU(raw4, d, raw + 4 * N);
+
+ const VU16 raw5 = And(mask, ShiftRight<3>(packed2));
+ StoreU(raw5, d, raw + 5 * N);
+
+ const VU16 raw6 = And(mask, ShiftRight<6>(packed0));
+ StoreU(raw6, d, raw + 6 * N);
+
+ const VU16 raw7 = And(mask, ShiftRight<6>(packed1));
+ StoreU(raw7, d, raw + 7 * N);
+
+ const VU16 raw8 = And(mask, ShiftRight<6>(packed2));
+ StoreU(raw8, d, raw + 8 * N);
+
+ const VU16 raw9 = And(mask, ShiftRight<9>(packed0));
+ StoreU(raw9, d, raw + 9 * N);
+
+ const VU16 rawA = And(mask, ShiftRight<9>(packed1));
+ StoreU(rawA, d, raw + 0xA * N);
+
+ const VU16 rawB = And(mask, ShiftRight<9>(packed2));
+ StoreU(rawB, d, raw + 0xB * N);
+
+ const VU16 rawC = And(mask, ShiftRight<12>(packed0));
+ StoreU(rawC, d, raw + 0xC * N);
+
+ const VU16 rawD = And(mask, ShiftRight<12>(packed1));
+ StoreU(rawD, d, raw + 0xD * N);
+
+ const VU16 rawE = And(mask, ShiftRight<12>(packed2));
+ StoreU(rawE, d, raw + 0xE * N);
+
+ // rawF is the concatenation of the upper bit of packed0..2.
+ const VU16 down0 = ShiftRight<15>(packed0);
+ const VU16 down1 = ShiftRight<15>(packed1);
+ const VU16 down2 = ShiftRight<15>(packed2);
+ const VU16 rawF = Xor3(ShiftLeft<2>(down2), Add(down1, down1), down0);
+ StoreU(rawF, d, raw + 0xF * N);
+ }
+}; // Pack16<3>
+
+template <>
+struct Pack16<4> {
+ template <class D>
+ HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+ uint16_t* HWY_RESTRICT packed_out) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 raw0 = LoadU(d, raw + 0 * N);
+ const VU16 raw1 = LoadU(d, raw + 1 * N);
+ const VU16 raw2 = LoadU(d, raw + 2 * N);
+ const VU16 raw3 = LoadU(d, raw + 3 * N);
+ const VU16 raw4 = LoadU(d, raw + 4 * N);
+ const VU16 raw5 = LoadU(d, raw + 5 * N);
+ const VU16 raw6 = LoadU(d, raw + 6 * N);
+ const VU16 raw7 = LoadU(d, raw + 7 * N);
+ const VU16 raw8 = LoadU(d, raw + 8 * N);
+ const VU16 raw9 = LoadU(d, raw + 9 * N);
+ const VU16 rawA = LoadU(d, raw + 0xA * N);
+ const VU16 rawB = LoadU(d, raw + 0xB * N);
+ const VU16 rawC = LoadU(d, raw + 0xC * N);
+ const VU16 rawD = LoadU(d, raw + 0xD * N);
+ const VU16 rawE = LoadU(d, raw + 0xE * N);
+ const VU16 rawF = LoadU(d, raw + 0xF * N);
+
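+ // Nibble layout per lane: packed0 = [raw6|raw4|raw2|raw0] and
+ // packed1 = [raw7|raw5|raw3|raw1]; likewise for raw8..F in packed2/3.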
+ VU16 packed0 = Xor3(ShiftLeft<8>(raw4), ShiftLeft<4>(raw2), raw0);
+ VU16 packed1 = Xor3(ShiftLeft<8>(raw5), ShiftLeft<4>(raw3), raw1);
+ packed0 = Or(packed0, ShiftLeft<12>(raw6));
+ packed1 = Or(packed1, ShiftLeft<12>(raw7));
+ VU16 packed2 = Xor3(ShiftLeft<8>(rawC), ShiftLeft<4>(rawA), raw8);
+ VU16 packed3 = Xor3(ShiftLeft<8>(rawD), ShiftLeft<4>(rawB), raw9);
+ packed2 = Or(packed2, ShiftLeft<12>(rawE));
+ packed3 = Or(packed3, ShiftLeft<12>(rawF));
+
+ StoreU(packed0, d, packed_out + 0 * N);
+ StoreU(packed1, d, packed_out + 1 * N);
+ StoreU(packed2, d, packed_out + 2 * N);
+ StoreU(packed3, d, packed_out + 3 * N);
+ }
+
+ template <class D>
+ HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+ uint16_t* HWY_RESTRICT raw) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 mask = Set(d, 0xFu); // Lowest 4 bits
+
+ const VU16 packed0 = LoadU(d, packed_in + 0 * N);
+ const VU16 packed1 = LoadU(d, packed_in + 1 * N);
+ const VU16 packed2 = LoadU(d, packed_in + 2 * N);
+ const VU16 packed3 = LoadU(d, packed_in + 3 * N);
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(raw0, d, raw + 0 * N);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(raw1, d, raw + 1 * N);
+
+ const VU16 raw2 = And(ShiftRight<4>(packed0), mask);
+ StoreU(raw2, d, raw + 2 * N);
+
+ const VU16 raw3 = And(ShiftRight<4>(packed1), mask);
+ StoreU(raw3, d, raw + 3 * N);
+
+ const VU16 raw4 = And(ShiftRight<8>(packed0), mask);
+ StoreU(raw4, d, raw + 4 * N);
+
+ const VU16 raw5 = And(ShiftRight<8>(packed1), mask);
+ StoreU(raw5, d, raw + 5 * N);
+
+ const VU16 raw6 = ShiftRight<12>(packed0); // no mask required
+ StoreU(raw6, d, raw + 6 * N);
+
+ const VU16 raw7 = ShiftRight<12>(packed1); // no mask required
+ StoreU(raw7, d, raw + 7 * N);
+
+ const VU16 raw8 = And(packed2, mask);
+ StoreU(raw8, d, raw + 8 * N);
+
+ const VU16 raw9 = And(packed3, mask);
+ StoreU(raw9, d, raw + 9 * N);
+
+ const VU16 rawA = And(ShiftRight<4>(packed2), mask);
+ StoreU(rawA, d, raw + 0xA * N);
+
+ const VU16 rawB = And(ShiftRight<4>(packed3), mask);
+ StoreU(rawB, d, raw + 0xB * N);
+
+ const VU16 rawC = And(ShiftRight<8>(packed2), mask);
+ StoreU(rawC, d, raw + 0xC * N);
+
+ const VU16 rawD = And(ShiftRight<8>(packed3), mask);
+ StoreU(rawD, d, raw + 0xD * N);
+
+ const VU16 rawE = ShiftRight<12>(packed2); // no mask required
+ StoreU(rawE, d, raw + 0xE * N);
+
+ const VU16 rawF = ShiftRight<12>(packed3); // no mask required
+ StoreU(rawF, d, raw + 0xF * N);
+ }
+}; // Pack16<4>
+
+template <>
+struct Pack16<5> {
+ template <class D>
+ HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+ uint16_t* HWY_RESTRICT packed_out) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 raw0 = LoadU(d, raw + 0 * N);
+ const VU16 raw1 = LoadU(d, raw + 1 * N);
+ const VU16 raw2 = LoadU(d, raw + 2 * N);
+ const VU16 raw3 = LoadU(d, raw + 3 * N);
+ const VU16 raw4 = LoadU(d, raw + 4 * N);
+ const VU16 raw5 = LoadU(d, raw + 5 * N);
+ const VU16 raw6 = LoadU(d, raw + 6 * N);
+ const VU16 raw7 = LoadU(d, raw + 7 * N);
+ const VU16 raw8 = LoadU(d, raw + 8 * N);
+ const VU16 raw9 = LoadU(d, raw + 9 * N);
+ const VU16 rawA = LoadU(d, raw + 0xA * N);
+ const VU16 rawB = LoadU(d, raw + 0xB * N);
+ const VU16 rawC = LoadU(d, raw + 0xC * N);
+ const VU16 rawD = LoadU(d, raw + 0xD * N);
+ const VU16 rawE = LoadU(d, raw + 0xE * N);
+ const VU16 rawF = LoadU(d, raw + 0xF * N);
+
+ // We can fit 15 raw vectors in five packed vectors (three each).
+ VU16 packed0 = Xor3(ShiftLeft<10>(rawA), ShiftLeft<5>(raw5), raw0);
+ VU16 packed1 = Xor3(ShiftLeft<10>(rawB), ShiftLeft<5>(raw6), raw1);
+ VU16 packed2 = Xor3(ShiftLeft<10>(rawC), ShiftLeft<5>(raw7), raw2);
+ VU16 packed3 = Xor3(ShiftLeft<10>(rawD), ShiftLeft<5>(raw8), raw3);
+ VU16 packed4 = Xor3(ShiftLeft<10>(rawE), ShiftLeft<5>(raw9), raw4);
+
+ // rawF will be scattered into the upper bits of these five.
+ const VU16 hi1 = Set(d, 0x8000u);
+ packed0 = Or(packed0, ShiftLeft<15>(rawF)); // MSB only, no mask
+ packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1);
+ packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1);
+ packed3 = OrAnd(packed3, ShiftLeft<12>(rawF), hi1);
+ packed4 = OrAnd(packed4, ShiftLeft<11>(rawF), hi1);
+
+ StoreU(packed0, d, packed_out + 0 * N);
+ StoreU(packed1, d, packed_out + 1 * N);
+ StoreU(packed2, d, packed_out + 2 * N);
+ StoreU(packed3, d, packed_out + 3 * N);
+ StoreU(packed4, d, packed_out + 4 * N);
+ }
+
+ template <class D>
+ HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+ uint16_t* HWY_RESTRICT raw) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+
+ const VU16 packed0 = LoadU(d, packed_in + 0 * N);
+ const VU16 packed1 = LoadU(d, packed_in + 1 * N);
+ const VU16 packed2 = LoadU(d, packed_in + 2 * N);
+ const VU16 packed3 = LoadU(d, packed_in + 3 * N);
+ const VU16 packed4 = LoadU(d, packed_in + 4 * N);
+
+ const VU16 mask = Set(d, 0x1Fu); // Lowest 5 bits
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(raw0, d, raw + 0 * N);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(raw1, d, raw + 1 * N);
+
+ const VU16 raw2 = And(packed2, mask);
+ StoreU(raw2, d, raw + 2 * N);
+
+ const VU16 raw3 = And(packed3, mask);
+ StoreU(raw3, d, raw + 3 * N);
+
+ const VU16 raw4 = And(packed4, mask);
+ StoreU(raw4, d, raw + 4 * N);
+
+ const VU16 raw5 = And(ShiftRight<5>(packed0), mask);
+ StoreU(raw5, d, raw + 5 * N);
+
+ const VU16 raw6 = And(ShiftRight<5>(packed1), mask);
+ StoreU(raw6, d, raw + 6 * N);
+
+ const VU16 raw7 = And(ShiftRight<5>(packed2), mask);
+ StoreU(raw7, d, raw + 7 * N);
+
+ const VU16 raw8 = And(ShiftRight<5>(packed3), mask);
+ StoreU(raw8, d, raw + 8 * N);
+
+ const VU16 raw9 = And(ShiftRight<5>(packed4), mask);
+ StoreU(raw9, d, raw + 9 * N);
+
+ const VU16 rawA = And(ShiftRight<10>(packed0), mask);
+ StoreU(rawA, d, raw + 0xA * N);
+
+ const VU16 rawB = And(ShiftRight<10>(packed1), mask);
+ StoreU(rawB, d, raw + 0xB * N);
+
+ const VU16 rawC = And(ShiftRight<10>(packed2), mask);
+ StoreU(rawC, d, raw + 0xC * N);
+
+ const VU16 rawD = And(ShiftRight<10>(packed3), mask);
+ StoreU(rawD, d, raw + 0xD * N);
+
+ const VU16 rawE = And(ShiftRight<10>(packed4), mask);
+ StoreU(rawE, d, raw + 0xE * N);
+
+ // rawF is the concatenation of the upper bit of packed0..4.
+ const VU16 down0 = ShiftRight<15>(packed0);
+ const VU16 down1 = ShiftRight<15>(packed1);
+ const VU16 hi1 = Set(d, 0x8000u);
+ const VU16 p0 =
+ Xor3(ShiftRight<13>(And(packed2, hi1)), Add(down1, down1), down0);
+ const VU16 rawF = Xor3(ShiftRight<11>(And(packed4, hi1)),
+ ShiftRight<12>(And(packed3, hi1)), p0);
+ StoreU(rawF, d, raw + 0xF * N);
+ }
+}; // Pack16<5>
+
+template <>
+struct Pack16<6> {
+ template <class D>
+ HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+ uint16_t* HWY_RESTRICT packed_out) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 raw0 = LoadU(d, raw + 0 * N);
+ const VU16 raw1 = LoadU(d, raw + 1 * N);
+ const VU16 raw2 = LoadU(d, raw + 2 * N);
+ const VU16 raw3 = LoadU(d, raw + 3 * N);
+ const VU16 raw4 = LoadU(d, raw + 4 * N);
+ const VU16 raw5 = LoadU(d, raw + 5 * N);
+ const VU16 raw6 = LoadU(d, raw + 6 * N);
+ const VU16 raw7 = LoadU(d, raw + 7 * N);
+ const VU16 raw8 = LoadU(d, raw + 8 * N);
+ const VU16 raw9 = LoadU(d, raw + 9 * N);
+ const VU16 rawA = LoadU(d, raw + 0xA * N);
+ const VU16 rawB = LoadU(d, raw + 0xB * N);
+ const VU16 rawC = LoadU(d, raw + 0xC * N);
+ const VU16 rawD = LoadU(d, raw + 0xD * N);
+ const VU16 rawE = LoadU(d, raw + 0xE * N);
+ const VU16 rawF = LoadU(d, raw + 0xF * N);
+
+ const VU16 packed3 = Or(ShiftLeft<6>(raw7), raw3);
+ const VU16 packed7 = Or(ShiftLeft<6>(rawF), rawB);
+ // Three vectors, two 6-bit raw each; packed3 (12 bits) is spread over the
+ // four remainder bits at the top of each vector.
+ const VU16 packed0 = Xor3(ShiftLeft<12>(packed3), ShiftLeft<6>(raw4), raw0);
+ VU16 packed1 = Or(ShiftLeft<6>(raw5), raw1);
+ VU16 packed2 = Or(ShiftLeft<6>(raw6), raw2);
+ const VU16 packed4 = Xor3(ShiftLeft<12>(packed7), ShiftLeft<6>(rawC), raw8);
+ VU16 packed5 = Or(ShiftLeft<6>(rawD), raw9);
+ VU16 packed6 = Or(ShiftLeft<6>(rawE), rawA);
+
+ const VU16 hi4 = Set(d, 0xF000u);
+ packed1 = OrAnd(packed1, ShiftLeft<8>(packed3), hi4);
+ packed2 = OrAnd(packed2, ShiftLeft<4>(packed3), hi4);
+ packed5 = OrAnd(packed5, ShiftLeft<8>(packed7), hi4);
+ packed6 = OrAnd(packed6, ShiftLeft<4>(packed7), hi4);
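+ // All 12 bits of packed3/packed7 are now scattered, so only six vectors
+ // are stored.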
+
+ StoreU(packed0, d, packed_out + 0 * N);
+ StoreU(packed1, d, packed_out + 1 * N);
+ StoreU(packed2, d, packed_out + 2 * N);
+ StoreU(packed4, d, packed_out + 3 * N);
+ StoreU(packed5, d, packed_out + 4 * N);
+ StoreU(packed6, d, packed_out + 5 * N);
+ }
+
+ template <class D>
+ HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+ uint16_t* HWY_RESTRICT raw) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 mask = Set(d, 0x3Fu); // Lowest 6 bits
+
+ const VU16 packed0 = LoadU(d, packed_in + 0 * N);
+ const VU16 packed1 = LoadU(d, packed_in + 1 * N);
+ const VU16 packed2 = LoadU(d, packed_in + 2 * N);
+ const VU16 packed4 = LoadU(d, packed_in + 3 * N);
+ const VU16 packed5 = LoadU(d, packed_in + 4 * N);
+ const VU16 packed6 = LoadU(d, packed_in + 5 * N);
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(raw0, d, raw + 0 * N);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(raw1, d, raw + 1 * N);
+
+ const VU16 raw2 = And(packed2, mask);
+ StoreU(raw2, d, raw + 2 * N);
+
+ const VU16 raw4 = And(ShiftRight<6>(packed0), mask);
+ StoreU(raw4, d, raw + 4 * N);
+
+ const VU16 raw5 = And(ShiftRight<6>(packed1), mask);
+ StoreU(raw5, d, raw + 5 * N);
+
+ const VU16 raw6 = And(ShiftRight<6>(packed2), mask);
+ StoreU(raw6, d, raw + 6 * N);
+
+ const VU16 raw8 = And(packed4, mask);
+ StoreU(raw8, d, raw + 8 * N);
+
+ const VU16 raw9 = And(packed5, mask);
+ StoreU(raw9, d, raw + 9 * N);
+
+ const VU16 rawA = And(packed6, mask);
+ StoreU(rawA, d, raw + 0xA * N);
+
+ const VU16 rawC = And(ShiftRight<6>(packed4), mask);
+ StoreU(rawC, d, raw + 0xC * N);
+
+ const VU16 rawD = And(ShiftRight<6>(packed5), mask);
+ StoreU(rawD, d, raw + 0xD * N);
+
+ const VU16 rawE = And(ShiftRight<6>(packed6), mask);
+ StoreU(rawE, d, raw + 0xE * N);
+
+ // packed3/packed7 are the concatenation of the four upper bits of
+ // packed0..2 and packed4..6, respectively.
+ const VU16 down0 = ShiftRight<12>(packed0);
+ const VU16 down4 = ShiftRight<12>(packed4);
+ const VU16 hi4 = Set(d, 0xF000u);
+ const VU16 packed3 = Xor3(ShiftRight<4>(And(packed2, hi4)),
+ ShiftRight<8>(And(packed1, hi4)), down0);
+ const VU16 packed7 = Xor3(ShiftRight<4>(And(packed6, hi4)),
+ ShiftRight<8>(And(packed5, hi4)), down4);
+ const VU16 raw3 = And(packed3, mask);
+ StoreU(raw3, d, raw + 3 * N);
+
+ const VU16 rawB = And(packed7, mask);
+ StoreU(rawB, d, raw + 0xB * N);
+
+ const VU16 raw7 = ShiftRight<6>(packed3); // upper bits already zero
+ StoreU(raw7, d, raw + 7 * N);
+
+ const VU16 rawF = ShiftRight<6>(packed7); // upper bits already zero
+ StoreU(rawF, d, raw + 0xF * N);
+ }
+}; // Pack16<6>
+
+template <>
+struct Pack16<7> {
+ template <class D>
+ HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+ uint16_t* HWY_RESTRICT packed_out) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 raw0 = LoadU(d, raw + 0 * N);
+ const VU16 raw1 = LoadU(d, raw + 1 * N);
+ const VU16 raw2 = LoadU(d, raw + 2 * N);
+ const VU16 raw3 = LoadU(d, raw + 3 * N);
+ const VU16 raw4 = LoadU(d, raw + 4 * N);
+ const VU16 raw5 = LoadU(d, raw + 5 * N);
+ const VU16 raw6 = LoadU(d, raw + 6 * N);
+ const VU16 raw7 = LoadU(d, raw + 7 * N);
+ const VU16 raw8 = LoadU(d, raw + 8 * N);
+ const VU16 raw9 = LoadU(d, raw + 9 * N);
+ const VU16 rawA = LoadU(d, raw + 0xA * N);
+ const VU16 rawB = LoadU(d, raw + 0xB * N);
+ const VU16 rawC = LoadU(d, raw + 0xC * N);
+ const VU16 rawD = LoadU(d, raw + 0xD * N);
+ const VU16 rawE = LoadU(d, raw + 0xE * N);
+ const VU16 rawF = LoadU(d, raw + 0xF * N);
+
+ const VU16 packed7 = Or(ShiftLeft<7>(rawF), raw7);
+ // Seven vectors, two 7-bit raw each; packed7 (14 bits) is spread over the
+ // two remainder bits at the top of each vector.
+ const VU16 packed0 = Xor3(ShiftLeft<14>(packed7), ShiftLeft<7>(raw8), raw0);
+ VU16 packed1 = Or(ShiftLeft<7>(raw9), raw1);
+ VU16 packed2 = Or(ShiftLeft<7>(rawA), raw2);
+ VU16 packed3 = Or(ShiftLeft<7>(rawB), raw3);
+ VU16 packed4 = Or(ShiftLeft<7>(rawC), raw4);
+ VU16 packed5 = Or(ShiftLeft<7>(rawD), raw5);
+ VU16 packed6 = Or(ShiftLeft<7>(rawE), raw6);
+
+ const VU16 hi2 = Set(d, 0xC000u);
+ packed1 = OrAnd(packed1, ShiftLeft<12>(packed7), hi2);
+ packed2 = OrAnd(packed2, ShiftLeft<10>(packed7), hi2);
+ packed3 = OrAnd(packed3, ShiftLeft<8>(packed7), hi2);
+ packed4 = OrAnd(packed4, ShiftLeft<6>(packed7), hi2);
+ packed5 = OrAnd(packed5, ShiftLeft<4>(packed7), hi2);
+ packed6 = OrAnd(packed6, ShiftLeft<2>(packed7), hi2);
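+ // All 14 bits of packed7 are now scattered, so only seven vectors are
+ // stored.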
+
+ StoreU(packed0, d, packed_out + 0 * N);
+ StoreU(packed1, d, packed_out + 1 * N);
+ StoreU(packed2, d, packed_out + 2 * N);
+ StoreU(packed3, d, packed_out + 3 * N);
+ StoreU(packed4, d, packed_out + 4 * N);
+ StoreU(packed5, d, packed_out + 5 * N);
+ StoreU(packed6, d, packed_out + 6 * N);
+ }
+
+ template <class D>
+ HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+ uint16_t* HWY_RESTRICT raw) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+
+ const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
+ const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
+ const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
+ const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
+ const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
+ const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
+ const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
+
+ const VU16 mask = Set(d, 0x7Fu); // Lowest 7 bits
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(raw0, d, raw + 0 * N);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(raw1, d, raw + 1 * N);
+
+ const VU16 raw2 = And(packed2, mask);
+ StoreU(raw2, d, raw + 2 * N);
+
+ const VU16 raw3 = And(packed3, mask);
+ StoreU(raw3, d, raw + 3 * N);
+
+ const VU16 raw4 = And(packed4, mask);
+ StoreU(raw4, d, raw + 4 * N);
+
+ const VU16 raw5 = And(packed5, mask);
+ StoreU(raw5, d, raw + 5 * N);
+
+ const VU16 raw6 = And(packed6, mask);
+ StoreU(raw6, d, raw + 6 * N);
+
+ const VU16 raw8 = And(ShiftRight<7>(packed0), mask);
+ StoreU(raw8, d, raw + 8 * N);
+
+ const VU16 raw9 = And(ShiftRight<7>(packed1), mask);
+ StoreU(raw9, d, raw + 9 * N);
+
+ const VU16 rawA = And(ShiftRight<7>(packed2), mask);
+ StoreU(rawA, d, raw + 0xA * N);
+
+ const VU16 rawB = And(ShiftRight<7>(packed3), mask);
+ StoreU(rawB, d, raw + 0xB * N);
+
+ const VU16 rawC = And(ShiftRight<7>(packed4), mask);
+ StoreU(rawC, d, raw + 0xC * N);
+
+ const VU16 rawD = And(ShiftRight<7>(packed5), mask);
+ StoreU(rawD, d, raw + 0xD * N);
+
+ const VU16 rawE = And(ShiftRight<7>(packed6), mask);
+ StoreU(rawE, d, raw + 0xE * N);
+
+ // packed7 is the concatenation of the two upper bits in packed0..6.
+ const VU16 down0 = ShiftRight<14>(packed0);
+ const VU16 hi2 = Set(d, 0xC000u);
+ const VU16 p0 = Xor3(ShiftRight<12>(And(packed1, hi2)),
+ ShiftRight<10>(And(packed2, hi2)), down0);
+ const VU16 p1 = Xor3(ShiftRight<8>(And(packed3, hi2)), //
+ ShiftRight<6>(And(packed4, hi2)),
+ ShiftRight<4>(And(packed5, hi2)));
+ const VU16 packed7 = Xor3(ShiftRight<2>(And(packed6, hi2)), p1, p0);
+
+ const VU16 raw7 = And(packed7, mask);
+ StoreU(raw7, d, raw + 7 * N);
+
+ const VU16 rawF = ShiftRight<7>(packed7); // upper bits already zero
+ StoreU(rawF, d, raw + 0xF * N);
+ }
+}; // Pack16<7>
+
+template <>
+struct Pack16<8> {
+ template <class D>
+ HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+ uint16_t* HWY_RESTRICT packed_out) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 raw0 = LoadU(d, raw + 0 * N);
+ const VU16 raw1 = LoadU(d, raw + 1 * N);
+ const VU16 raw2 = LoadU(d, raw + 2 * N);
+ const VU16 raw3 = LoadU(d, raw + 3 * N);
+ const VU16 raw4 = LoadU(d, raw + 4 * N);
+ const VU16 raw5 = LoadU(d, raw + 5 * N);
+ const VU16 raw6 = LoadU(d, raw + 6 * N);
+ const VU16 raw7 = LoadU(d, raw + 7 * N);
+ const VU16 raw8 = LoadU(d, raw + 8 * N);
+ const VU16 raw9 = LoadU(d, raw + 9 * N);
+ const VU16 rawA = LoadU(d, raw + 0xA * N);
+ const VU16 rawB = LoadU(d, raw + 0xB * N);
+ const VU16 rawC = LoadU(d, raw + 0xC * N);
+ const VU16 rawD = LoadU(d, raw + 0xD * N);
+ const VU16 rawE = LoadU(d, raw + 0xE * N);
+ const VU16 rawF = LoadU(d, raw + 0xF * N);
+
+ // This is equivalent to ConcatEven with 8-bit lanes, but much more
+ // efficient on RVV and slightly less efficient on SVE2.
+ const VU16 packed0 = Or(ShiftLeft<8>(raw2), raw0);
+ const VU16 packed1 = Or(ShiftLeft<8>(raw3), raw1);
+ const VU16 packed2 = Or(ShiftLeft<8>(raw6), raw4);
+ const VU16 packed3 = Or(ShiftLeft<8>(raw7), raw5);
+ const VU16 packed4 = Or(ShiftLeft<8>(rawA), raw8);
+ const VU16 packed5 = Or(ShiftLeft<8>(rawB), raw9);
+ const VU16 packed6 = Or(ShiftLeft<8>(rawE), rawC);
+ const VU16 packed7 = Or(ShiftLeft<8>(rawF), rawD);
+
+ StoreU(packed0, d, packed_out + 0 * N);
+ StoreU(packed1, d, packed_out + 1 * N);
+ StoreU(packed2, d, packed_out + 2 * N);
+ StoreU(packed3, d, packed_out + 3 * N);
+ StoreU(packed4, d, packed_out + 4 * N);
+ StoreU(packed5, d, packed_out + 5 * N);
+ StoreU(packed6, d, packed_out + 6 * N);
+ StoreU(packed7, d, packed_out + 7 * N);
+ }
+
+ template <class D>
+ HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+ uint16_t* HWY_RESTRICT raw) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+
+ const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
+ const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
+ const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
+ const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
+ const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
+ const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
+ const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
+ const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
+ const VU16 mask = Set(d, 0xFFu); // Lowest 8 bits
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(raw0, d, raw + 0 * N);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(raw1, d, raw + 1 * N);
+
+ const VU16 raw2 = ShiftRight<8>(packed0); // upper bits already zero
+ StoreU(raw2, d, raw + 2 * N);
+
+ const VU16 raw3 = ShiftRight<8>(packed1); // upper bits already zero
+ StoreU(raw3, d, raw + 3 * N);
+
+ const VU16 raw4 = And(packed2, mask);
+ StoreU(raw4, d, raw + 4 * N);
+
+ const VU16 raw5 = And(packed3, mask);
+ StoreU(raw5, d, raw + 5 * N);
+
+ const VU16 raw6 = ShiftRight<8>(packed2); // upper bits already zero
+ StoreU(raw6, d, raw + 6 * N);
+
+ const VU16 raw7 = ShiftRight<8>(packed3); // upper bits already zero
+ StoreU(raw7, d, raw + 7 * N);
+
+ const VU16 raw8 = And(packed4, mask);
+ StoreU(raw8, d, raw + 8 * N);
+
+ const VU16 raw9 = And(packed5, mask);
+ StoreU(raw9, d, raw + 9 * N);
+
+ const VU16 rawA = ShiftRight<8>(packed4); // upper bits already zero
+ StoreU(rawA, d, raw + 0xA * N);
+
+ const VU16 rawB = ShiftRight<8>(packed5); // upper bits already zero
+ StoreU(rawB, d, raw + 0xB * N);
+
+ const VU16 rawC = And(packed6, mask);
+ StoreU(rawC, d, raw + 0xC * N);
+
+ const VU16 rawD = And(packed7, mask);
+ StoreU(rawD, d, raw + 0xD * N);
+
+ const VU16 rawE = ShiftRight<8>(packed6); // upper bits already zero
+ StoreU(rawE, d, raw + 0xE * N);
+
+ const VU16 rawF = ShiftRight<8>(packed7); // upper bits already zero
+ StoreU(rawF, d, raw + 0xF * N);
+ }
+}; // Pack16<8>
+
+template <>
+struct Pack16<9> {
+ template <class D>
+ HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+ uint16_t* HWY_RESTRICT packed_out) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 raw0 = LoadU(d, raw + 0 * N);
+ const VU16 raw1 = LoadU(d, raw + 1 * N);
+ const VU16 raw2 = LoadU(d, raw + 2 * N);
+ const VU16 raw3 = LoadU(d, raw + 3 * N);
+ const VU16 raw4 = LoadU(d, raw + 4 * N);
+ const VU16 raw5 = LoadU(d, raw + 5 * N);
+ const VU16 raw6 = LoadU(d, raw + 6 * N);
+ const VU16 raw7 = LoadU(d, raw + 7 * N);
+ const VU16 raw8 = LoadU(d, raw + 8 * N);
+ const VU16 raw9 = LoadU(d, raw + 9 * N);
+ const VU16 rawA = LoadU(d, raw + 0xA * N);
+ const VU16 rawB = LoadU(d, raw + 0xB * N);
+ const VU16 rawC = LoadU(d, raw + 0xC * N);
+ const VU16 rawD = LoadU(d, raw + 0xD * N);
+ const VU16 rawE = LoadU(d, raw + 0xE * N);
+ const VU16 rawF = LoadU(d, raw + 0xF * N);
+ // Eight packed vectors each hold one full 9-bit value plus the lower 7
+ // bits of another; the top 2 bits of raw8..F are concatenated into packed8.
+ const VU16 packed0 = Or(ShiftLeft<9>(raw8), raw0);
+ const VU16 packed1 = Or(ShiftLeft<9>(raw9), raw1);
+ const VU16 packed2 = Or(ShiftLeft<9>(rawA), raw2);
+ const VU16 packed3 = Or(ShiftLeft<9>(rawB), raw3);
+ const VU16 packed4 = Or(ShiftLeft<9>(rawC), raw4);
+ const VU16 packed5 = Or(ShiftLeft<9>(rawD), raw5);
+ const VU16 packed6 = Or(ShiftLeft<9>(rawE), raw6);
+ const VU16 packed7 = Or(ShiftLeft<9>(rawF), raw7);
+
+ // We could shift down, OR and shift up, but two shifts are typically more
+ // expensive than AND, shift into position, and OR (which can be further
+ // reduced via Xor3).
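+    // Each 2-bit remnant gets its own slot in packed8: part8 in bits [1:0],
+    // part9 in [3:2], ..., partF in [15:14].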
+ const VU16 mid2 = Set(d, 0x180u); // top 2 in lower 9
+ const VU16 part8 = ShiftRight<7>(And(raw8, mid2));
+ const VU16 part9 = ShiftRight<5>(And(raw9, mid2));
+ const VU16 partA = ShiftRight<3>(And(rawA, mid2));
+ const VU16 partB = ShiftRight<1>(And(rawB, mid2));
+ const VU16 partC = ShiftLeft<1>(And(rawC, mid2));
+ const VU16 partD = ShiftLeft<3>(And(rawD, mid2));
+ const VU16 partE = ShiftLeft<5>(And(rawE, mid2));
+ const VU16 partF = ShiftLeft<7>(And(rawF, mid2));
+ const VU16 packed8 = Xor3(Xor3(part8, part9, partA),
+ Xor3(partB, partC, partD), Or(partE, partF));
+
+ StoreU(packed0, d, packed_out + 0 * N);
+ StoreU(packed1, d, packed_out + 1 * N);
+ StoreU(packed2, d, packed_out + 2 * N);
+ StoreU(packed3, d, packed_out + 3 * N);
+ StoreU(packed4, d, packed_out + 4 * N);
+ StoreU(packed5, d, packed_out + 5 * N);
+ StoreU(packed6, d, packed_out + 6 * N);
+ StoreU(packed7, d, packed_out + 7 * N);
+ StoreU(packed8, d, packed_out + 8 * N);
+ }
+
+ template <class D>
+ HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+ uint16_t* HWY_RESTRICT raw) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+
+ const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
+ const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
+ const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
+ const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
+ const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
+ const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
+ const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
+ const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
+ const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
+
+ const VU16 mask = Set(d, 0x1FFu); // Lowest 9 bits
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(raw0, d, raw + 0 * N);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(raw1, d, raw + 1 * N);
+
+ const VU16 raw2 = And(packed2, mask);
+ StoreU(raw2, d, raw + 2 * N);
+
+ const VU16 raw3 = And(packed3, mask);
+ StoreU(raw3, d, raw + 3 * N);
+
+ const VU16 raw4 = And(packed4, mask);
+ StoreU(raw4, d, raw + 4 * N);
+
+ const VU16 raw5 = And(packed5, mask);
+ StoreU(raw5, d, raw + 5 * N);
+
+ const VU16 raw6 = And(packed6, mask);
+ StoreU(raw6, d, raw + 6 * N);
+
+ const VU16 raw7 = And(packed7, mask);
+ StoreU(raw7, d, raw + 7 * N);
+
+ const VU16 mid2 = Set(d, 0x180u); // top 2 in lower 9
+ const VU16 raw8 =
+ OrAnd(ShiftRight<9>(packed0), ShiftLeft<7>(packed8), mid2);
+ const VU16 raw9 =
+ OrAnd(ShiftRight<9>(packed1), ShiftLeft<5>(packed8), mid2);
+ const VU16 rawA =
+ OrAnd(ShiftRight<9>(packed2), ShiftLeft<3>(packed8), mid2);
+ const VU16 rawB =
+ OrAnd(ShiftRight<9>(packed3), ShiftLeft<1>(packed8), mid2);
+ const VU16 rawC =
+ OrAnd(ShiftRight<9>(packed4), ShiftRight<1>(packed8), mid2);
+ const VU16 rawD =
+ OrAnd(ShiftRight<9>(packed5), ShiftRight<3>(packed8), mid2);
+ const VU16 rawE =
+ OrAnd(ShiftRight<9>(packed6), ShiftRight<5>(packed8), mid2);
+ const VU16 rawF =
+ OrAnd(ShiftRight<9>(packed7), ShiftRight<7>(packed8), mid2);
+
+ StoreU(raw8, d, raw + 8 * N);
+ StoreU(raw9, d, raw + 9 * N);
+ StoreU(rawA, d, raw + 0xA * N);
+ StoreU(rawB, d, raw + 0xB * N);
+ StoreU(rawC, d, raw + 0xC * N);
+ StoreU(rawD, d, raw + 0xD * N);
+ StoreU(rawE, d, raw + 0xE * N);
+ StoreU(rawF, d, raw + 0xF * N);
+ }
+}; // Pack16<9>
+
+template <>
+struct Pack16<10> {
+ template <class D>
+ HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+ uint16_t* HWY_RESTRICT packed_out) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 raw0 = LoadU(d, raw + 0 * N);
+ const VU16 raw1 = LoadU(d, raw + 1 * N);
+ const VU16 raw2 = LoadU(d, raw + 2 * N);
+ const VU16 raw3 = LoadU(d, raw + 3 * N);
+ const VU16 raw4 = LoadU(d, raw + 4 * N);
+ const VU16 raw5 = LoadU(d, raw + 5 * N);
+ const VU16 raw6 = LoadU(d, raw + 6 * N);
+ const VU16 raw7 = LoadU(d, raw + 7 * N);
+ const VU16 raw8 = LoadU(d, raw + 8 * N);
+ const VU16 raw9 = LoadU(d, raw + 9 * N);
+ const VU16 rawA = LoadU(d, raw + 0xA * N);
+ const VU16 rawB = LoadU(d, raw + 0xB * N);
+ const VU16 rawC = LoadU(d, raw + 0xC * N);
+ const VU16 rawD = LoadU(d, raw + 0xD * N);
+ const VU16 rawE = LoadU(d, raw + 0xE * N);
+ const VU16 rawF = LoadU(d, raw + 0xF * N);
+
+ // 8 vectors, each with 10+6 bits; top 4 bits are concatenated into
+ // packed8 and packed9.
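+    // e.g. packed0 bits [9:0] = raw0, bits [15:10] = bits [5:0] of raw8;
+    // bits [9:6] of raw8..B land in packed8, those of rawC..F in packed9.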
+ const VU16 packed0 = Or(ShiftLeft<10>(raw8), raw0);
+ const VU16 packed1 = Or(ShiftLeft<10>(raw9), raw1);
+ const VU16 packed2 = Or(ShiftLeft<10>(rawA), raw2);
+ const VU16 packed3 = Or(ShiftLeft<10>(rawB), raw3);
+ const VU16 packed4 = Or(ShiftLeft<10>(rawC), raw4);
+ const VU16 packed5 = Or(ShiftLeft<10>(rawD), raw5);
+ const VU16 packed6 = Or(ShiftLeft<10>(rawE), raw6);
+ const VU16 packed7 = Or(ShiftLeft<10>(rawF), raw7);
+
+ // We could shift down, OR and shift up, but two shifts are typically more
+ // expensive than AND, shift into position, and OR (which can be further
+ // reduced via Xor3).
+ const VU16 mid4 = Set(d, 0x3C0u); // top 4 in lower 10
+ const VU16 part8 = ShiftRight<6>(And(raw8, mid4));
+ const VU16 part9 = ShiftRight<2>(And(raw9, mid4));
+ const VU16 partA = ShiftLeft<2>(And(rawA, mid4));
+ const VU16 partB = ShiftLeft<6>(And(rawB, mid4));
+ const VU16 partC = ShiftRight<6>(And(rawC, mid4));
+ const VU16 partD = ShiftRight<2>(And(rawD, mid4));
+ const VU16 partE = ShiftLeft<2>(And(rawE, mid4));
+ const VU16 partF = ShiftLeft<6>(And(rawF, mid4));
+ const VU16 packed8 = Or(Xor3(part8, part9, partA), partB);
+ const VU16 packed9 = Or(Xor3(partC, partD, partE), partF);
+
+ StoreU(packed0, d, packed_out + 0 * N);
+ StoreU(packed1, d, packed_out + 1 * N);
+ StoreU(packed2, d, packed_out + 2 * N);
+ StoreU(packed3, d, packed_out + 3 * N);
+ StoreU(packed4, d, packed_out + 4 * N);
+ StoreU(packed5, d, packed_out + 5 * N);
+ StoreU(packed6, d, packed_out + 6 * N);
+ StoreU(packed7, d, packed_out + 7 * N);
+ StoreU(packed8, d, packed_out + 8 * N);
+ StoreU(packed9, d, packed_out + 9 * N);
+ }
+
+ template <class D>
+ HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+ uint16_t* HWY_RESTRICT raw) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+
+ const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
+ const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
+ const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
+ const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
+ const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
+ const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
+ const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
+ const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
+ const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
+ const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
+
+ const VU16 mask = Set(d, 0x3FFu); // Lowest 10 bits
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(raw0, d, raw + 0 * N);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(raw1, d, raw + 1 * N);
+
+ const VU16 raw2 = And(packed2, mask);
+ StoreU(raw2, d, raw + 2 * N);
+
+ const VU16 raw3 = And(packed3, mask);
+ StoreU(raw3, d, raw + 3 * N);
+
+ const VU16 raw4 = And(packed4, mask);
+ StoreU(raw4, d, raw + 4 * N);
+
+ const VU16 raw5 = And(packed5, mask);
+ StoreU(raw5, d, raw + 5 * N);
+
+ const VU16 raw6 = And(packed6, mask);
+ StoreU(raw6, d, raw + 6 * N);
+
+ const VU16 raw7 = And(packed7, mask);
+ StoreU(raw7, d, raw + 7 * N);
+
+ const VU16 mid4 = Set(d, 0x3C0u); // top 4 in lower 10
+ const VU16 raw8 =
+ OrAnd(ShiftRight<10>(packed0), ShiftLeft<6>(packed8), mid4);
+ const VU16 raw9 =
+ OrAnd(ShiftRight<10>(packed1), ShiftLeft<2>(packed8), mid4);
+ const VU16 rawA =
+ OrAnd(ShiftRight<10>(packed2), ShiftRight<2>(packed8), mid4);
+ const VU16 rawB =
+ OrAnd(ShiftRight<10>(packed3), ShiftRight<6>(packed8), mid4);
+ const VU16 rawC =
+ OrAnd(ShiftRight<10>(packed4), ShiftLeft<6>(packed9), mid4);
+ const VU16 rawD =
+ OrAnd(ShiftRight<10>(packed5), ShiftLeft<2>(packed9), mid4);
+ const VU16 rawE =
+ OrAnd(ShiftRight<10>(packed6), ShiftRight<2>(packed9), mid4);
+ const VU16 rawF =
+ OrAnd(ShiftRight<10>(packed7), ShiftRight<6>(packed9), mid4);
+
+ StoreU(raw8, d, raw + 8 * N);
+ StoreU(raw9, d, raw + 9 * N);
+ StoreU(rawA, d, raw + 0xA * N);
+ StoreU(rawB, d, raw + 0xB * N);
+ StoreU(rawC, d, raw + 0xC * N);
+ StoreU(rawD, d, raw + 0xD * N);
+ StoreU(rawE, d, raw + 0xE * N);
+ StoreU(rawF, d, raw + 0xF * N);
+ }
+}; // Pack16<10>
+
+template <>
+struct Pack16<11> {
+ template <class D>
+ HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+ uint16_t* HWY_RESTRICT packed_out) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 raw0 = LoadU(d, raw + 0 * N);
+ const VU16 raw1 = LoadU(d, raw + 1 * N);
+ const VU16 raw2 = LoadU(d, raw + 2 * N);
+ const VU16 raw3 = LoadU(d, raw + 3 * N);
+ const VU16 raw4 = LoadU(d, raw + 4 * N);
+ const VU16 raw5 = LoadU(d, raw + 5 * N);
+ const VU16 raw6 = LoadU(d, raw + 6 * N);
+ const VU16 raw7 = LoadU(d, raw + 7 * N);
+ const VU16 raw8 = LoadU(d, raw + 8 * N);
+ const VU16 raw9 = LoadU(d, raw + 9 * N);
+ const VU16 rawA = LoadU(d, raw + 0xA * N);
+ const VU16 rawB = LoadU(d, raw + 0xB * N);
+ const VU16 rawC = LoadU(d, raw + 0xC * N);
+ const VU16 rawD = LoadU(d, raw + 0xD * N);
+ const VU16 rawE = LoadU(d, raw + 0xE * N);
+ const VU16 rawF = LoadU(d, raw + 0xF * N);
+
+ // It is not obvious what the optimal partitioning looks like. To reduce the
+ // number of constants, we want to minimize the number of distinct bit
+ // lengths. 11+5 also requires 6-bit remnants with 4-bit leftovers.
+ // 8+3 seems better: it is easier to scatter 3 bits into the MSBs.
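+    // Per lane: 16 values x 11 bits = 176 bits = 11 packed 16-bit lanes.
+    // packed0..7 hold the sixteen low bytes (two per vector); packed8..A
+    // hold the sixteen 3-bit remnants (five each, plus rawF's in the MSBs).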
+ const VU16 lo8 = Set(d, 0xFFu);
+
+ // Lower 8 bits of all raw
+ const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8);
+ const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8);
+ const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8);
+ const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8);
+ const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8);
+ const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8);
+ const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8);
+ const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8);
+
+ StoreU(packed0, d, packed_out + 0 * N);
+ StoreU(packed1, d, packed_out + 1 * N);
+ StoreU(packed2, d, packed_out + 2 * N);
+ StoreU(packed3, d, packed_out + 3 * N);
+ StoreU(packed4, d, packed_out + 4 * N);
+ StoreU(packed5, d, packed_out + 5 * N);
+ StoreU(packed6, d, packed_out + 6 * N);
+ StoreU(packed7, d, packed_out + 7 * N);
+
+    // Three vectors, each with five 3-bit remnants, plus one of rawF's top
+    // three bits in the MSB.
+ const VU16 top0 = ShiftRight<8>(raw0);
+ const VU16 top1 = ShiftRight<8>(raw1);
+ const VU16 top2 = ShiftRight<8>(raw2);
+ // Insert top raw bits into 3-bit groups within packed8..A. Moving the
+ // mask along avoids masking each of raw0..E and enables OrAnd.
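+    // The mask advances in 3-bit steps: 0x0038, 0x01C0, 0x0E00, 0x7000;
+    // the final ShiftLeft<3> below keeps only the MSB, i.e. 0x8000.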
+ VU16 next = Set(d, 0x38u); // 0x7 << 3
+ VU16 packed8 = OrAnd(top0, ShiftRight<5>(raw3), next);
+ VU16 packed9 = OrAnd(top1, ShiftRight<5>(raw4), next);
+ VU16 packedA = OrAnd(top2, ShiftRight<5>(raw5), next);
+ next = ShiftLeft<3>(next);
+ packed8 = OrAnd(packed8, ShiftRight<2>(raw6), next);
+ packed9 = OrAnd(packed9, ShiftRight<2>(raw7), next);
+ packedA = OrAnd(packedA, ShiftRight<2>(raw8), next);
+ next = ShiftLeft<3>(next);
+ packed8 = OrAnd(packed8, Add(raw9, raw9), next);
+ packed9 = OrAnd(packed9, Add(rawA, rawA), next);
+ packedA = OrAnd(packedA, Add(rawB, rawB), next);
+ next = ShiftLeft<3>(next);
+ packed8 = OrAnd(packed8, ShiftLeft<4>(rawC), next);
+ packed9 = OrAnd(packed9, ShiftLeft<4>(rawD), next);
+ packedA = OrAnd(packedA, ShiftLeft<4>(rawE), next);
+
+ // Scatter upper 3 bits of rawF into the upper bits.
+ next = ShiftLeft<3>(next); // = 0x8000u
+ packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next);
+ packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next);
+ packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next);
+
+ StoreU(packed8, d, packed_out + 8 * N);
+ StoreU(packed9, d, packed_out + 9 * N);
+ StoreU(packedA, d, packed_out + 0xA * N);
+ }
+
+ template <class D>
+ HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+ uint16_t* HWY_RESTRICT raw) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+
+ const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
+ const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
+ const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
+ const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
+ const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
+ const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
+ const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
+ const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
+ const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
+ const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
+ const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
+
+ const VU16 mask = Set(d, 0xFFu); // Lowest 8 bits
+
+ const VU16 down0 = And(packed0, mask);
+ const VU16 down1 = ShiftRight<8>(packed0);
+ const VU16 down2 = And(packed1, mask);
+ const VU16 down3 = ShiftRight<8>(packed1);
+ const VU16 down4 = And(packed2, mask);
+ const VU16 down5 = ShiftRight<8>(packed2);
+ const VU16 down6 = And(packed3, mask);
+ const VU16 down7 = ShiftRight<8>(packed3);
+ const VU16 down8 = And(packed4, mask);
+ const VU16 down9 = ShiftRight<8>(packed4);
+ const VU16 downA = And(packed5, mask);
+ const VU16 downB = ShiftRight<8>(packed5);
+ const VU16 downC = And(packed6, mask);
+ const VU16 downD = ShiftRight<8>(packed6);
+ const VU16 downE = And(packed7, mask);
+ const VU16 downF = ShiftRight<8>(packed7);
+
+ // Three bits from packed8..A, eight bits from down0..F.
+ const VU16 hi3 = Set(d, 0x700u);
+ const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi3);
+ const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi3);
+ const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi3);
+
+ const VU16 raw3 = OrAnd(down3, ShiftLeft<5>(packed8), hi3);
+ const VU16 raw4 = OrAnd(down4, ShiftLeft<5>(packed9), hi3);
+ const VU16 raw5 = OrAnd(down5, ShiftLeft<5>(packedA), hi3);
+
+ const VU16 raw6 = OrAnd(down6, ShiftLeft<2>(packed8), hi3);
+ const VU16 raw7 = OrAnd(down7, ShiftLeft<2>(packed9), hi3);
+ const VU16 raw8 = OrAnd(down8, ShiftLeft<2>(packedA), hi3);
+
+ const VU16 raw9 = OrAnd(down9, ShiftRight<1>(packed8), hi3);
+ const VU16 rawA = OrAnd(downA, ShiftRight<1>(packed9), hi3);
+ const VU16 rawB = OrAnd(downB, ShiftRight<1>(packedA), hi3);
+
+ const VU16 rawC = OrAnd(downC, ShiftRight<4>(packed8), hi3);
+ const VU16 rawD = OrAnd(downD, ShiftRight<4>(packed9), hi3);
+ const VU16 rawE = OrAnd(downE, ShiftRight<4>(packedA), hi3);
+
+ // Shift MSB into the top 3-of-11 and mask.
+ const VU16 rawF = Or(downF, Xor3(And(ShiftRight<7>(packed8), hi3),
+ And(ShiftRight<6>(packed9), hi3),
+ And(ShiftRight<5>(packedA), hi3)));
+
+ StoreU(raw0, d, raw + 0 * N);
+ StoreU(raw1, d, raw + 1 * N);
+ StoreU(raw2, d, raw + 2 * N);
+ StoreU(raw3, d, raw + 3 * N);
+ StoreU(raw4, d, raw + 4 * N);
+ StoreU(raw5, d, raw + 5 * N);
+ StoreU(raw6, d, raw + 6 * N);
+ StoreU(raw7, d, raw + 7 * N);
+ StoreU(raw8, d, raw + 8 * N);
+ StoreU(raw9, d, raw + 9 * N);
+ StoreU(rawA, d, raw + 0xA * N);
+ StoreU(rawB, d, raw + 0xB * N);
+ StoreU(rawC, d, raw + 0xC * N);
+ StoreU(rawD, d, raw + 0xD * N);
+ StoreU(rawE, d, raw + 0xE * N);
+ StoreU(rawF, d, raw + 0xF * N);
+ }
+}; // Pack16<11>
+
+template <>
+struct Pack16<12> {
+ template <class D>
+ HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+ uint16_t* HWY_RESTRICT packed_out) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 raw0 = LoadU(d, raw + 0 * N);
+ const VU16 raw1 = LoadU(d, raw + 1 * N);
+ const VU16 raw2 = LoadU(d, raw + 2 * N);
+ const VU16 raw3 = LoadU(d, raw + 3 * N);
+ const VU16 raw4 = LoadU(d, raw + 4 * N);
+ const VU16 raw5 = LoadU(d, raw + 5 * N);
+ const VU16 raw6 = LoadU(d, raw + 6 * N);
+ const VU16 raw7 = LoadU(d, raw + 7 * N);
+ const VU16 raw8 = LoadU(d, raw + 8 * N);
+ const VU16 raw9 = LoadU(d, raw + 9 * N);
+ const VU16 rawA = LoadU(d, raw + 0xA * N);
+ const VU16 rawB = LoadU(d, raw + 0xB * N);
+ const VU16 rawC = LoadU(d, raw + 0xC * N);
+ const VU16 rawD = LoadU(d, raw + 0xD * N);
+ const VU16 rawE = LoadU(d, raw + 0xE * N);
+ const VU16 rawF = LoadU(d, raw + 0xF * N);
+
+ // 8 vectors, each with 12+4 bits; top 8 bits are concatenated into
+ // packed8 to packedB.
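+    // e.g. raw8: bits [3:0] become the top nibble of packed0 and bits
+    // [11:4] the low byte of packed8.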
+ const VU16 packed0 = Or(ShiftLeft<12>(raw8), raw0);
+ const VU16 packed1 = Or(ShiftLeft<12>(raw9), raw1);
+ const VU16 packed2 = Or(ShiftLeft<12>(rawA), raw2);
+ const VU16 packed3 = Or(ShiftLeft<12>(rawB), raw3);
+ const VU16 packed4 = Or(ShiftLeft<12>(rawC), raw4);
+ const VU16 packed5 = Or(ShiftLeft<12>(rawD), raw5);
+ const VU16 packed6 = Or(ShiftLeft<12>(rawE), raw6);
+ const VU16 packed7 = Or(ShiftLeft<12>(rawF), raw7);
+
+ // Masking after shifting left enables OrAnd.
+ const VU16 hi8 = Set(d, 0xFF00u);
+ const VU16 packed8 = OrAnd(ShiftRight<4>(raw8), ShiftLeft<4>(raw9), hi8);
+ const VU16 packed9 = OrAnd(ShiftRight<4>(rawA), ShiftLeft<4>(rawB), hi8);
+ const VU16 packedA = OrAnd(ShiftRight<4>(rawC), ShiftLeft<4>(rawD), hi8);
+ const VU16 packedB = OrAnd(ShiftRight<4>(rawE), ShiftLeft<4>(rawF), hi8);
+ StoreU(packed0, d, packed_out + 0 * N);
+ StoreU(packed1, d, packed_out + 1 * N);
+ StoreU(packed2, d, packed_out + 2 * N);
+ StoreU(packed3, d, packed_out + 3 * N);
+ StoreU(packed4, d, packed_out + 4 * N);
+ StoreU(packed5, d, packed_out + 5 * N);
+ StoreU(packed6, d, packed_out + 6 * N);
+ StoreU(packed7, d, packed_out + 7 * N);
+ StoreU(packed8, d, packed_out + 8 * N);
+ StoreU(packed9, d, packed_out + 9 * N);
+ StoreU(packedA, d, packed_out + 0xA * N);
+ StoreU(packedB, d, packed_out + 0xB * N);
+ }
+
+ template <class D>
+ HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+ uint16_t* HWY_RESTRICT raw) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+
+ const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
+ const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
+ const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
+ const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
+ const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
+ const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
+ const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
+ const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
+ const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
+ const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
+ const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
+ const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
+
+ const VU16 mask = Set(d, 0xFFFu); // Lowest 12 bits
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(raw0, d, raw + 0 * N);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(raw1, d, raw + 1 * N);
+
+ const VU16 raw2 = And(packed2, mask);
+ StoreU(raw2, d, raw + 2 * N);
+
+ const VU16 raw3 = And(packed3, mask);
+ StoreU(raw3, d, raw + 3 * N);
+
+ const VU16 raw4 = And(packed4, mask);
+ StoreU(raw4, d, raw + 4 * N);
+
+ const VU16 raw5 = And(packed5, mask);
+ StoreU(raw5, d, raw + 5 * N);
+
+ const VU16 raw6 = And(packed6, mask);
+ StoreU(raw6, d, raw + 6 * N);
+
+ const VU16 raw7 = And(packed7, mask);
+ StoreU(raw7, d, raw + 7 * N);
+
+ const VU16 mid8 = Set(d, 0xFF0u); // upper 8 in lower 12
+ const VU16 raw8 =
+ OrAnd(ShiftRight<12>(packed0), ShiftLeft<4>(packed8), mid8);
+ const VU16 raw9 =
+ OrAnd(ShiftRight<12>(packed1), ShiftRight<4>(packed8), mid8);
+ const VU16 rawA =
+ OrAnd(ShiftRight<12>(packed2), ShiftLeft<4>(packed9), mid8);
+ const VU16 rawB =
+ OrAnd(ShiftRight<12>(packed3), ShiftRight<4>(packed9), mid8);
+ const VU16 rawC =
+ OrAnd(ShiftRight<12>(packed4), ShiftLeft<4>(packedA), mid8);
+ const VU16 rawD =
+ OrAnd(ShiftRight<12>(packed5), ShiftRight<4>(packedA), mid8);
+ const VU16 rawE =
+ OrAnd(ShiftRight<12>(packed6), ShiftLeft<4>(packedB), mid8);
+ const VU16 rawF =
+ OrAnd(ShiftRight<12>(packed7), ShiftRight<4>(packedB), mid8);
+ StoreU(raw8, d, raw + 8 * N);
+ StoreU(raw9, d, raw + 9 * N);
+ StoreU(rawA, d, raw + 0xA * N);
+ StoreU(rawB, d, raw + 0xB * N);
+ StoreU(rawC, d, raw + 0xC * N);
+ StoreU(rawD, d, raw + 0xD * N);
+ StoreU(rawE, d, raw + 0xE * N);
+ StoreU(rawF, d, raw + 0xF * N);
+ }
+}; // Pack16<12>
+
+template <>
+struct Pack16<13> {
+ template <class D>
+ HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+ uint16_t* HWY_RESTRICT packed_out) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 raw0 = LoadU(d, raw + 0 * N);
+ const VU16 raw1 = LoadU(d, raw + 1 * N);
+ const VU16 raw2 = LoadU(d, raw + 2 * N);
+ const VU16 raw3 = LoadU(d, raw + 3 * N);
+ const VU16 raw4 = LoadU(d, raw + 4 * N);
+ const VU16 raw5 = LoadU(d, raw + 5 * N);
+ const VU16 raw6 = LoadU(d, raw + 6 * N);
+ const VU16 raw7 = LoadU(d, raw + 7 * N);
+ const VU16 raw8 = LoadU(d, raw + 8 * N);
+ const VU16 raw9 = LoadU(d, raw + 9 * N);
+ const VU16 rawA = LoadU(d, raw + 0xA * N);
+ const VU16 rawB = LoadU(d, raw + 0xB * N);
+ const VU16 rawC = LoadU(d, raw + 0xC * N);
+ const VU16 rawD = LoadU(d, raw + 0xD * N);
+ const VU16 rawE = LoadU(d, raw + 0xE * N);
+ const VU16 rawF = LoadU(d, raw + 0xF * N);
+
+ // As with 11 bits, it is not obvious what the optimal partitioning looks
+ // like. We similarly go with an 8+5 split.
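+    // Per lane: 16 values x 13 bits = 208 bits = 13 packed 16-bit lanes:
+    // packed0..7 hold the sixteen low bytes, packed8..C the 5-bit remnants.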
+ const VU16 lo8 = Set(d, 0xFFu);
+
+ // Lower 8 bits of all raw
+ const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8);
+ const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8);
+ const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8);
+ const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8);
+ const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8);
+ const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8);
+ const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8);
+ const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8);
+
+ StoreU(packed0, d, packed_out + 0 * N);
+ StoreU(packed1, d, packed_out + 1 * N);
+ StoreU(packed2, d, packed_out + 2 * N);
+ StoreU(packed3, d, packed_out + 3 * N);
+ StoreU(packed4, d, packed_out + 4 * N);
+ StoreU(packed5, d, packed_out + 5 * N);
+ StoreU(packed6, d, packed_out + 6 * N);
+ StoreU(packed7, d, packed_out + 7 * N);
+
+    // Five vectors, each with three 5-bit remnants, plus one of rawF's top
+    // five bits in the MSB.
+ const VU16 top0 = ShiftRight<8>(raw0);
+ const VU16 top1 = ShiftRight<8>(raw1);
+ const VU16 top2 = ShiftRight<8>(raw2);
+ const VU16 top3 = ShiftRight<8>(raw3);
+ const VU16 top4 = ShiftRight<8>(raw4);
+
+ // Insert top raw bits into 5-bit groups within packed8..C. Moving the
+ // mask along avoids masking each of raw0..E and enables OrAnd.
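+    // The mask advances in 5-bit steps: 0x03E0, then 0x7C00; the final
+    // ShiftLeft<5> below keeps only the MSB, i.e. 0x8000.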
+ VU16 next = Set(d, 0x3E0u); // 0x1F << 5
+ VU16 packed8 = OrAnd(top0, ShiftRight<3>(raw5), next);
+ VU16 packed9 = OrAnd(top1, ShiftRight<3>(raw6), next);
+ VU16 packedA = OrAnd(top2, ShiftRight<3>(raw7), next);
+ VU16 packedB = OrAnd(top3, ShiftRight<3>(raw8), next);
+ VU16 packedC = OrAnd(top4, ShiftRight<3>(raw9), next);
+ next = ShiftLeft<5>(next);
+ packed8 = OrAnd(packed8, ShiftLeft<2>(rawA), next);
+ packed9 = OrAnd(packed9, ShiftLeft<2>(rawB), next);
+ packedA = OrAnd(packedA, ShiftLeft<2>(rawC), next);
+ packedB = OrAnd(packedB, ShiftLeft<2>(rawD), next);
+ packedC = OrAnd(packedC, ShiftLeft<2>(rawE), next);
+
+ // Scatter upper 5 bits of rawF into the upper bits.
+    next = ShiftLeft<5>(next);  // = 0x8000u (bits above 15 shift out)
+ packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next);
+ packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next);
+ packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next);
+ packedB = OrAnd(packedB, ShiftLeft<4>(rawF), next);
+ packedC = OrAnd(packedC, ShiftLeft<3>(rawF), next);
+
+ StoreU(packed8, d, packed_out + 8 * N);
+ StoreU(packed9, d, packed_out + 9 * N);
+ StoreU(packedA, d, packed_out + 0xA * N);
+ StoreU(packedB, d, packed_out + 0xB * N);
+ StoreU(packedC, d, packed_out + 0xC * N);
+ }
+
+ template <class D>
+ HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+ uint16_t* HWY_RESTRICT raw) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+
+ const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
+ const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
+ const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
+ const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
+ const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
+ const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
+ const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
+ const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
+ const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
+ const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
+ const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
+ const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
+ const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N));
+
+ const VU16 mask = Set(d, 0xFFu); // Lowest 8 bits
+
+ const VU16 down0 = And(packed0, mask);
+ const VU16 down1 = ShiftRight<8>(packed0);
+ const VU16 down2 = And(packed1, mask);
+ const VU16 down3 = ShiftRight<8>(packed1);
+ const VU16 down4 = And(packed2, mask);
+ const VU16 down5 = ShiftRight<8>(packed2);
+ const VU16 down6 = And(packed3, mask);
+ const VU16 down7 = ShiftRight<8>(packed3);
+ const VU16 down8 = And(packed4, mask);
+ const VU16 down9 = ShiftRight<8>(packed4);
+ const VU16 downA = And(packed5, mask);
+ const VU16 downB = ShiftRight<8>(packed5);
+ const VU16 downC = And(packed6, mask);
+ const VU16 downD = ShiftRight<8>(packed6);
+ const VU16 downE = And(packed7, mask);
+ const VU16 downF = ShiftRight<8>(packed7);
+
+ // Upper five bits from packed8..C, eight bits from down0..F.
+ const VU16 hi5 = Set(d, 0x1F00u);
+ const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi5);
+ const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi5);
+ const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi5);
+ const VU16 raw3 = OrAnd(down3, ShiftLeft<8>(packedB), hi5);
+ const VU16 raw4 = OrAnd(down4, ShiftLeft<8>(packedC), hi5);
+
+ const VU16 raw5 = OrAnd(down5, ShiftLeft<3>(packed8), hi5);
+ const VU16 raw6 = OrAnd(down6, ShiftLeft<3>(packed9), hi5);
+ const VU16 raw7 = OrAnd(down7, ShiftLeft<3>(packedA), hi5);
+    const VU16 raw8 = OrAnd(down8, ShiftLeft<3>(packedB), hi5);
+    const VU16 raw9 = OrAnd(down9, ShiftLeft<3>(packedC), hi5);
+
+ const VU16 rawA = OrAnd(downA, ShiftRight<2>(packed8), hi5);
+ const VU16 rawB = OrAnd(downB, ShiftRight<2>(packed9), hi5);
+ const VU16 rawC = OrAnd(downC, ShiftRight<2>(packedA), hi5);
+    const VU16 rawD = OrAnd(downD, ShiftRight<2>(packedB), hi5);
+    const VU16 rawE = OrAnd(downE, ShiftRight<2>(packedC), hi5);
+
+    // Shift MSB into the top 5-of-13 and mask.
+ const VU16 p0 = Xor3(And(ShiftRight<7>(packed8), hi5), //
+ And(ShiftRight<6>(packed9), hi5),
+ And(ShiftRight<5>(packedA), hi5));
+ const VU16 p1 = Xor3(And(ShiftRight<4>(packedB), hi5),
+ And(ShiftRight<3>(packedC), hi5), downF);
+ const VU16 rawF = Or(p0, p1);
+
+ StoreU(raw0, d, raw + 0 * N);
+ StoreU(raw1, d, raw + 1 * N);
+ StoreU(raw2, d, raw + 2 * N);
+ StoreU(raw3, d, raw + 3 * N);
+ StoreU(raw4, d, raw + 4 * N);
+ StoreU(raw5, d, raw + 5 * N);
+ StoreU(raw6, d, raw + 6 * N);
+ StoreU(raw7, d, raw + 7 * N);
+ StoreU(raw8, d, raw + 8 * N);
+ StoreU(raw9, d, raw + 9 * N);
+ StoreU(rawA, d, raw + 0xA * N);
+ StoreU(rawB, d, raw + 0xB * N);
+ StoreU(rawC, d, raw + 0xC * N);
+ StoreU(rawD, d, raw + 0xD * N);
+ StoreU(rawE, d, raw + 0xE * N);
+ StoreU(rawF, d, raw + 0xF * N);
+ }
+}; // Pack16<13>
+
+template <>
+struct Pack16<14> {
+ template <class D>
+ HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+ uint16_t* HWY_RESTRICT packed_out) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 raw0 = LoadU(d, raw + 0 * N);
+ const VU16 raw1 = LoadU(d, raw + 1 * N);
+ const VU16 raw2 = LoadU(d, raw + 2 * N);
+ const VU16 raw3 = LoadU(d, raw + 3 * N);
+ const VU16 raw4 = LoadU(d, raw + 4 * N);
+ const VU16 raw5 = LoadU(d, raw + 5 * N);
+ const VU16 raw6 = LoadU(d, raw + 6 * N);
+ const VU16 raw7 = LoadU(d, raw + 7 * N);
+ const VU16 raw8 = LoadU(d, raw + 8 * N);
+ const VU16 raw9 = LoadU(d, raw + 9 * N);
+ const VU16 rawA = LoadU(d, raw + 0xA * N);
+ const VU16 rawB = LoadU(d, raw + 0xB * N);
+ const VU16 rawC = LoadU(d, raw + 0xC * N);
+ const VU16 rawD = LoadU(d, raw + 0xD * N);
+ const VU16 rawE = LoadU(d, raw + 0xE * N);
+ const VU16 rawF = LoadU(d, raw + 0xF * N);
+
+ // 14 vectors, each with 14+2 bits; two raw vectors are scattered
+ // across the upper 2 bits.
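+    // e.g. bits [1:0] of rawE land in packed0 bits [15:14], bits [3:2] in
+    // packed1, ..., bits [13:12] in packed6; rawF likewise in packed7..D.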
+ const VU16 hi2 = Set(d, 0xC000u);
+ const VU16 packed0 = Or(raw0, ShiftLeft<14>(rawE));
+ const VU16 packed1 = OrAnd(raw1, ShiftLeft<12>(rawE), hi2);
+ const VU16 packed2 = OrAnd(raw2, ShiftLeft<10>(rawE), hi2);
+ const VU16 packed3 = OrAnd(raw3, ShiftLeft<8>(rawE), hi2);
+ const VU16 packed4 = OrAnd(raw4, ShiftLeft<6>(rawE), hi2);
+ const VU16 packed5 = OrAnd(raw5, ShiftLeft<4>(rawE), hi2);
+ const VU16 packed6 = OrAnd(raw6, ShiftLeft<2>(rawE), hi2);
+ const VU16 packed7 = Or(raw7, ShiftLeft<14>(rawF));
+ const VU16 packed8 = OrAnd(raw8, ShiftLeft<12>(rawF), hi2);
+ const VU16 packed9 = OrAnd(raw9, ShiftLeft<10>(rawF), hi2);
+ const VU16 packedA = OrAnd(rawA, ShiftLeft<8>(rawF), hi2);
+ const VU16 packedB = OrAnd(rawB, ShiftLeft<6>(rawF), hi2);
+ const VU16 packedC = OrAnd(rawC, ShiftLeft<4>(rawF), hi2);
+ const VU16 packedD = OrAnd(rawD, ShiftLeft<2>(rawF), hi2);
+
+ StoreU(packed0, d, packed_out + 0 * N);
+ StoreU(packed1, d, packed_out + 1 * N);
+ StoreU(packed2, d, packed_out + 2 * N);
+ StoreU(packed3, d, packed_out + 3 * N);
+ StoreU(packed4, d, packed_out + 4 * N);
+ StoreU(packed5, d, packed_out + 5 * N);
+ StoreU(packed6, d, packed_out + 6 * N);
+ StoreU(packed7, d, packed_out + 7 * N);
+ StoreU(packed8, d, packed_out + 8 * N);
+ StoreU(packed9, d, packed_out + 9 * N);
+ StoreU(packedA, d, packed_out + 0xA * N);
+ StoreU(packedB, d, packed_out + 0xB * N);
+ StoreU(packedC, d, packed_out + 0xC * N);
+ StoreU(packedD, d, packed_out + 0xD * N);
+ }
+
+ template <class D>
+ HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+ uint16_t* HWY_RESTRICT raw) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+
+ const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
+ const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
+ const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
+ const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
+ const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
+ const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
+ const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
+ const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
+ const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
+ const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
+ const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
+ const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
+ const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N));
+ const VU16 packedD = BitCast(d, LoadU(d, packed_in + 0xD * N));
+
+ const VU16 mask = Set(d, 0x3FFFu); // Lowest 14 bits
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(raw0, d, raw + 0 * N);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(raw1, d, raw + 1 * N);
+
+ const VU16 raw2 = And(packed2, mask);
+ StoreU(raw2, d, raw + 2 * N);
+
+ const VU16 raw3 = And(packed3, mask);
+ StoreU(raw3, d, raw + 3 * N);
+
+ const VU16 raw4 = And(packed4, mask);
+ StoreU(raw4, d, raw + 4 * N);
+
+ const VU16 raw5 = And(packed5, mask);
+ StoreU(raw5, d, raw + 5 * N);
+
+ const VU16 raw6 = And(packed6, mask);
+ StoreU(raw6, d, raw + 6 * N);
+
+ const VU16 raw7 = And(packed7, mask);
+ StoreU(raw7, d, raw + 7 * N);
+
+ const VU16 raw8 = And(packed8, mask);
+ StoreU(raw8, d, raw + 8 * N);
+
+ const VU16 raw9 = And(packed9, mask);
+ StoreU(raw9, d, raw + 9 * N);
+
+ const VU16 rawA = And(packedA, mask);
+ StoreU(rawA, d, raw + 0xA * N);
+
+ const VU16 rawB = And(packedB, mask);
+ StoreU(rawB, d, raw + 0xB * N);
+
+ const VU16 rawC = And(packedC, mask);
+ StoreU(rawC, d, raw + 0xC * N);
+
+ const VU16 rawD = And(packedD, mask);
+ StoreU(rawD, d, raw + 0xD * N);
+
+ // rawE is the concatenation of the top two bits in packed0..6.
+ const VU16 E0 = Xor3(ShiftRight<14>(packed0), //
+ ShiftRight<12>(AndNot(mask, packed1)),
+ ShiftRight<10>(AndNot(mask, packed2)));
+ const VU16 E1 = Xor3(ShiftRight<8>(AndNot(mask, packed3)),
+ ShiftRight<6>(AndNot(mask, packed4)),
+ ShiftRight<4>(AndNot(mask, packed5)));
+ const VU16 rawE = Xor3(ShiftRight<2>(AndNot(mask, packed6)), E0, E1);
+ const VU16 F0 = Xor3(ShiftRight<14>(AndNot(mask, packed7)),
+ ShiftRight<12>(AndNot(mask, packed8)),
+ ShiftRight<10>(AndNot(mask, packed9)));
+ const VU16 F1 = Xor3(ShiftRight<8>(AndNot(mask, packedA)),
+ ShiftRight<6>(AndNot(mask, packedB)),
+ ShiftRight<4>(AndNot(mask, packedC)));
+ const VU16 rawF = Xor3(ShiftRight<2>(AndNot(mask, packedD)), F0, F1);
+ StoreU(rawE, d, raw + 0xE * N);
+ StoreU(rawF, d, raw + 0xF * N);
+ }
+}; // Pack16<14>
+
+template <>
+struct Pack16<15> {
+ template <class D>
+ HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+ uint16_t* HWY_RESTRICT packed_out) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 raw0 = LoadU(d, raw + 0 * N);
+ const VU16 raw1 = LoadU(d, raw + 1 * N);
+ const VU16 raw2 = LoadU(d, raw + 2 * N);
+ const VU16 raw3 = LoadU(d, raw + 3 * N);
+ const VU16 raw4 = LoadU(d, raw + 4 * N);
+ const VU16 raw5 = LoadU(d, raw + 5 * N);
+ const VU16 raw6 = LoadU(d, raw + 6 * N);
+ const VU16 raw7 = LoadU(d, raw + 7 * N);
+ const VU16 raw8 = LoadU(d, raw + 8 * N);
+ const VU16 raw9 = LoadU(d, raw + 9 * N);
+ const VU16 rawA = LoadU(d, raw + 0xA * N);
+ const VU16 rawB = LoadU(d, raw + 0xB * N);
+ const VU16 rawC = LoadU(d, raw + 0xC * N);
+ const VU16 rawD = LoadU(d, raw + 0xD * N);
+ const VU16 rawE = LoadU(d, raw + 0xE * N);
+ const VU16 rawF = LoadU(d, raw + 0xF * N);
+
+    // 15 vectors, each with 15+1 bits; rawF is scattered across the MSB of
+    // each packed vector.
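+    // e.g. bit 0 of rawF lands in the MSB of packed0, bit 1 in packed1,
+    // ..., bit 14 in packedE.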
+ const VU16 hi1 = Set(d, 0x8000u);
+ const VU16 packed0 = Or(raw0, ShiftLeft<15>(rawF));
+ const VU16 packed1 = OrAnd(raw1, ShiftLeft<14>(rawF), hi1);
+ const VU16 packed2 = OrAnd(raw2, ShiftLeft<13>(rawF), hi1);
+ const VU16 packed3 = OrAnd(raw3, ShiftLeft<12>(rawF), hi1);
+ const VU16 packed4 = OrAnd(raw4, ShiftLeft<11>(rawF), hi1);
+ const VU16 packed5 = OrAnd(raw5, ShiftLeft<10>(rawF), hi1);
+ const VU16 packed6 = OrAnd(raw6, ShiftLeft<9>(rawF), hi1);
+ const VU16 packed7 = OrAnd(raw7, ShiftLeft<8>(rawF), hi1);
+ const VU16 packed8 = OrAnd(raw8, ShiftLeft<7>(rawF), hi1);
+ const VU16 packed9 = OrAnd(raw9, ShiftLeft<6>(rawF), hi1);
+ const VU16 packedA = OrAnd(rawA, ShiftLeft<5>(rawF), hi1);
+ const VU16 packedB = OrAnd(rawB, ShiftLeft<4>(rawF), hi1);
+ const VU16 packedC = OrAnd(rawC, ShiftLeft<3>(rawF), hi1);
+ const VU16 packedD = OrAnd(rawD, ShiftLeft<2>(rawF), hi1);
+ const VU16 packedE = OrAnd(rawE, ShiftLeft<1>(rawF), hi1);
+
+ StoreU(packed0, d, packed_out + 0 * N);
+ StoreU(packed1, d, packed_out + 1 * N);
+ StoreU(packed2, d, packed_out + 2 * N);
+ StoreU(packed3, d, packed_out + 3 * N);
+ StoreU(packed4, d, packed_out + 4 * N);
+ StoreU(packed5, d, packed_out + 5 * N);
+ StoreU(packed6, d, packed_out + 6 * N);
+ StoreU(packed7, d, packed_out + 7 * N);
+ StoreU(packed8, d, packed_out + 8 * N);
+ StoreU(packed9, d, packed_out + 9 * N);
+ StoreU(packedA, d, packed_out + 0xA * N);
+ StoreU(packedB, d, packed_out + 0xB * N);
+ StoreU(packedC, d, packed_out + 0xC * N);
+ StoreU(packedD, d, packed_out + 0xD * N);
+ StoreU(packedE, d, packed_out + 0xE * N);
+ }
+
+ template <class D>
+ HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+ uint16_t* HWY_RESTRICT raw) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+
+ const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
+ const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
+ const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
+ const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
+ const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
+ const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
+ const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
+ const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
+ const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
+ const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
+ const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
+ const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
+ const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N));
+ const VU16 packedD = BitCast(d, LoadU(d, packed_in + 0xD * N));
+ const VU16 packedE = BitCast(d, LoadU(d, packed_in + 0xE * N));
+
+ const VU16 mask = Set(d, 0x7FFFu); // Lowest 15 bits
+
+ const VU16 raw0 = And(packed0, mask);
+ StoreU(raw0, d, raw + 0 * N);
+
+ const VU16 raw1 = And(packed1, mask);
+ StoreU(raw1, d, raw + 1 * N);
+
+ const VU16 raw2 = And(packed2, mask);
+ StoreU(raw2, d, raw + 2 * N);
+
+ const VU16 raw3 = And(packed3, mask);
+ StoreU(raw3, d, raw + 3 * N);
+
+ const VU16 raw4 = And(packed4, mask);
+ StoreU(raw4, d, raw + 4 * N);
+
+ const VU16 raw5 = And(packed5, mask);
+ StoreU(raw5, d, raw + 5 * N);
+
+ const VU16 raw6 = And(packed6, mask);
+ StoreU(raw6, d, raw + 6 * N);
+
+ const VU16 raw7 = And(packed7, mask);
+ StoreU(raw7, d, raw + 7 * N);
+
+ const VU16 raw8 = And(packed8, mask);
+ StoreU(raw8, d, raw + 8 * N);
+
+ const VU16 raw9 = And(packed9, mask);
+ StoreU(raw9, d, raw + 9 * N);
+
+ const VU16 rawA = And(packedA, mask);
+ StoreU(rawA, d, raw + 0xA * N);
+
+ const VU16 rawB = And(packedB, mask);
+ StoreU(rawB, d, raw + 0xB * N);
+
+ const VU16 rawC = And(packedC, mask);
+ StoreU(rawC, d, raw + 0xC * N);
+
+ const VU16 rawD = And(packedD, mask);
+ StoreU(rawD, d, raw + 0xD * N);
+
+ const VU16 rawE = And(packedE, mask);
+ StoreU(rawE, d, raw + 0xE * N);
+
+ // rawF is the concatenation of the top bit in packed0..E.
+ const VU16 F0 = Xor3(ShiftRight<15>(packed0), //
+ ShiftRight<14>(AndNot(mask, packed1)),
+ ShiftRight<13>(AndNot(mask, packed2)));
+ const VU16 F1 = Xor3(ShiftRight<12>(AndNot(mask, packed3)),
+ ShiftRight<11>(AndNot(mask, packed4)),
+ ShiftRight<10>(AndNot(mask, packed5)));
+ const VU16 F2 = Xor3(ShiftRight<9>(AndNot(mask, packed6)),
+ ShiftRight<8>(AndNot(mask, packed7)),
+ ShiftRight<7>(AndNot(mask, packed8)));
+ const VU16 F3 = Xor3(ShiftRight<6>(AndNot(mask, packed9)),
+ ShiftRight<5>(AndNot(mask, packedA)),
+ ShiftRight<4>(AndNot(mask, packedB)));
+ const VU16 F4 = Xor3(ShiftRight<3>(AndNot(mask, packedC)),
+ ShiftRight<2>(AndNot(mask, packedD)),
+ ShiftRight<1>(AndNot(mask, packedE)));
+ const VU16 rawF = Xor3(F0, F1, Xor3(F2, F3, F4));
+ StoreU(rawF, d, raw + 0xF * N);
+ }
+}; // Pack16<15>
+
+template <>
+struct Pack16<16> {
+ template <class D>
+ HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+ uint16_t* HWY_RESTRICT packed_out) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+ const VU16 raw0 = LoadU(d, raw + 0 * N);
+ const VU16 raw1 = LoadU(d, raw + 1 * N);
+ const VU16 raw2 = LoadU(d, raw + 2 * N);
+ const VU16 raw3 = LoadU(d, raw + 3 * N);
+ const VU16 raw4 = LoadU(d, raw + 4 * N);
+ const VU16 raw5 = LoadU(d, raw + 5 * N);
+ const VU16 raw6 = LoadU(d, raw + 6 * N);
+ const VU16 raw7 = LoadU(d, raw + 7 * N);
+ const VU16 raw8 = LoadU(d, raw + 8 * N);
+ const VU16 raw9 = LoadU(d, raw + 9 * N);
+ const VU16 rawA = LoadU(d, raw + 0xA * N);
+ const VU16 rawB = LoadU(d, raw + 0xB * N);
+ const VU16 rawC = LoadU(d, raw + 0xC * N);
+ const VU16 rawD = LoadU(d, raw + 0xD * N);
+ const VU16 rawE = LoadU(d, raw + 0xE * N);
+ const VU16 rawF = LoadU(d, raw + 0xF * N);
+
+ StoreU(raw0, d, packed_out + 0 * N);
+ StoreU(raw1, d, packed_out + 1 * N);
+ StoreU(raw2, d, packed_out + 2 * N);
+ StoreU(raw3, d, packed_out + 3 * N);
+ StoreU(raw4, d, packed_out + 4 * N);
+ StoreU(raw5, d, packed_out + 5 * N);
+ StoreU(raw6, d, packed_out + 6 * N);
+ StoreU(raw7, d, packed_out + 7 * N);
+ StoreU(raw8, d, packed_out + 8 * N);
+ StoreU(raw9, d, packed_out + 9 * N);
+ StoreU(rawA, d, packed_out + 0xA * N);
+ StoreU(rawB, d, packed_out + 0xB * N);
+ StoreU(rawC, d, packed_out + 0xC * N);
+ StoreU(rawD, d, packed_out + 0xD * N);
+ StoreU(rawE, d, packed_out + 0xE * N);
+ StoreU(rawF, d, packed_out + 0xF * N);
+ }
+
+ template <class D>
+ HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+ uint16_t* HWY_RESTRICT raw) const {
+ using VU16 = Vec<decltype(d)>;
+ const size_t N = Lanes(d);
+
+ const VU16 raw0 = BitCast(d, LoadU(d, packed_in + 0 * N));
+ const VU16 raw1 = BitCast(d, LoadU(d, packed_in + 1 * N));
+ const VU16 raw2 = BitCast(d, LoadU(d, packed_in + 2 * N));
+ const VU16 raw3 = BitCast(d, LoadU(d, packed_in + 3 * N));
+ const VU16 raw4 = BitCast(d, LoadU(d, packed_in + 4 * N));
+ const VU16 raw5 = BitCast(d, LoadU(d, packed_in + 5 * N));
+ const VU16 raw6 = BitCast(d, LoadU(d, packed_in + 6 * N));
+ const VU16 raw7 = BitCast(d, LoadU(d, packed_in + 7 * N));
+ const VU16 raw8 = BitCast(d, LoadU(d, packed_in + 8 * N));
+ const VU16 raw9 = BitCast(d, LoadU(d, packed_in + 9 * N));
+ const VU16 rawA = BitCast(d, LoadU(d, packed_in + 0xA * N));
+ const VU16 rawB = BitCast(d, LoadU(d, packed_in + 0xB * N));
+ const VU16 rawC = BitCast(d, LoadU(d, packed_in + 0xC * N));
+ const VU16 rawD = BitCast(d, LoadU(d, packed_in + 0xD * N));
+ const VU16 rawE = BitCast(d, LoadU(d, packed_in + 0xE * N));
+ const VU16 rawF = BitCast(d, LoadU(d, packed_in + 0xF * N));
+
+ StoreU(raw0, d, raw + 0 * N);
+ StoreU(raw1, d, raw + 1 * N);
+ StoreU(raw2, d, raw + 2 * N);
+ StoreU(raw3, d, raw + 3 * N);
+ StoreU(raw4, d, raw + 4 * N);
+ StoreU(raw5, d, raw + 5 * N);
+ StoreU(raw6, d, raw + 6 * N);
+ StoreU(raw7, d, raw + 7 * N);
+ StoreU(raw8, d, raw + 8 * N);
+ StoreU(raw9, d, raw + 9 * N);
+ StoreU(rawA, d, raw + 0xA * N);
+ StoreU(rawB, d, raw + 0xB * N);
+ StoreU(rawC, d, raw + 0xC * N);
+ StoreU(rawD, d, raw + 0xD * N);
+ StoreU(rawE, d, raw + 0xE * N);
+ StoreU(rawF, d, raw + 0xF * N);
+ }
+}; // Pack16<16>
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif // HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
diff --git a/third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc b/third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc
new file mode 100644
index 0000000000..a239da9cf6
--- /dev/null
+++ b/third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc
@@ -0,0 +1,205 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+
+#include <vector>
+
+#include "hwy/aligned_allocator.h"
+#include "hwy/base.h"
+#include "hwy/nanobenchmark.h"
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/bit_pack/bit_pack_test.cc" // NOLINT
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+#include "hwy/contrib/bit_pack/bit_pack-inl.h"
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+#ifndef HWY_BIT_PACK_BENCHMARK
+#define HWY_BIT_PACK_BENCHMARK 0
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+// Used to prevent running benchmark (slow) for partial vectors and targets
+// except the best available. Global, not per-target, hence must be outside
+// HWY_NAMESPACE. Declare first because HWY_ONCE is only true after some code
+// has been re-included.
+extern size_t last_bits;
+extern uint64_t best_target;
+#if HWY_ONCE
+size_t last_bits = 0;
+uint64_t best_target = ~0ull;
+#endif
+namespace HWY_NAMESPACE {
+
+template <size_t kBits, typename T>
+T Random(RandomState& rng) {
+  // Mask to the lowest kBits bits so the value survives packing.
+  return static_cast<T>(Random32(&rng) & ((1ull << kBits) - 1));
+}
+
+template <typename T>
+class Checker {
+ public:
+ explicit Checker(size_t num) { raw_.reserve(num); }
+ void NotifyRaw(T raw) { raw_.push_back(raw); }
+
+ void NotifyRawOutput(size_t bits, T raw) {
+ if (raw_[num_verified_] != raw) {
+ HWY_ABORT("%zu bits: pos %zu of %zu, expected %.0f actual %.0f\n", bits,
+ num_verified_, raw_.size(),
+ static_cast<double>(raw_[num_verified_]),
+ static_cast<double>(raw));
+ }
+ ++num_verified_;
+ }
+
+ private:
+ std::vector<T> raw_;
+ size_t num_verified_ = 0;
+};
+
+template <template <size_t> class PackT, size_t kVectors, size_t kBits>
+struct TestPack {
+ template <typename T, class D>
+ void operator()(T /* t */, D d) {
+ constexpr size_t kLoops = 16; // working set slightly larger than L1
+ const size_t N = Lanes(d);
+ RandomState rng(N * 129);
+ static_assert(kBits <= kVectors, "");
+ const size_t num_per_loop = N * kVectors;
+ const size_t num = num_per_loop * kLoops;
+ const size_t num_packed_per_loop = N * kBits;
+ const size_t num_packed = num_packed_per_loop * kLoops;
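+    // e.g. Pack16<9> with N = 8: each loop packs 128 raw lanes (16 vectors)
+    // into 72 packed lanes (9 vectors).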
+ Checker<T> checker(num);
+ AlignedFreeUniquePtr<T[]> raw = hwy::AllocateAligned<T>(num);
+ AlignedFreeUniquePtr<T[]> raw2 = hwy::AllocateAligned<T>(num);
+ AlignedFreeUniquePtr<T[]> packed = hwy::AllocateAligned<T>(num_packed);
+
+ for (size_t i = 0; i < num; ++i) {
+ raw[i] = Random<kBits, T>(rng);
+ checker.NotifyRaw(raw[i]);
+ }
+
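+    // Smaller HWY_TARGET values denote better (newer/wider) targets.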
+ best_target = HWY_MIN(best_target, HWY_TARGET);
+ const bool run_bench = HWY_BIT_PACK_BENCHMARK && (kBits != last_bits) &&
+ (HWY_TARGET == best_target);
+ last_bits = kBits;
+
+ const PackT<kBits> func;
+
+ if (run_bench) {
+ const size_t kNumInputs = 1;
+ const size_t num_items = num * size_t(Unpredictable1());
+ const FuncInput inputs[kNumInputs] = {num_items};
+ Result results[kNumInputs];
+
+ Params p;
+ p.verbose = false;
+ p.max_evals = 7;
+ p.target_rel_mad = 0.002;
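+      // Stop early once variability (rel. median abs. deviation) <= 0.2%.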
+ const size_t num_results = MeasureClosure(
+ [&](FuncInput) HWY_ATTR {
+ for (size_t i = 0, pi = 0; i < num;
+ i += num_per_loop, pi += num_packed_per_loop) {
+ func.Pack(d, raw.get() + i, packed.get() + pi);
+ }
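+          // Unpredictable1() == 1, so this adds zero; it merely prevents
+          // the compiler from fusing or eliding the pack+unpack round trip.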
+ packed.get()[Random32(&rng) % num_packed] += Unpredictable1() - 1;
+ for (size_t i = 0, pi = 0; i < num;
+ i += num_per_loop, pi += num_packed_per_loop) {
+ func.Unpack(d, packed.get() + pi, raw2.get() + i);
+ }
+ return raw2[Random32(&rng) % num];
+ },
+ inputs, kNumInputs, results, p);
+ if (num_results != kNumInputs) {
+ fprintf(stderr, "MeasureClosure failed.\n");
+ return;
+ }
+      // Print pack+unpack round-trip throughput; each element counts as its
+      // packed size rounded up to whole bytes, i.e. (kBits + 7) / 8.
+ for (size_t i = 0; i < num_results; ++i) {
+ const size_t bytes_per_element = (kBits + 7) / 8;
+ const double bytes = results[i].input * bytes_per_element;
+ const double seconds =
+ results[i].ticks / platform::InvariantTicksPerSecond();
+ printf("Bits:%2d elements:%3d GB/s:%4.1f (+/-%3.1f%%)\n",
+ static_cast<int>(kBits), static_cast<int>(results[i].input),
+ 1E-9 * bytes / seconds, results[i].variability * 100.0);
+ }
+ } else {
+ for (size_t i = 0, pi = 0; i < num;
+ i += num_per_loop, pi += num_packed_per_loop) {
+ func.Pack(d, raw.get() + i, packed.get() + pi);
+ }
+ packed.get()[Random32(&rng) % num_packed] += Unpredictable1() - 1;
+ for (size_t i = 0, pi = 0; i < num;
+ i += num_per_loop, pi += num_packed_per_loop) {
+ func.Unpack(d, packed.get() + pi, raw2.get() + i);
+ }
+ }
+
+ for (size_t i = 0; i < num; ++i) {
+ checker.NotifyRawOutput(kBits, raw2[i]);
+ }
+ }
+};
+
+void TestAllPack8() {
+ ForShrinkableVectors<TestPack<Pack8, 8, 1>>()(uint8_t());
+ ForShrinkableVectors<TestPack<Pack8, 8, 2>>()(uint8_t());
+ ForShrinkableVectors<TestPack<Pack8, 8, 3>>()(uint8_t());
+ ForShrinkableVectors<TestPack<Pack8, 8, 4>>()(uint8_t());
+ ForShrinkableVectors<TestPack<Pack8, 8, 5>>()(uint8_t());
+ ForShrinkableVectors<TestPack<Pack8, 8, 6>>()(uint8_t());
+ ForShrinkableVectors<TestPack<Pack8, 8, 7>>()(uint8_t());
+ ForShrinkableVectors<TestPack<Pack8, 8, 8>>()(uint8_t());
+}
+
+void TestAllPack16() {
+ ForShrinkableVectors<TestPack<Pack16, 16, 1>>()(uint16_t());
+ ForShrinkableVectors<TestPack<Pack16, 16, 2>>()(uint16_t());
+ ForShrinkableVectors<TestPack<Pack16, 16, 3>>()(uint16_t());
+ ForShrinkableVectors<TestPack<Pack16, 16, 4>>()(uint16_t());
+ ForShrinkableVectors<TestPack<Pack16, 16, 5>>()(uint16_t());
+ ForShrinkableVectors<TestPack<Pack16, 16, 6>>()(uint16_t());
+ ForShrinkableVectors<TestPack<Pack16, 16, 7>>()(uint16_t());
+ ForShrinkableVectors<TestPack<Pack16, 16, 8>>()(uint16_t());
+ ForShrinkableVectors<TestPack<Pack16, 16, 9>>()(uint16_t());
+ ForShrinkableVectors<TestPack<Pack16, 16, 10>>()(uint16_t());
+ ForShrinkableVectors<TestPack<Pack16, 16, 11>>()(uint16_t());
+ ForShrinkableVectors<TestPack<Pack16, 16, 12>>()(uint16_t());
+ ForShrinkableVectors<TestPack<Pack16, 16, 13>>()(uint16_t());
+ ForShrinkableVectors<TestPack<Pack16, 16, 14>>()(uint16_t());
+ ForShrinkableVectors<TestPack<Pack16, 16, 15>>()(uint16_t());
+ ForShrinkableVectors<TestPack<Pack16, 16, 16>>()(uint16_t());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(BitPackTest);
+HWY_EXPORT_AND_TEST_P(BitPackTest, TestAllPack8);
+HWY_EXPORT_AND_TEST_P(BitPackTest, TestAllPack16);
+} // namespace hwy
+
+#endif